# Modeling

## Import data

In [1]:
import pandas as pd

# Read dataframe from the pickle format
wide_df = pd.read_pickle("../../data/processed/preprocessed_df.pkl")

## Select features

In [14]:
#X = wide_df[["sex_gender", "age", "eeg", "engel_outcomes", "mri",
#             "neuroanatomical_labeling", "neuropsychological_testing",
#             "surgical_information"]]

X = wide_df[["sex_gender", "age", "eeg", "engel_outcomes", "mri", "phys_findings",
             "neuroanatomical_labeling", "neuropsychological_testing", "surg_stage", "neuro_findings",
             "surgical_information", "resect_procedure___1", "resect_procedure___2", "resect_procedure___3",
             "resect_procedure___4", "resect_procedure___5", "resect_procedure___6", "resect_procedure___7",
             "resect_procedure___8", "resect_procedure___9", "resect_procedure___10", "resect_procedure___11",
             "resect_procedure___12", "resect_procedure___14", "resect_procedure___15"]]

#X2 = wide_df.drop(["surg_engel", "surg_engel___1", "surg_engel___2", "surg_engel___3",
#                   "surg_engel___4", "engel_class1", "engel_class2", "engel_class3",
#                   "engel_class4",], axis=1)
y = wide_df["surg_engel"]
y = y - 1

## Split data into train/test

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

## Scale continuous variables

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_train.mean(), inplace=True)

column_transformer = ColumnTransformer(
    transformers=[
        ('age', StandardScaler(), ['age'])
        # Add more transformers for other columns if needed
    ],
    remainder='passthrough'  # Keep the columns not specified in transformers unchanged
)

X_train_scaled = column_transformer.fit_transform(X_train)
X_test_scaled = column_transformer.transform(X_test)

## Decision Tree classifier

In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

clf = DecisionTreeClassifier(random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.54


## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier
# You can customize parameters such as n_estimators, max_depth, etc.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.63


## Support Vector Machines (SVM)

In [19]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Create an SVM classifier
# You can customize parameters such as C, kernel, etc.
clf = SVC(kernel='linear', C=1.0, random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.67


## Neural Network

In [27]:
import tensorflow as tf

# Create a simple neural network model using Keras
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(len(set(y_train)), activation='softmax')  # Output layer for multiclass classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model on the training data
model.fit(X_train_scaled, y_train, epochs=15, batch_size=32, validation_split=0.2, verbose=2)

# Make predictions on the test data
y_pred_prob = model.predict(X_test_scaled)
y_pred = tf.argmax(y_pred_prob, axis=1).numpy()

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Epoch 1/15
14/14 - 1s - loss: 1.4469 - accuracy: 0.2199 - val_loss: 1.3149 - val_accuracy: 0.6330 - 1s/epoch - 101ms/step
Epoch 2/15
14/14 - 0s - loss: 1.2540 - accuracy: 0.6250 - val_loss: 1.1679 - val_accuracy: 0.6422 - 76ms/epoch - 5ms/step
Epoch 3/15
14/14 - 0s - loss: 1.1340 - accuracy: 0.6227 - val_loss: 1.1099 - val_accuracy: 0.6422 - 80ms/epoch - 6ms/step
Epoch 4/15
14/14 - 0s - loss: 1.0748 - accuracy: 0.6227 - val_loss: 1.0618 - val_accuracy: 0.6422 - 72ms/epoch - 5ms/step
Epoch 5/15
14/14 - 0s - loss: 1.0234 - accuracy: 0.6319 - val_loss: 1.0377 - val_accuracy: 0.6422 - 78ms/epoch - 6ms/step
Epoch 6/15
14/14 - 0s - loss: 1.0042 - accuracy: 0.6389 - val_loss: 1.0208 - val_accuracy: 0.6422 - 72ms/epoch - 5ms/step
Epoch 7/15
14/14 - 0s - loss: 0.9819 - accuracy: 0.6412 - val_loss: 1.0209 - val_accuracy: 0.6514 - 77ms/epoch - 6ms/step
Epoch 8/15
14/14 - 0s - loss: 0.9739 - accuracy: 0.6528 - val_loss: 1.0287 - val_accuracy: 0.6422 - 75ms/epoch - 5ms/step
Epoch 9/15
14/14 - 0s - 

## Print system information

In [28]:
# Print Python version
import sys
print("Python Version:", sys.version)

Python Version: 3.11.0 (main, Oct 24 2022, 18:26:48) [MSC v.1933 64 bit (AMD64)]
