# **Libraries**

In [81]:
# Data Load Code
from load_data import data_loader

# Preprocessing functions
from preprocessing import classifier_pipeline

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold, StratifiedKFold
from sklearn.metrics import f1_score, recall_score, precision_score, mean_squared_error, roc_curve
from sklearn.preprocessing import StandardScaler

# **Load the data**

In [2]:
# Load the data through the project-local `data_loader`, which returns four objects.
# Only `fa`, `gm` and `func` are passed on to the classification pipeline in the next section.
# NOTE(review): `unzip=False` presumably skips extracting an archive that is already unpacked — confirm in load_data.
full_dataset, fa, func, gm = data_loader(unzip=False)

  clinical_data = pd.read_excel(path + "subject_clinical_data.xlsx")


# **Classification Pipeline**

In [3]:
# Build the feature matrix `X` and the target `y` from the three loaded objects.
# NOTE(review): the recorded output shows neuroCombat harmonisation followed by per-connection
# statistical tests with FDR. If that selection uses the labels of every subject, the
# cross-validated scores reported below are optimistic — confirm inside `preprocessing`.
X, y = classifier_pipeline(fa, gm, func)

[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data
Statistically diferences in 1132 of connections
Statistically diferences in 185 of connections with FDR
Statistically diferences in 381 of connections
Statistically diferences in 6 of connections with FDR
Statistically diferences in 130 of connections
Statistically diferences in 4 of connections with FDR


# **Some ML Models**

## 1. Logistic Regression

In [83]:
from sklearn.linear_model import LogisticRegression

def evaluation(X, y, model, n_splits=5):
    """Print the mean F1 score of ``model`` over stratified k-fold cross-validation.

    For every fold the features are standardised with a scaler fitted on that
    fold's training rows only, ``model`` is refitted on the scaled training
    rows, and its predictions on the scaled held-out rows are scored.

    Parameters
    ----------
    X : object supporting ``.iloc[rows, :]`` (e.g. a pandas DataFrame)
        Feature matrix; rows are selected by position.
    y : object supporting ``.iloc[rows]`` (e.g. a pandas Series)
        Target aligned with ``X``. ``f1_score`` is called with its default
        averaging, which expects a binary target.
    model : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``. It is refitted
        in place on each fold, so after the call it holds the fit from the
        last training split.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    None
        The mean F1 across folds is printed, not returned.
    """
    kf = StratifiedKFold(n_splits=n_splits)
    total_score = 0
    for train_index, test_index in kf.split(X, y):
        # A new scaler per fold makes it explicit that the scaling statistics
        # come from the training rows of this fold only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X.iloc[train_index, :])
        X_test = scaler.transform(X.iloc[test_index, :])
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)
        # scikit-learn metrics take (y_true, y_pred) in that order.
        total_score += f1_score(y_test, y_hat)
    # Average over the configured number of folds rather than a hard-coded 5.
    print(total_score / n_splits)

# Baseline: LogisticRegression with its default settings, scored with `evaluation`.
evaluation(X, y, LogisticRegression())

0.9631795820064148


## 2. Support Vector Classifier

In [16]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Standardise the features once. `X_std` is what the grid search below is fitted on
# and it is also reused by the Random Forest grid search further down.
# NOTE(review): the scaler is fitted on every row before GridSearchCV splits the data,
# so each validation fold contributes to the scaling statistics, whereas `evaluation`
# fits the scaler on the training fold only. Consider a Pipeline(scaler, SVC) here.
sc = StandardScaler()
X_std = sc.fit_transform(X)

# Set the parameters by cross-validation.
# `gamma` is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels only, so the
# linear kernel gets its own grid over C instead of being refitted once per gamma value.
tuned_parameters = [{'kernel': ['rbf', 'poly', 'sigmoid'],
                     'gamma': [1e-3, 1e-4, 1e-2, 1e-5],
                     'C': [1, 10, 100, 500, 1000, 3000]},
                    {'kernel': ['linear'],
                     'C': [1, 10, 100, 500, 1000, 3000]}]

clf = GridSearchCV(
        SVC(), tuned_parameters, scoring='f1')

clf.fit(X_std, y)
print(clf.best_params_)

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}


In [38]:
# Cross-validated F1 for an RBF-kernel SVC.
# NOTE(review): C=15 is not one of the grid-search candidates and differs from the
# best value the search reported (C=100) — confirm this manual setting is intended.
evaluation(X, y, SVC(kernel="rbf", C=15, gamma=0.001))

0.9700761337305529


## 3. Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier

# Hyper-parameter candidates for the forest.
# 'auto' is left out of `max_features`: for classifiers it was an alias of 'sqrt'
# (a duplicate candidate) and scikit-learn >= 1.3 rejects it outright.
param_grid = {
    'n_estimators': [200, 500, 700, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [2, 3, 5, 7]
}

# scoring='f1' matches the SVC/XGBoost searches and the metric printed by `evaluation`
# (the default would optimise accuracy instead). A fixed random_state makes the
# selected parameters reproducible across re-runs.
# `X_std` is the standardised feature matrix built in the SVC cell above.
CV_rfc = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0),
    param_grid=param_grid,
    scoring='f1',
    cv=5,
)
CV_rfc.fit(X_std, y)
CV_rfc.best_params_

{'max_depth': 7, 'max_features': 'sqrt', 'n_estimators': 200}

In [51]:
# Cross-validated F1 for a Random Forest.
# NOTE(review): max_depth=13 lies outside the searched grid (2, 3, 5, 7) and differs from
# the best value the search reported (7) — confirm this manual setting is intended.
# NOTE(review): no random_state is set, so the printed score can vary between runs.
evaluation(X, y, RandomForestClassifier(max_depth=13, max_features = "sqrt", n_estimators = 200))

0.967251461988304


## 4. XGBoost Classifier

In [60]:
from xgboost import XGBClassifier

# Standardise every feature; the scaler is fitted on the whole of X before the search.
sc = StandardScaler()
X_std = sc.fit_transform(X)

# Hyper-parameter candidates explored by the cross-validated search
# (6 * 8 * 4 * 5 * 4 = 3840 combinations, so this cell is slow to run).
tuned_parameters = [
    {
        "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
        "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
        "min_child_weight": [1, 3, 5, 7],
        "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
        "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
    }
]

xgb_gs = GridSearchCV(
    estimator=XGBClassifier(),
    param_grid=tuned_parameters,
    scoring='f1',
)

xgb_gs.fit(X_std, y)
print(xgb_gs.best_params_)

{'colsample_bytree': 0.5, 'gamma': 0.2, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 5}


In [76]:
# Cross-validated F1 for XGBoost, using the parameters reported by the grid search.
evaluation(
    X,
    y,
    XGBClassifier(
        colsample_bytree=0.5,
        gamma=0.2,
        learning_rate=0.05,
        max_depth=3,
        min_child_weight=5,
    ),
)

0.9785714285714286
