In [None]:
import timeit

import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import cross_val_score

r_s = 42
import time

In [None]:
# Load the feature table and the phenotype table from the pickled train/validation files.
# NOTE(review): "mvals" presumably means methylation M-values — confirm with the data owner.
# NOTE(review): read_pickle can execute arbitrary code; only load files from a trusted source.
df_x = pd.read_pickle("../../BIO_Ml/Schizophrenia/one_by_one/mvals_train_val.pkl")

# Only the "Status" entry of the phenotype table is kept as the target.
pheno_train_val = pd.read_pickle("../../BIO_Ml/Schizophrenia/one_by_one/pheno_train_val.pkl")
df_y = pheno_train_val["Status"]

In [None]:
# Quick look at the first rows of the feature table.
print(df_x.head())

# Replace the status labels with integer class codes; df_y becomes the encoded array
# that every later cell trains on.
status_encoder = LabelEncoder()
df_y = status_encoder.fit_transform(df_y)
print(df_y)

In [None]:
"""
from sklearn.svm import SVC

start = time.time()
selector_k = SelectKBest(score_func=f_classif, k=100)
fit = selector_k.fit(df_x, df_y)
cols = selector_k.get_support(indices=True)
df_x = df_x.iloc[:, cols]

model = SVC(kernel="linear",
            C=1.4,
            random_state=r_s)
model.fit(df_x, df_y)
model.predict(df_x)

finish = time.time()
print(finish - start)
"""

In [None]:
"""
from sklearn.pipeline import Pipeline

X_train, X_test, y_train, y_test = train_test_split(df_x, df_y,
                                                    test_size=0.4,
                                                    random_state=r_s,
                                                    shuffle=True,
                                                    stratify=df_y)
"""

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Jointly tune univariate feature selection (SelectKBest) and an SVM with a grid search.
# The selector is a pipeline step, so it is re-fitted on the training part of each CV split
# instead of seeing the held-out rows.
start = time.time()
kbest = SelectKBest()

pipeline_svm = Pipeline([('kbest', kbest), ('svm', SVC(random_state=r_s))])

# score_func must be a callable: passing the function names as strings makes every
# SelectKBest fit fail, so the function objects imported at the top are listed instead.
# NOTE(review): mutual_info_classif is randomised and is not seeded here, so candidates
# using it may score slightly differently between runs.
svm_grid_params = {'kbest__score_func': (f_classif, mutual_info_classif),
                   'kbest__k': [30, 100, 120],
                   'svm__kernel': ('linear', 'poly'),
                   'svm__C': [1.3, 1.4, 1.5]}

clf_1 = GridSearchCV(pipeline_svm,
                     svm_grid_params,
                     refit=True,
                     scoring="accuracy",
                     verbose=3,
                     cv=2)
clf_1.fit(df_x, df_y)
print(
    f"The best SVC variant is {clf_1.best_estimator_} with parameters {clf_1.best_params_} and its accu score = {clf_1.score(df_x, df_y)}")
# The accuracy above is computed on the same rows the best model was refitted on, so it is
# optimistic; the mean cross-validated accuracy is the number to compare models with.
print(f"Mean cross-validated accuracy of the best SVC variant = {clf_1.best_score_}")
finish = time.time()
print(finish - start)

In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Grid search over feature selection + CatBoost. The cell builds its own SelectKBest and
# imports Pipeline / GridSearchCV itself so it does not rely on the SVM cell having been run.
pipeline_catboost = Pipeline([('kbest', SelectKBest()),
                              ('catboost', CatBoostClassifier(silent=True, random_state=r_s))])

# score_func must be a callable (the string names make every SelectKBest fit fail), and
# CatBoost documents the binary log-loss objective as 'Logloss', not 'logloss'.
cat_grid_params = {'kbest__score_func': (f_classif, mutual_info_classif),
                   'kbest__k': [30, 100, 120],
                   'catboost__loss_function': ('Logloss', 'CrossEntropy'),
                   'catboost__depth': [4, 6, 8],
                   'catboost__l2_leaf_reg': [3, 5, 7, 15, 20]}

clf_2 = GridSearchCV(pipeline_catboost,
                     cat_grid_params,
                     refit=True,
                     scoring="accuracy",
                     n_jobs=2,
                     verbose=3)
clf_2.fit(df_x, df_y)
print(
    f"The best Catboost variant model is {clf_2.best_estimator_} with parameters {clf_2.best_params_} and its accu score = {clf_2.score(df_x, df_y)}")
# The accuracy above is computed on the same rows the best model was refitted on, so it is
# optimistic; the mean cross-validated accuracy is the number to compare models with.
print(f"Mean cross-validated accuracy of the best Catboost variant = {clf_2.best_score_}")

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Grid search over feature selection + AdaBoost on top of random forests. The cell builds
# its own SelectKBest and imports Pipeline / GridSearchCV itself so it does not rely on the
# SVM cell having been run.
# The base learner is passed positionally: the keyword was renamed from `base_estimator` to
# `estimator` across scikit-learn releases, while the first positional slot works for both.
# The forest is seeded as well so the whole search is repeatable.
pipeline_ada = Pipeline(
    [('kbest', SelectKBest()),
     ('ada', AdaBoostClassifier(RandomForestClassifier(random_state=r_s), random_state=r_s))])

# score_func must be a callable: the string names make every SelectKBest fit fail.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases; candidates using it
# may warn or fail there — confirm against the installed version.
ada_grid_params = {'kbest__score_func': (f_classif, mutual_info_classif),
                   'kbest__k': [30, 100, 120],
                   'ada__n_estimators': [25, 50, 100],
                   'ada__learning_rate': [0.25, 0.5, 1, 1.5],
                   'ada__algorithm': ('SAMME', 'SAMME.R')}

clf_3 = GridSearchCV(pipeline_ada,
                     ada_grid_params,
                     refit=True,
                     scoring="accuracy",
                     n_jobs=2,
                     verbose=3)
clf_3.fit(df_x, df_y)
print(
    f"The best AdaBoost variant model is {clf_3.best_estimator_} with parameters {clf_3.best_params_} and its accu score = {clf_3.score(df_x, df_y)}")
# The accuracy above is computed on the same rows the best model was refitted on, so it is
# optimistic; the mean cross-validated accuracy is the number to compare models with.
print(f"Mean cross-validated accuracy of the best AdaBoost variant = {clf_3.best_score_}")

In [None]:
"""
# {'algorithm': 'SAMME', 'learning_rate': 0.25, 'n_estimators': 25} and its accu score = 1.0
classifier = AdaBoostClassifier(algorithm='SAMME',
                                learning_rate=0.25,
                                n_estimators=25,
                                random_state=r_s,
                                base_estimator=RandomForestClassifier())
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
"""

In [None]:
# Recorded CatBoost parameters (presumably from an earlier grid-search run — confirm):
# {'depth': 4, 'l2_leaf_reg': 15, 'loss_function': 'CrossEntropy'}
# Disabled hold-out check: the cell body is a bare string literal, so nothing in it runs.
# The quoted code fits CatBoost with the parameters above and prints the accuracy on
# X_test / y_test.
# NOTE(review): X_train / X_test / y_train / y_test are only created by the disabled
# train_test_split cell in the visible part of this notebook; re-enable that cell first.
"""
classifier = CatBoostClassifier(depth=4, l2_leaf_reg=15,
                                loss_function='CrossEntropy',
                                silent=True)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred))
"""