In [2]:
from pipeline import *    
import pandas as pd
import itertools
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # New import
from sklearn.svm import SVC                         # New import
from sklearn.neighbors import KNeighborsClassifier   # New import
from sklearn.tree import DecisionTreeClassifier      # New import
from sklearn.metrics import classification_report

questionnaires = [
    ('mmse', 'data/mmse.csv', CRITERIA_MMSE),
    ('moca', 'data/moca.csv', CRITERIA_MOCA),
    ('npiq', 'data/npiq.csv', CRITERIA_NPIQ),
]

diagnosis_path = 'data/diagnosis.csv'

# Iterate over all combinations
for n in range(1, len(questionnaires) + 1):
    for combo in itertools.combinations(questionnaires, n):
        combo_names = [q[0] for q in combo]
        data_paths = [q[1] for q in combo]
        criterias = [q[2] for q in combo]

        # Load and preprocess data
        X, y = load_and_preprocess(data_paths, criterias, diagnosis_path, balance=True)
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Define classifiers to test
        classifiers = [
            ('Random Forest', RandomForestClassifier(n_estimators=100)),
            ('Logistic Regression', LogisticRegression()),
            ('Support Vector Machine', SVC()),
            ('K-Nearest Neighbors', KNeighborsClassifier()),
            ('Decision Tree', DecisionTreeClassifier())
        ]

        # Iterate over classifiers
        for clf_name, clf in classifiers:
            # Train the model
            clf.fit(X_train, y_train)

            # Evaluate the model
            y_pred = clf.predict(X_test)
            print(f"Model trained with: {combo_names} using {clf_name}")
            print(y.value_counts())
            print(classification_report(y_test, y_pred))

        # Remove or comment out the model saving part
        # model_filename = 'saved_models/'+'rf_' + '_'.join(combo_names) + '.pkl'
        # joblib.dump((clf, X.columns.tolist()), model_filename)


Model trained with: ['mmse'] using Random Forest
DIAGNOSIS
0.0    2280
1.0    2280
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.60      0.80      0.69       449
         1.0       0.56      0.40      0.46       496
         2.0       0.80      0.80      0.80       423

    accuracy                           0.65      1368
   macro avg       0.65      0.66      0.65      1368
weighted avg       0.65      0.65      0.64      1368

Model trained with: ['mmse'] using Logistic Regression
DIAGNOSIS
0.0    2280
1.0    2280
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.61      0.80      0.69       449
         1.0       0.56      0.41      0.47       496
         2.0       0.82      0.81      0.82       423

    accuracy                           0.66      1368
   macro avg       0.66      0.67      0.66      1368
weighted avg       0.66      0.66      0.65  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse'] using Support Vector Machine
DIAGNOSIS
0.0    2280
1.0    2280
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.61      0.74      0.67       449
         1.0       0.56      0.43      0.48       496
         2.0       0.81      0.83      0.82       423

    accuracy                           0.66      1368
   macro avg       0.66      0.67      0.66      1368
weighted avg       0.65      0.66      0.65      1368

Model trained with: ['mmse'] using K-Nearest Neighbors
DIAGNOSIS
0.0    2280
1.0    2280
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.45      0.29      0.35       449
         1.0       0.42      0.61      0.50       496
         2.0       0.84      0.71      0.77       423

    accuracy                           0.54      1368
   macro avg       0.57      0.54      0.54      1368
weighted avg       0.56      0.54   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['moca'] using Logistic Regression
DIAGNOSIS
0.0    1456
1.0    1456
2.0    1456
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.59      0.71      0.64       286
         1.0       0.50      0.45      0.47       283
         2.0       0.86      0.77      0.82       305

    accuracy                           0.65       874
   macro avg       0.65      0.65      0.64       874
weighted avg       0.66      0.65      0.65       874

Model trained with: ['moca'] using Support Vector Machine
DIAGNOSIS
0.0    1456
1.0    1456
2.0    1456
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.60      0.77      0.67       286
         1.0       0.54      0.47      0.51       283
         2.0       0.89      0.76      0.82       305

    accuracy                           0.67       874
   macro avg       0.68      0.67      0.67       874
weighted avg       0.68      0.67   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['npiq'] using Support Vector Machine
DIAGNOSIS
0.0    1150
1.0    1150
2.0    1150
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.64      0.77      0.70       242
         1.0       0.37      0.37      0.37       208
         2.0       0.63      0.51      0.56       240

    accuracy                           0.56       690
   macro avg       0.55      0.55      0.54       690
weighted avg       0.55      0.56      0.55       690

Model trained with: ['npiq'] using K-Nearest Neighbors
DIAGNOSIS
0.0    1150
1.0    1150
2.0    1150
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.58      0.82      0.68       242
         1.0       0.40      0.37      0.38       208
         2.0       0.65      0.43      0.52       240

    accuracy                           0.55       690
   macro avg       0.54      0.54      0.53       690
weighted avg       0.55      0.55   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'moca'] using Logistic Regression
DIAGNOSIS
0.0    1103
1.0    1103
2.0    1103
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.58      0.72      0.64       217
         1.0       0.50      0.41      0.45       225
         2.0       0.88      0.83      0.86       220

    accuracy                           0.65       662
   macro avg       0.66      0.66      0.65       662
weighted avg       0.66      0.65      0.65       662

Model trained with: ['mmse', 'moca'] using Support Vector Machine
DIAGNOSIS
0.0    1103
1.0    1103
2.0    1103
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.56      0.79      0.65       217
         1.0       0.50      0.32      0.39       225
         2.0       0.89      0.85      0.87       220

    accuracy                           0.65       662
   macro avg       0.65      0.65      0.64       662
weighted avg       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'npiq'] using Support Vector Machine
DIAGNOSIS
0.0    1147
1.0    1147
2.0    1147
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.77      0.79      0.78       243
         1.0       0.56      0.59      0.57       206
         2.0       0.86      0.79      0.82       240

    accuracy                           0.73       689
   macro avg       0.73      0.72      0.72       689
weighted avg       0.74      0.73      0.73       689

Model trained with: ['mmse', 'npiq'] using K-Nearest Neighbors
DIAGNOSIS
0.0    1147
1.0    1147
2.0    1147
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.63      0.76      0.69       243
         1.0       0.43      0.47      0.45       206
         2.0       0.85      0.59      0.70       240

    accuracy                           0.61       689
   macro avg       0.63      0.61      0.61       689
weighted avg       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'moca', 'npiq'] using Random Forest
DIAGNOSIS
0.0    51
1.0    51
2.0    51
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.80      0.67      0.73        12
         1.0       0.60      0.67      0.63         9
         2.0       0.91      1.00      0.95        10

    accuracy                           0.77        31
   macro avg       0.77      0.78      0.77        31
weighted avg       0.78      0.77      0.77        31

Model trained with: ['mmse', 'moca', 'npiq'] using Logistic Regression
DIAGNOSIS
0.0    51
1.0    51
2.0    51
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.82      0.75      0.78        12
         1.0       0.60      0.67      0.63         9
         2.0       0.90      0.90      0.90        10

    accuracy                           0.77        31
   macro avg       0.77      0.77      0.77        31
weighted avg       0.78  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
