In [3]:
from pipeline import *    
import pandas as pd
import itertools
import joblib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # New import
from sklearn.svm import SVC                         # New import
from sklearn.neighbors import KNeighborsClassifier   # New import
from sklearn.tree import DecisionTreeClassifier      # New import
from sklearn.metrics import classification_report

questionnaires = [
    ('mmse', 'cleaned_data/mmse_clean.csv', CRITERIA_MMSE),
    ('moca', 'cleaned_data/moca_clean.csv', CRITERIA_MOCA),
    ('npiq', 'cleaned_data/npiq_clean.csv', CRITERIA_NPIQ),
]

diagnosis_path = 'data/diagnosis.csv'

# Iterate over all combinations
for n in range(1, len(questionnaires) + 1):
    for combo in itertools.combinations(questionnaires, n):
        combo_names = [q[0] for q in combo]
        data_paths = [q[1] for q in combo]
        criterias = [q[2] for q in combo]

        # Load and preprocess data
        X, y = load_and_preprocess(data_paths, criterias, diagnosis_path)
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        # Define classifiers to test
        classifiers = [
            ('Random Forest', RandomForestClassifier(n_estimators=100)),
            ('Logistic Regression', LogisticRegression()),
            ('Support Vector Machine', SVC()),
            ('K-Nearest Neighbors', KNeighborsClassifier()),
            ('Decision Tree', DecisionTreeClassifier())
        ]

        # Iterate over classifiers
        for clf_name, clf in classifiers:
            # Train the model
            clf.fit(X_train, y_train)

            # Evaluate the model
            y_pred = clf.predict(X_test)
            print(f"Model trained with: {combo_names} using {clf_name}")
            print(y.value_counts())
            print(classification_report(y_test, y_pred))

        # Remove or comment out the model saving part
        # model_filename = 'saved_models/'+'rf_' + '_'.join(combo_names) + '.pkl'
        # joblib.dump((clf, X.columns.tolist()), model_filename)


Model trained with: ['mmse'] using Random Forest
DIAGNOSIS
1.0    4526
0.0    3168
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.61      0.66      0.63       656
         1.0       0.62      0.61      0.61       905
         2.0       0.78      0.70      0.73       434

    accuracy                           0.65      1995
   macro avg       0.67      0.66      0.66      1995
weighted avg       0.65      0.65      0.65      1995

Model trained with: ['mmse'] using Logistic Regression
DIAGNOSIS
1.0    4526
0.0    3168
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.62      0.65      0.63       656
         1.0       0.62      0.65      0.63       905
         2.0       0.81      0.69      0.75       434

    accuracy                           0.66      1995
   macro avg       0.68      0.66      0.67      1995
weighted avg       0.66      0.66      0.66  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse'] using Support Vector Machine
DIAGNOSIS
1.0    4526
0.0    3168
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.63      0.69      0.66       656
         1.0       0.63      0.63      0.63       905
         2.0       0.80      0.68      0.73       434

    accuracy                           0.66      1995
   macro avg       0.69      0.66      0.67      1995
weighted avg       0.66      0.66      0.66      1995

Model trained with: ['mmse'] using K-Nearest Neighbors
DIAGNOSIS
1.0    4526
0.0    3168
2.0    2280
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.57      0.75      0.64       656
         1.0       0.61      0.55      0.58       905
         2.0       0.82      0.61      0.70       434

    accuracy                           0.63      1995
   macro avg       0.67      0.63      0.64      1995
weighted avg       0.64      0.63   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['moca'] using Support Vector Machine
DIAGNOSIS
1.0    3941
0.0    3559
2.0    1448
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.64      0.68      0.66       736
         1.0       0.59      0.63      0.61       762
         2.0       0.90      0.57      0.70       292

    accuracy                           0.64      1790
   macro avg       0.71      0.63      0.66      1790
weighted avg       0.66      0.64      0.65      1790

Model trained with: ['moca'] using K-Nearest Neighbors
DIAGNOSIS
1.0    3941
0.0    3559
2.0    1448
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.58      0.69      0.63       736
         1.0       0.56      0.57      0.57       762
         2.0       0.89      0.45      0.60       292

    accuracy                           0.60      1790
   macro avg       0.68      0.57      0.60      1790
weighted avg       0.62      0.60   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'moca'] using Support Vector Machine
DIAGNOSIS
1.0    2838
0.0    1978
2.0    1098
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.62      0.55      0.58       413
         1.0       0.62      0.73      0.67       546
         2.0       0.93      0.74      0.82       224

    accuracy                           0.67      1183
   macro avg       0.72      0.67      0.69      1183
weighted avg       0.68      0.67      0.67      1183

Model trained with: ['mmse', 'moca'] using K-Nearest Neighbors
DIAGNOSIS
1.0    2838
0.0    1978
2.0    1098
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.54      0.65      0.59       413
         1.0       0.59      0.59      0.59       546
         2.0       0.95      0.55      0.69       224

    accuracy                           0.60      1183
   macro avg       0.69      0.60      0.62      1183
weighted avg       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'npiq'] using Support Vector Machine
DIAGNOSIS
1.0    3816
0.0    2621
2.0    2002
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.69      0.66      0.67       544
         1.0       0.65      0.68      0.67       755
         2.0       0.80      0.77      0.78       389

    accuracy                           0.70      1688
   macro avg       0.71      0.70      0.71      1688
weighted avg       0.70      0.70      0.70      1688

Model trained with: ['mmse', 'npiq'] using K-Nearest Neighbors
DIAGNOSIS
1.0    3816
0.0    2621
2.0    2002
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.60      0.77      0.68       544
         1.0       0.63      0.59      0.61       755
         2.0       0.85      0.63      0.72       389

    accuracy                           0.66      1688
   macro avg       0.69      0.66      0.67      1688
weighted avg       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['moca', 'npiq'] using Support Vector Machine
DIAGNOSIS
1.0    2269
0.0    1951
2.0     883
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.66      0.73      0.69       396
         1.0       0.64      0.65      0.65       458
         2.0       0.88      0.64      0.74       167

    accuracy                           0.68      1021
   macro avg       0.73      0.67      0.69      1021
weighted avg       0.69      0.68      0.68      1021

Model trained with: ['moca', 'npiq'] using K-Nearest Neighbors
DIAGNOSIS
1.0    2269
0.0    1951
2.0     883
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.58      0.79      0.67       396
         1.0       0.64      0.55      0.59       458
         2.0       0.91      0.51      0.66       167

    accuracy                           0.64      1021
   macro avg       0.71      0.62      0.64      1021
weighted avg       0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model trained with: ['mmse', 'moca', 'npiq'] using Support Vector Machine
DIAGNOSIS
1.0    2254
0.0    1557
2.0     878
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.61      0.72      0.66       309
         1.0       0.70      0.69      0.69       452
         2.0       0.97      0.72      0.83       177

    accuracy                           0.70       938
   macro avg       0.76      0.71      0.73       938
weighted avg       0.72      0.70      0.71       938

Model trained with: ['mmse', 'moca', 'npiq'] using K-Nearest Neighbors
DIAGNOSIS
1.0    2254
0.0    1557
2.0     878
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.54      0.75      0.63       309
         1.0       0.65      0.59      0.62       452
         2.0       0.96      0.51      0.66       177

    accuracy                           0.63       938
   macro avg       0.72      0.62      0.64       938
weig