In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.metrics import accuracy_score, f1_score, make_scorer

---
__PREPARATORY CODE CELLS__

---

In [None]:
# Load the datasets: one smoothed CSV per tester, read in a fixed order
Lorenzo, Leo, Irene, Carlotta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../Dati_unificati/lorenzo_smooth.csv",
        "./../Dati_unificati/leo_smooth.csv",
        "./../Dati_unificati/irene_smooth.csv",
        "./../Dati_unificati/carlotta_smooth.csv",
    )
)

# Stack the four frames into a single dataset with a fresh 0..N-1 index
ds = pd.concat([Lorenzo, Leo, Irene, Carlotta], ignore_index=True)

# 'Class' is the target and 'Tester' the grouping key; every other column is a feature
y = ds['Class']
groups = ds['Tester']
X = ds.drop(columns=['Class', 'Tester'])

# Shapes of features, target and groups, one per line
print(X.shape, y.shape, groups.shape, sep="\n")

# First occurrence of each distinct tester value (original row label kept)
print(groups.drop_duplicates())

# Leave-one-group-out splitter; combined with `groups` each fold holds out one tester value
loso_cv = LeaveOneGroupOut()

In [None]:
# Testers order: print the name of the tester held out by each LOSO fold,
# in the order the folds are produced.
#
# `ds` is the concatenation of the per-tester frames in the order
# Lorenzo, Leo, Irene, Carlotta with a fresh 0..N-1 index, so a tester's rows
# start at the summed length of the frames before it. The offsets are derived
# from the frames instead of being hard-coded, so the cell stays correct when
# the CSV files change size.
tester_frames = {'Lorenzo': Lorenzo, 'Leo': Leo, 'Irene': Irene, 'Carlotta': Carlotta}

first_row_to_tester = {}
next_start = 0
for tester_name, tester_frame in tester_frames.items():
    first_row_to_tester[next_start] = tester_name
    next_start += len(tester_frame)

for train, test in loso_cv.split(X, y, groups):
    # assumes each CSV holds exactly one tester, so the first test index of a
    # fold is the first row of that tester's frame — TODO confirm
    held_out = first_row_to_tester.get(test[0])
    if held_out is not None:
        print(held_out)


---
__KNN__

---

Search for the best parameters

In [None]:
# Hyper-parameter search for KNN: evaluate the pipeline for every candidate
# number of neighbours with leave-one-subject-out cross-validation and plot the
# mean / minimum / maximum per-fold weighted F-score.
# NOTE(review): the best candidate is chosen on the same folds that are later
# used to report the model's score, so that score is optimistically biased.

# The scorers do not depend on the candidate, so they are built once
knn_scoring = {'fscore': make_scorer(f1_score, average='weighted'),
               'accuracy': make_scorer(accuracy_score)}

# One record per candidate; the DataFrame is built once after the loop instead
# of being re-concatenated on every iteration
knn_records = []

# Search between 1 nearest neighbor and 80 nearest neighbors (both included)
for n in range(1, 81):
    # Pipeline parameters
    classifier_KNN = KNeighborsClassifier(n_neighbors=n)
    scaler = RobustScaler()
    feat_sel = SelectKBest(k=2)

    estimators_KNN = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_KNN)]
    pipe_KNN = Pipeline(estimators_KNN)

    # Cross validation with leave-one-subject-out
    scores_KNN = cross_validate(pipe_KNN,
                                X,
                                y,
                                return_estimator=True,
                                cv=loso_cv,
                                n_jobs=-1,
                                groups=groups,
                                error_score="raise",
                                scoring=knn_scoring
                                )

    fold_fscores = scores_KNN['test_fscore']
    knn_records.append({
        'X': n,
        'Mean': np.mean(fold_fscores),
        'Min': np.min(fold_fscores),
        'Max': np.max(fold_fscores),
    })

results = pd.DataFrame(knn_records, columns=['X', 'Mean', 'Min', 'Max'])

# Find the index of the maximum mean F-score
max_mean_index = results['Mean'].idxmax()

# Indices of the overall lowest / highest fold F-score
# (not used by the plot below; kept so they can still be inspected)
min_index = results['Min'].idxmin()
max_index = results['Max'].idxmax()

# Values of the winning candidate, looked up once
best_n = results['X'][max_mean_index]
best_mean = results['Mean'][max_mean_index]
best_min = results['Min'][max_mean_index]
best_max = results['Max'][max_mean_index]

plt.figure(figsize=(10, 6))

# Plot the mean F-score
plt.plot(results['X'], results['Mean'], color='blue', linewidth=2, label='Mean F-score')

# Highlight the point of maximum mean F-score
plt.scatter(best_n, best_mean, color='red', marker='*', s=100)
plt.annotate(f'Max Mean F-score ({best_mean:.3f})\nNeighbors: {best_n:.3f}',
             (best_n, best_mean),
             textcoords="offset points",
             xytext=(0,10),
             ha='center',
             color='red')

# Plot the minimum F-score
plt.plot(results['X'], results['Min'], color='orange', linewidth=2, linestyle='--', label='Min F-score')

# Plot the maximum F-score
plt.plot(results['X'], results['Max'], color='green', linewidth=2, linestyle='--', label='Max F-score')

# Highlight the point of minimum F-score corresponding to max_mean_index
plt.scatter(best_n, best_min, color='brown', marker='*', s=100)
plt.annotate(f'Corresponding Min F-score\n({best_min:.3f})',
             (best_n, best_min),
             textcoords="offset points",
             xytext=(0,10),
             ha='center',
             color='brown')

# Highlight the point of maximum F-score corresponding to max_mean_index
plt.scatter(best_n, best_max, color='purple', marker='*', s=100)
plt.annotate(f'Corresponding Max F-score\n({best_max:.3f})',
             (best_n, best_max),
             textcoords="offset points",
             xytext=(0, -30),
             ha='center',
             color='purple')

plt.xlabel('Number of Neighbors')
plt.ylabel('F-score')
plt.legend()
plt.title('F-score Statistics for Different Numbers of Neighbors')
plt.show()

Model

In [None]:
# Final KNN model: robust scaling -> SelectKBest(k=2) -> 6-nearest-neighbours,
# scored with leave-one-subject-out cross-validation.
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)
classifier_KNN = KNeighborsClassifier(n_neighbors=6)

estimators_KNN = [
    ('scaling', scaler),
    ('feature-selection', feat_sel),
    ('clf', classifier_KNN),
]
pipe_KNN = Pipeline(estimators_KNN)

# Per-fold metrics: weighted F1 and plain accuracy
scoring_KNN = {
    'fscore': make_scorer(f1_score, average='weighted'),
    'accuracy': make_scorer(accuracy_score),
}

scores_KNN = cross_validate(
    pipe_KNN,
    X,
    y,
    groups=groups,
    cv=loso_cv,
    scoring=scoring_KNN,
    return_estimator=True,
    error_score="raise",
    n_jobs=-1,
)

# Raw cross-validation output, then the fold averages of both metrics
print(scores_KNN)
print(scores_KNN['test_fscore'].mean())
print(scores_KNN['test_accuracy'].mean())

- __Dataset without smoothing__:
    * __F-score (AVG: 0.610)__:
        + 0.82418721
        + 0.5358844 
        + 0.50821999        
        + 0.57001965
    * __Accuracy (AVG: 0.694)__:
        + 0.83261803
        + 0.65405405
        + 0.62921348
        + 0.66176471

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.724)__:
        + 0.78116751
        + 0.5358844
        + 1
        + 0.57844139
    * __Accuracy (AVG: 0.774)__: 
        + 0.78969957
        + 0.65405405
        + 1
        + 0.65073529

---
__RANDOM FOREST__

---

Search for the best parameters

In [9]:
# Hyper-parameter search for the random forest: every combination of
# min_samples_leaf, min_samples_split and n_estimators is scored with
# leave-one-subject-out cross-validation on the weighted F-score.
# NOTE(review): the best combination is chosen on the same folds that are later
# used to report the model's score, so that score is optimistically biased.

# The scorers do not depend on the candidate, so they are built once
rf_scoring = {'fscore': make_scorer(f1_score, average='weighted'),
              'accuracy': make_scorer(accuracy_score)}

# One record per combination; the DataFrame is built once after the loops
# instead of being re-concatenated on every iteration
rf_records = []

# iterate over min_samples_leaf
for n in range(1, 10):
    # iterate over min_samples_split; scikit-learn requires an integer value
    # >= 2, so the range starts at 2 (1 raises with error_score="raise")
    for m in range(2, 50):
        # iterate over n_estimators
        for e in range(1, 500, 20):
            # Fixed seed so that all combinations are compared on equal terms
            # and the search can be reproduced
            classifier_RF = RandomForestClassifier(n_estimators=e, min_samples_split=m, min_samples_leaf=n, max_features='sqrt', bootstrap=True, max_depth=10, random_state=42)
            scaler = RobustScaler()
            feat_sel = SelectKBest(k=2)
            estimators_RF = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_RF)]
            pipe_RF = Pipeline(estimators_RF)

            # Only the scores are read here, so the fitted estimators are not
            # requested back from the workers
            scores_RF = cross_validate(pipe_RF,
                                       X,
                                       y,
                                       cv=loso_cv,
                                       n_jobs=-1,
                                       groups=groups,
                                       error_score="raise",
                                       scoring=rf_scoring
                                       )

            fold_fscores = scores_RF['test_fscore']
            # Label format: min_samples_split-min_samples_leaf-n_estimators
            rf_records.append({'X': str(m) + '-' + str(n) + '-' + str(e),
                               'Mean': np.mean(fold_fscores),
                               'Min': np.min(fold_fscores),
                               'Max': np.max(fold_fscores)})

    # Progress marker: one line per completed min_samples_leaf value
    print(n)

results = pd.DataFrame(rf_records, columns=['X', 'Mean', 'Min', 'Max'])



1
2
3
4
5
6
7
8
9


In [10]:
# Find the index label of the row with the maximum mean F-score
max_mean_index = results['Mean'].idxmax()

# idxmax returns an index *label*, so the row is selected with .loc
# (.iloc is positional and only coincides with it on a default RangeIndex)
print(results.loc[max_mean_index])

X         41-2-1
Mean    0.788409
Min     0.535884
Max          1.0
Name: 180, dtype: object


Model

In [None]:
# Final random-forest model: robust scaling -> SelectKBest(k=2) -> random forest,
# scored with leave-one-subject-out cross-validation.
# The forest is seeded so that the per-fold scores printed below can be
# reproduced on a re-run (bootstrap sampling and feature sub-sampling are
# otherwise random).
classifier_RF = RandomForestClassifier(n_estimators=61, min_samples_split=30, min_samples_leaf=1, max_features='sqrt', bootstrap=True, max_depth=10, random_state=42)
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)
estimators_RF = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_RF)]
pipe_RF = Pipeline(estimators_RF)

scores_RF = cross_validate(pipe_RF,
                        X,
                        y,
                        return_estimator=True,
                        cv=loso_cv,
                        n_jobs=-1,
                        groups=groups,
                        error_score="raise",
                        scoring={'fscore': make_scorer(f1_score, average='weighted'),
                                    'accuracy': make_scorer(accuracy_score)}
                        )

# Raw cross-validation output, then the fold averages of both metrics
print(scores_RF)
print(np.mean(scores_RF['test_fscore']))
print(np.mean(scores_RF['test_accuracy']))

- __Dataset without smoothing__:
    * __F-score (AVG: 0.636)__: 
        + 0.58946152
        + 0.54697555
        + 1        
        + 0.40744893
    * __Accuracy (AVG: 0.708)__:
        + 0.68669528
        + 0.65405405
        + 1
        + 0.49264706

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.745)__:
        + 1
        + 0.5358844
        + 0.92478368
        + 0.52102544
    * __Accuracy (AVG: 0.783)__: 
        + 1
        + 0.65405405
        + 0.92696629
        + 0.55147059 

---
__ADABOOST__

---

In [None]:
# AdaBoost model: boosts a random forest configured like the forest model
# above; robust scaling and SelectKBest(k=2) run before the classifier.
# Scored with leave-one-subject-out cross-validation.
base_forest = RandomForestClassifier(
    n_estimators=61,
    min_samples_split=30,
    min_samples_leaf=1,
    max_features='sqrt',
    bootstrap=True,
    max_depth=10,
)
classifier_AdaBoost = AdaBoostClassifier(estimator=base_forest, n_estimators=400, random_state=5)
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)

estimators_AdaBoost = [
    ('scaling', scaler),
    ('feature-selection', feat_sel),
    ('clf', classifier_AdaBoost),
]
pipe_AdaBoost = Pipeline(estimators_AdaBoost)

# Per-fold metrics: weighted F1 and plain accuracy
scoring_AdaBoost = {
    'fscore': make_scorer(f1_score, average='weighted'),
    'accuracy': make_scorer(accuracy_score),
}

scores_AdaBoost = cross_validate(
    pipe_AdaBoost,
    X,
    y,
    groups=groups,
    cv=loso_cv,
    scoring=scoring_AdaBoost,
    return_estimator=True,
    error_score="raise",
    n_jobs=-1,
)

# Raw cross-validation output, then the fold averages of both metrics
print(scores_AdaBoost)
print(scores_AdaBoost['test_fscore'].mean())
print(scores_AdaBoost['test_accuracy'].mean())


- __Dataset without smoothing__:
    * __F-score (AVG: 0.637)__: 
        + 0.58946152 
        + 0.54697555 
        + 1        
        + 0.41180283
    * __Accuracy (AVG: 0.709)__:
        + 0.68669528
        + 0.65405405
        + 1
        + 0.49632353

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.748)__:
        + 1
        + 0.54697555
        + 0.92478368
        + 0.52102544
    * __Accuracy (AVG: 0.783)__: 
        + 1
        + 0.65405405
        + 0.92696629
        + 0.55147059

---
---
__OTHER TESTED MODELS WITH WORSE PERFORMANCE__
---

---
__SVC__

---

In [None]:
# Hyper-parameter search for the RBF-kernel SVC: evaluate the pipeline for a
# logarithmic grid of C values with leave-one-group-out cross-validation and
# plot the mean / minimum / maximum per-fold weighted F-score.
# NOTE(review): the best C is chosen on the same folds that are later used to
# report the model's score, so that score is optimistically biased.

# Exponents of 10: C = 0.001, 0.01, 0.1, 1, 10, 100, 1000
# (0 is included so that C = 1 is actually part of the grid)
powers_of_10 = [-3, -2, -1, 0, 1, 2, 3]

# The scorers do not depend on the candidate, so they are built once
svc_scoring = {'fscore': make_scorer(f1_score, average='weighted'),
               'accuracy': make_scorer(accuracy_score)}

# One record per candidate; the DataFrame is built once after the loop instead
# of being re-concatenated on every iteration
svc_records = []

for power in powers_of_10:
    C_value = 10 ** power

    classifier_SVC = SVC(kernel='rbf', C=C_value)
    scaler = RobustScaler()
    feat_sel = SelectKBest(k=2)

    # Use the SVC classifier in the pipeline
    estimators_SVC = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_SVC)]
    pipe_SVC = Pipeline(estimators_SVC)

    # X, y and groups must have been defined by the data-loading cell

    # Perform cross-validation using Leave-One-Group-Out (LOSO) strategy
    scores_SVC = cross_validate(pipe_SVC,
                                X,
                                y,
                                return_estimator=True,
                                cv=LeaveOneGroupOut(),
                                n_jobs=-1,
                                groups=groups,
                                error_score="raise",
                                scoring=svc_scoring
                                )

    fold_fscores = scores_SVC['test_fscore']
    # 'X' stores the C value itself (not the exponent) so that the axis label
    # and the annotation below, which both say "Value of C", are truthful
    svc_records.append({
        'X': C_value,
        'Mean': np.mean(fold_fscores),
        'Min': np.min(fold_fscores),
        'Max': np.max(fold_fscores),
    })

results = pd.DataFrame(svc_records, columns=['X', 'Mean', 'Min', 'Max'])

# Find the index of the maximum mean F-score
max_mean_index = results['Mean'].idxmax()

# Indices of the overall lowest / highest fold F-score
# (not used by the plot below; kept so they can still be inspected)
min_index = results['Min'].idxmin()
max_index = results['Max'].idxmax()

# Values of the winning candidate, looked up once
best_C = results['X'][max_mean_index]
best_mean = results['Mean'][max_mean_index]
best_min = results['Min'][max_mean_index]
best_max = results['Max'][max_mean_index]

plt.figure(figsize=(10, 6))

# The grid is logarithmic, so the C axis is drawn on a log scale
plt.xscale('log')

# Plot the mean F-score
plt.plot(results['X'], results['Mean'], color='blue', linewidth=2, label='Mean F-score')

# Highlight the point of maximum mean F-score
plt.scatter(best_C, best_mean, color='red', marker='*', s=100)
plt.annotate(f'Max Mean F-score ({best_mean:.3f})\nValue of C: {best_C:.3f}',
             (best_C, best_mean),
             textcoords="offset points",
             xytext=(0,10),
             ha='center',
             color='red')

# Plot the minimum F-score
plt.plot(results['X'], results['Min'], color='orange', linewidth=2, linestyle='--', label='Min F-score')

# Plot the maximum F-score
plt.plot(results['X'], results['Max'], color='green', linewidth=2, linestyle='--', label='Max F-score')

# Highlight the minimum fold F-score obtained with the best C
plt.scatter(best_C, best_min, color='brown', marker='*', s=100)
plt.annotate(f'Corresponding Min F-score\n({best_min:.3f})',
             (best_C, best_min),
             textcoords="offset points",
             xytext=(0,10),
             ha='center',
             color='brown')

# Highlight the maximum fold F-score obtained with the best C
plt.scatter(best_C, best_max, color='purple', marker='*', s=100)
plt.annotate(f'Corresponding Max F-score\n({best_max:.3f})',
             (best_C, best_max),
             textcoords="offset points",
             xytext=(0, -30),
             ha='center',
             color='purple')

plt.xlabel('Value of C')
plt.ylabel('F-score')
plt.legend()
plt.title('F-score Statistics for Different C')
plt.show()

In [None]:
# Final SVC model: robust scaling (25th-75th percentile range) ->
# SelectKBest(k=2) -> RBF-kernel SVC with C=0.1, scored with
# leave-one-group-out cross-validation.
scaler = RobustScaler(quantile_range=(25, 75))
feat_sel = SelectKBest(k=2)
classifier_SVC = SVC(kernel='rbf', C=0.1)

estimators_SVC = [
    ('scaling', scaler),
    ('feature-selection', feat_sel),
    ('clf', classifier_SVC),
]
pipe_SVC = Pipeline(estimators_SVC)

# Per-fold metrics: weighted F1 and plain accuracy
scoring_SVC = {
    'fscore': make_scorer(f1_score, average='weighted'),
    'accuracy': make_scorer(accuracy_score),
}

# X, y and groups come from the data-loading cell; the splitter is created here
scores_SVC = cross_validate(
    pipe_SVC,
    X,
    y,
    groups=groups,
    cv=LeaveOneGroupOut(),
    scoring=scoring_SVC,
    return_estimator=True,
    error_score="raise",
    n_jobs=-1,
)

print(scores_SVC)

- __Dataset without smoothing__:
    * __F-score (AVG: 0.660)__: 
        + 0.52911271
        + 0.5358844
        + 1       
        + 0.57587227
    * __Accuracy (AVG: 0.742)__:
        + 0.64377682
        + 0.65405405
        + 1
        + 0.66911765

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.650)__:
        + 0.52911271
        + 0.5358844
        + 1
        + 0.53681049
    * __Accuracy (AVG: 0.732)__:
        + 0.64377682
        + 0.65405405
        + 1
        + 0.62867647

---
__Gradient Boosting__

---

In [None]:

# Hyper-parameter search for gradient boosting: evaluate the pipeline for each
# max_depth in 1..49 with leave-one-group-out cross-validation and plot the
# mean / minimum / maximum per-fold weighted F-score.
# NOTE(review): the best depth is chosen on the same folds that are later used
# to report the model's score, so that score is optimistically biased.

# The scorers do not depend on the candidate, so they are built once
gb_scoring = {'fscore': make_scorer(f1_score, average='weighted'),
              'accuracy': make_scorer(accuracy_score)}

# One record per candidate; the DataFrame is built once after the loop instead
# of being re-concatenated on every iteration
gb_records = []

for i in range(1, 50, 1):
    classifier_GB = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_depth=i)
    scaler = RobustScaler()
    feat_sel = SelectKBest(k=2)

    # Use the Gradient Boosting classifier in the pipeline
    estimators_GB = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_GB)]
    pipe_GB = Pipeline(estimators_GB)

    # X, y and groups must have been defined by the data-loading cell

    # Perform cross-validation using Leave-One-Group-Out (LOSO) strategy
    scores_GB = cross_validate(pipe_GB,
                               X,
                               y,
                               return_estimator=True,
                               cv=LeaveOneGroupOut(),
                               n_jobs=-1,
                               groups=groups,
                               error_score="raise",
                               scoring=gb_scoring
                               )

    # The statistics must come from this model's own scores (scores_GB);
    # reading scores_SVC here would record the SVC cell's result on every row
    fold_fscores = scores_GB['test_fscore']
    gb_records.append({
        'X': i,
        'Mean': np.mean(fold_fscores),
        'Min': np.min(fold_fscores),
        'Max': np.max(fold_fscores),
    })

results = pd.DataFrame(gb_records, columns=['X', 'Mean', 'Min', 'Max'])

# Draw the collected statistics, in the same layout as the KNN and SVC searches
plt.figure(figsize=(10, 6))
plt.plot(results['X'], results['Mean'], color='blue', linewidth=2, label='Mean F-score')
plt.plot(results['X'], results['Min'], color='orange', linewidth=2, linestyle='--', label='Min F-score')
plt.plot(results['X'], results['Max'], color='green', linewidth=2, linestyle='--', label='Max F-score')
plt.xlabel('Maximum Tree Depth')
plt.ylabel('F-score')
plt.legend()
plt.title('F-score Statistics for Different Maximum Tree Depths')
plt.show()

# Cross-validation output of the last candidate only (max_depth=49)
print(scores_GB)

- __Dataset without smoothing__:
    * __F-score (AVG: 0.643)__: 
        + 0.34773977
        + 1
        + 0.6172367        
        + 0.60698666
    * __Accuracy (AVG: 0.716)__:
        + 0.43347639
        + 1
        + 0.71910112
        + 0.70955882

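-  __Dataset with smoothing__ (NOTE: these values are identical to the SVC "with smoothing" results above and should be re-verified):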
    * __F-score (AVG: 0.650)__:
        + 0.52911271
        + 0.5358844
        + 1
        + 0.53681049
    * __Accuracy (AVG: 0.732)__:
        + 0.64377682
        + 0.65405405
        + 1
        + 0.62867647

---
__SGD__

---

In [None]:
# Create an instance of the SGD Classifier with desired parameters
classifier_SGD = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=1000, random_state=42)

# Other parts of the code remain the same
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)

# Use the SGD classifier in the pipeline
estimators_SGD = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_SGD)]
pipe_SGD = Pipeline(estimators_SGD)

# Assuming X, y, loso_cv, and groups are defined before this point

# Perform cross-validation using Leave-One-Group-Out (LOSO) strategy
scores_SGD = cross_validate(pipe_SGD,
                            X,
                            y,
                            return_estimator=True,
                            cv=LeaveOneGroupOut(),
                            n_jobs=-1,
                            groups=groups,
                            error_score="raise",
                            scoring={'fscore': make_scorer(f1_score, average='weighted'),
                                     'accuracy': make_scorer(accuracy_score)}
                            )

print(scores_SGD)

- __Dataset without smoothing__:
    * __F-score (AVG: 0.645)__: 
        + 0.52911271
        + 1
        + 0.50821999
        + 0.54167635
    * __Accuracy (AVG: 0.731)__:
        + 0.64377682
        + 1
        + 0.62921348
        + 0.65073529

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.618)__:
        + 0.52911271
        + 1
        + 0.50821999
        + 0.43330882
    * __Accuracy (AVG: 0.704)__: 
        + 0.64377682
        + 1
        + 0.62921348
        + 0.54411765

---
__SGD with RBFSampler__

---

In [None]:
# Create an instance of the SGD Classifier with desired parameters
classifier_SGD = SGDClassifier(loss='hinge', alpha=0.0001, max_iter=1000, random_state=42)

# Use RBFSampler with SGD Classifier
rbf_sampler = RBFSampler(gamma=0.5, n_components=100, random_state=42)

# Other parts of the code remain the same
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)

# Use the SGD classifier with RBFSampler in the pipeline
estimators_SGD_rbf = [('scaling', scaler), ('feature-selection', feat_sel), ('rbf-sampler', rbf_sampler), ('clf', classifier_SGD)]
pipe_SGD_rbf = Pipeline(estimators_SGD_rbf)

# Assuming X, y, loso_cv, and groups are defined before this point

# Perform cross-validation using Leave-One-Group-Out (LOSO) strategy
scores_SGD_rbf = cross_validate(pipe_SGD_rbf,
                                X,
                                y,
                                return_estimator=True,
                                cv=LeaveOneGroupOut(),
                                n_jobs=-1,
                                groups=groups,
                                error_score="raise",
                                scoring={'fscore': make_scorer(f1_score, average='weighted'),
                                         'accuracy': make_scorer(accuracy_score)}
                                )

print(scores_SGD_rbf)

- __Dataset without smoothing__:
    * __F-score (AVG: 0.635)__: 
        + 0.52911271
        + 0.5358844
        + 0.90904965        
        + 0.56706786
    * __Accuracy (AVG: 0.717)__:
        + 0.64377682
        + 0.65405405
        + 0.91011236
        + 0.65808824

-  __Dataset with smoothing__:
    * __F-score (AVG: 0.564)__:
        + 0.49429745
        + 0.5358844
        + 0.65607763
        + 0.56881985
    * __Accuracy (AVG: 0.655)__: 
        + 0.59656652
        + 0.65405405
        + 0.70786517
        + 0.66176471