In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from scipy.stats import wilcoxon
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import LeaveOneGroupOut, cross_validate
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.tree import _tree

---
__UTILITY FUNCTIONS__

---

In [None]:
def tree_to_code(tree, feature_names):
    """Print a decision tree as the source code of a Python function.

    The tree is read from ``tree.tree_`` (its ``feature``, ``threshold``,
    ``children_left``, ``children_right`` and ``value`` arrays) and written
    to stdout as nested ``if``/``elif`` statements inside ``def tree(...)``.

    Parameters
    ----------
    tree : object
        Object exposing a ``tree_`` attribute with the arrays listed above.
    feature_names : sequence of str
        Name of each feature, indexed by the feature ids stored in
        ``tree_.feature``; also used as the generated function's parameters.

    Returns
    -------
    None
        The generated code is only printed.
    """
    tree_ = tree.tree_
    # Per-node feature name; nodes without a split feature get a placeholder.
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]
    print("def tree({}):".format(", ".join(feature_names)))

    def recurse(node, depth):
        """Print the subtree rooted at ``node``, indented by ``depth`` levels."""
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            # The trailing colon is required for the emitted code to be valid Python.
            print("{}elif {} > {}:".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            # Position of the largest entry of value[node].
            # NOTE(review): this is presumably the class *index*, not the class label - confirm.
            print("{}return {}".format(indent, np.argmax(tree_.value[node])))

    recurse(0, 1)

In [None]:
def scatter_plot(df, var1, var2):
    """Scatter-plot two columns of ``df`` with one colour per class.

    Class codes 0.0 / 1.0 / 2.0 found in ``df['Class']`` are shown in the
    legend as 'Left' / 'Centered' / 'Right'; any other value is shown as its
    string representation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``var1``, ``var2`` and ``'Class'``.
    var1, var2 : str
        Names of the columns plotted on the x and y axes.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Map of class codes to display names.
    class_names = {0.0: 'Left', 1.0: 'Centered', 2.0: 'Right'}

    # Work on plain arrays so that selection is positional: ``df`` is usually
    # a filtered frame whose index labels do not start at 0.
    x = df[var1].to_numpy()
    y = df[var2].to_numpy()

    # Replace known codes by their name; unknown codes are kept (as text)
    # instead of raising KeyError.
    labels = [class_names.get(value, str(value)) for value in df['Class']]
    label_array = np.asarray(labels, dtype=object)

    # Deterministic legend order: known classes first (in map order), then
    # any other value in sorted order.
    present = set(labels)
    legend_order = [name for name in class_names.values() if name in present]
    legend_order += sorted(present.difference(legend_order))

    # One colour per class; plt.get_cmap replaces the removed plt.cm.get_cmap.
    colors = plt.get_cmap('jet', max(len(legend_order), 1))

    for i, name in enumerate(legend_order):
        selected = label_array == name
        plt.scatter(x[selected], y[selected], color=colors(i), label=name)

    plt.xlabel(var1)
    plt.ylabel(var2)
    if legend_order:
        plt.legend()
    plt.show()

---
__DATA LOADING__

---

In [None]:
# One CSV of smoothed measurements per tester.
Lorenzo, Leo, Irene, Carlotta = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./../../Data_unificati/lorenzo_smooth.csv",
        "./../../Data_unificati/leo_smooth.csv",
        "./../../Data_unificati/irene_smooth.csv",
        "./../../Data_unificati/carlotta_smooth.csv",
    )
)

# Stack all testers into a single frame with a fresh 0..n-1 index.
ds = pd.concat([Lorenzo, Leo, Irene, Carlotta], ignore_index=True)

# Features, target and the grouping column used for the per-tester splits.
groups = ds['Tester']
y = ds['Class']
X = ds.drop(['Class', 'Tester'], axis=1)
for part in (X, y, groups):
    print(part.shape)

# Distinct tester names (first occurrence of each).
print(groups.drop_duplicates())

# Cross-validation splitter: each fold holds out one group.
loso_cv = LeaveOneGroupOut()

---
__KNN__

---

In [None]:
# Number of repetitions, kept equal to the seeded RF / AdaBoost runs so that
# the per-fold score tables have the same number of rows.
N_REPEATS_KNN = 200

classifier_KNN = KNeighborsClassifier(n_neighbors=6)
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)

estimators_KNN = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_KNN)]
pipe_KNN = Pipeline(estimators_KNN)

# No step of this pipeline takes a seed and the folds are fixed by `groups`,
# so every repetition would yield the same scores: run the cross-validation
# once instead of 200 times.
scores_KNN = cross_validate(pipe_KNN,
                            X,
                            y,
                            return_estimator=True,
                            cv=loso_cv,
                            n_jobs=-1,
                            groups=groups,
                            error_score="raise",
                            scoring={'fscore': make_scorer(f1_score, average='weighted'),
                                     'accuracy': make_scorer(accuracy_score)}
                            )

# Replicate the single result so the table keeps one block of fold scores
# per repetition (timing columns are therefore repeated, not re-measured).
results_KNN = [pd.DataFrame(scores_KNN)] * N_REPEATS_KNN
final_results_KNN = pd.concat(results_KNN, ignore_index=True)

print(final_results_KNN)

---
__RANDOM FOREST__

---

In [None]:
results_RF = []  # one DataFrame of per-fold scores per seed

# Scorers do not depend on the seed: build them once.
scoring_RF = {'fscore': make_scorer(f1_score, average='weighted'),
              'accuracy': make_scorer(accuracy_score)}

for seed in range(1, 201):
    # Same forest configuration every time; only the random seed changes.
    classifier_RF = RandomForestClassifier(n_estimators=61, min_samples_split=30, min_samples_leaf=1, max_features='sqrt', bootstrap=True, max_depth=10, random_state=seed)
    scaler = RobustScaler()
    feat_sel = SelectKBest(k=2)
    estimators_RF = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_RF)]
    pipe_RF = Pipeline(estimators_RF)

    scores_RF = cross_validate(pipe_RF,
                               X,
                               y,
                               return_estimator=True,
                               cv=loso_cv,
                               n_jobs=-1,
                               groups=groups,
                               error_score="raise",
                               scoring=scoring_RF
                               )

    results_RF.append(pd.DataFrame(scores_RF))

# Concatenate results outside the loop
final_results_RF = pd.concat(results_RF, ignore_index=True)

print(final_results_RF)
# Row with the lowest F-score. idxmin() returns an index *label*, so it must
# be looked up with .loc rather than the positional .iloc.
print(final_results_RF.loc[final_results_RF['test_fscore'].idxmin()])

---
__ADABOOST__

---

In [None]:
results_AB = []
for seed in range(1, 201):
    # Base learner boosted by AdaBoost: a random forest seeded like the booster.
    base_forest = RandomForestClassifier(
        n_estimators=61,
        max_depth=10,
        min_samples_split=30,
        min_samples_leaf=1,
        max_features='sqrt',
        bootstrap=True,
        random_state=seed,
    )
    classifier_AdaBoost = AdaBoostClassifier(estimator=base_forest, n_estimators=400, random_state=seed)

    # scaling -> selection of 2 features -> classifier
    pipe_AdaBoost = Pipeline([
        ('scaling', RobustScaler()),
        ('feature-selection', SelectKBest(k=2)),
        ('clf', classifier_AdaBoost),
    ])

    scorers_AdaBoost = {
        'fscore': make_scorer(f1_score, average='weighted'),
        'accuracy': make_scorer(accuracy_score),
    }
    scores_AdaBoost = cross_validate(
        pipe_AdaBoost,
        X,
        y,
        groups=groups,
        cv=loso_cv,
        scoring=scorers_AdaBoost,
        return_estimator=True,
        error_score="raise",
        n_jobs=-1,
    )
    results_AB.append(pd.DataFrame(scores_AdaBoost))

# Single table with the fold scores of every seed, re-indexed from 0.
final_results_AdaBoost = pd.concat(results_AB, ignore_index=True)
print(final_results_AdaBoost)

---
__PERFORMANCE EVALUATION__

---

In [None]:
# Side-by-side test F-scores of the three models, aligned on the row index.
fscore_sources = {
    'KNN': final_results_KNN,
    'RF': final_results_RF,
    'AB': final_results_AdaBoost,
}
metrics = pd.DataFrame(
    {model: table['test_fscore'] for model, table in fscore_sources.items()}
)
metrics

In [None]:
# `plt` comes from the import cell at the top of the notebook.
# Box plot of the test F-score distribution of each model.
ax = metrics.boxplot(figsize=(3, 3))
ax.set_ylabel('f-score')
plt.show()

In [None]:
# `wilcoxon` comes from the import cell at the top of the notebook.
# Wilcoxon signed-rank test on the row-paired F-scores of RF and AdaBoost.
print(wilcoxon(metrics.RF, metrics.AB))

In [None]:
# Average test F-score of the two ensemble models.
print("Mean of RF: ", np.mean(metrics.RF), sep="")
print("Mean of AB: ", np.mean(metrics.AB), sep="")

In [None]:
# `wilcoxon` comes from the import cell at the top of the notebook.
# Wilcoxon signed-rank test on the row-paired F-scores of KNN and RF.
print(wilcoxon(metrics.KNN, metrics.RF))

---
__TREE ANALYSIS OF ADABOOST__

---

In [None]:

# Rows recorded for tester 'Lorenzo', without the label and tester columns.
lorenzo_rows = ds.loc[ds['Tester'] == 'Lorenzo']
print(lorenzo_rows.drop(['Class', 'Tester'], axis=1))

In [None]:
# Tester whose rows are held out as the test set.
HELD_OUT_TESTER = 'Lorenzo'

classifier_AdaBoost = AdaBoostClassifier(estimator=RandomForestClassifier(n_estimators=61, min_samples_split=30, min_samples_leaf=1, max_features='sqrt', bootstrap=True, max_depth=10), n_estimators=400, random_state=0)
scaler = RobustScaler()
feat_sel = SelectKBest(k=2)

estimators_AdaBoost = [('scaling', scaler), ('feature-selection', feat_sel), ('clf', classifier_AdaBoost)]
pipe_AdaBoost = Pipeline(estimators_AdaBoost)

# Compute the split mask once and reuse it for features and labels.
is_held_out = ds['Tester'] == HELD_OUT_TESTER
feature_table = ds.drop(columns=['Class', 'Tester'])

X_train = feature_table[~is_held_out]
X_test = feature_table[is_held_out]

y_train = ds.loc[~is_held_out, 'Class']
y_test = ds.loc[is_held_out, 'Class']

pipe_AdaBoost.fit(X_train, y_train)
y_pred = pipe_AdaBoost.predict(X_test)

# accuracy_score expects (y_true, y_pred).
print(accuracy_score(y_test, y_pred))

# The k=2 features kept by the fitted feature-selection step.
print(X.columns[pipe_AdaBoost['feature-selection'].get_support()])

In [46]:
# Print the tree at position 1 of the first estimator fitted by the 'clf'
# step, using the names of the columns kept by the feature-selection step.
first_boosted_estimator = pipe_AdaBoost['clf'].estimators_[0]
kept_columns = X.columns[pipe_AdaBoost['feature-selection'].get_support()]
tree_to_code(first_boosted_estimator.estimators_[1], kept_columns)

def tree(meanFreq_MaxS11, mean_S11DEG):
  if meanFreq_MaxS11 <= 0.15777262393385172:
    if mean_S11DEG <= 0.1628556177020073:
      return 1
    elif mean_S11DEG > 0.1628556177020073
      if meanFreq_MaxS11 <= -1.5672853589057922:
        return 0
      elif meanFreq_MaxS11 > -1.5672853589057922
        return 0
  elif meanFreq_MaxS11 > 0.15777262393385172
    if meanFreq_MaxS11 <= 0.76450115442276:
      if meanFreq_MaxS11 <= 0.30162413418293:
        return 2
      elif meanFreq_MaxS11 > 0.30162413418293
        return 2
    elif meanFreq_MaxS11 > 0.76450115442276
      if meanFreq_MaxS11 <= 0.8689095079898834:
        return 1
      elif meanFreq_MaxS11 > 0.8689095079898834
        return 1


In [None]:
# Scatter of two frequency features for tester 'Lorenzo', coloured by class.
# NOTE(review): the tree printed above splits on 'mean_S11DEG', not
# 'meanFreq_MaxS22' - confirm this is the intended pair of features.
lorenzo_rows = ds.loc[ds['Tester'] == 'Lorenzo']
scatter_plot(lorenzo_rows, 'meanFreq_MaxS11', 'meanFreq_MaxS22')