## Classification

### Libraries

In [200]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sktime.classification.kernel_based import RocketClassifier
from joblib import dump

### Importing the dataset

In [201]:
df = pd.read_csv('dataset.csv', decimal = ',')
df['Tskin'] = df['Tskin'].astype(float)
df['Tamb'] = df['Tamb'].astype(float)
df['ambient_RH'] = df['ambient_RH'].astype(float)
df['skin_RH'] = df['skin_RH'].astype(float)
df['HR'] = df['HR'].astype(float)
df['HR_confid'] = df['HR_confid'].astype(float)
df['ACT_est'] = df['ACT_est'].astype(float)
df['height'] = df['height'].astype(float)
df['weight'] = df['weight'].astype(float)
df['exhaustion'] = df['exhaustion'].astype(str)
df['ts'] = pd.to_datetime(df['ts'])
df['date'] = df['ts'].dt.strftime('%Y-%m-%d') 
df['time'] = df['ts'].dt.strftime('%H:%M')
df = df.drop(['intensity_label', 'height', 'weight', 'session_id'], axis=1)

In [202]:
'''import dataframe_image as dfi
df_swr = df_swr.drop(['id', 'date', 'time', 'age', 'gender', 'intensity', 'session_id'], axis=1)
dfi.export(df_swr.head().style, "dataframe.png", table_conversion="Async")'''

'import dataframe_image as dfi\ndf_swr = df_swr.drop([\'id\', \'date\', \'time\', \'age\', \'gender\', \'intensity\', \'session_id\'], axis=1)\ndfi.export(df_swr.head().style, "dataframe.png", table_conversion="Async")'

## Random Forest

### 1. Method
Splitting by the id. For each ID, we use the 75% for the training set and the 25% for the test set.

In [203]:
def f(df, k):  #k is the dimension of the training set
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    id = df['id'].unique()

    
      # Creating the two datasets
    for element in id:
      df1 = df.loc[df['id'] == element]
      df1 = df1.sort_values('ts')
      split_point = int(k * len(df1))
      train_df = df1[:split_point]
      test_df = df1[split_point:]
      df_train = pd.concat([df_train, train_df])
      df_test = pd.concat([df_test, test_df])
        
     # Only numerical variables   
    df_train_n = df_train.select_dtypes(include = 'number') 
    df_test_n = df_test.select_dtypes(include = 'number') 
    
    
    target = 'exhaustion'
    columns = [c for c in df_train_n.columns if c != target]
    
    X_train = df_train_n[columns].values
    y_train = df_train[target].values
      
    X_test = df_test_n[columns].values
    y_test = df_test[target].values
      
    clf = RandomForestClassifier()
    
    param_dict = {
         'n_estimators': [50, 100],               # Numero alberi
         'max_depth': [3, 5, 7],                  # Profondità
         'min_samples_split': [10, 20],           # Dati per fare split
         'min_samples_leaf': [5, 10],             # Foglie 
         'max_features': ['sqrt', 'log2'],        # Feature per split 
         'bootstrap': [True]#,                     # Obbligatorio per varietà
         #'ccp_alpha': [0.05, 0.1, 0.2]            # Pruning
              }
    rands = RandomizedSearchCV(clf, param_dict, cv = 5, scoring = 'accuracy', refit = True, n_iter = 5)
    rands.fit(X_train, y_train) 

    clf = rands.best_estimator_

    clf.fit(X_train, y_train)

    #y_pred_train=clf.predict(X_train)

    y_pred = clf.predict(X_test)
        
        
        
    print('Using a training_set size of {}'.format(k))
   # print(classification_report(y_pred_train, y_train))
    print(classification_report(y_pred, y_test))
    return clf
   # return dump(clf, "model.pkl") 

In [204]:
f(df, 0.7) 

Using a training_set size of 0.7
              precision    recall  f1-score   support

         1.0       0.90      0.95      0.92       498
         2.0       0.80      0.87      0.83       491
         3.0       0.96      0.75      0.84       685
         4.0       0.35      0.97      0.52        33
         5.0       0.43      0.94      0.59        17

    accuracy                           0.85      1724
   macro avg       0.69      0.90      0.74      1724
weighted avg       0.88      0.85      0.85      1724



In [205]:
f(df, 0.8) 

Using a training_set size of 0.8
              precision    recall  f1-score   support

         1.0       0.64      0.78      0.70       288
         2.0       0.67      0.76      0.71       318
         3.0       0.94      0.61      0.74       549
         4.0       0.00      0.00      0.00         0
         5.0       0.00      0.00      0.00         0

    accuracy                           0.69      1155
   macro avg       0.45      0.43      0.43      1155
weighted avg       0.79      0.69      0.72      1155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2. Method
We use the 75% of the dataset for the training set and the 25% for the test set, NORMALIZING the timestamps.

In [206]:
def f(df, k):
    
    df1 = df.copy()
    
    df1["t_norm"] = df1.groupby("id")["ts"].transform(lambda x: x - x.min()) # Normalization
    
    df1 = df1.sort_values('t_norm')
    
    df_n = df1.select_dtypes(include = 'float') 
    
    target = 'exhaustion' 
    columns = [c for c in df_n.columns if c != target]

    X = df_n[columns].values
    y = df1[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k, random_state = 0, shuffle = False)
    
    clf = RandomForestClassifier()
    
    param_dict = {
    'n_estimators': [50, 100],               # Numero alberi
    'max_depth': [3, 5, 7],                  # Profondità
    'min_samples_split': [10, 20],           # Dati per fare split
    'min_samples_leaf': [5, 10],             # Foglie 
    'max_features': ['sqrt', 'log2'],        # Feature per split 
    'bootstrap': [True],                     # Obbligatorio per varietà
    'ccp_alpha': [0.05, 0.1, 0.2]            # Pruning
          }
    rands = RandomizedSearchCV(clf, param_dict, cv = 5, scoring = 'accuracy', refit = True, n_iter = 5)
    rands.fit(X_train, y_train) 

    clf = rands.best_estimator_

    clf.fit(X_train, y_train)

    y_pred_train = clf.predict(X_train)

    y_pred = clf.predict(X_test)

    accuracy_score(y_test, y_pred)
    
    print('Using a test_set size of {}'.format(k))
    #print(classification_report(y_pred_train, y_train))
    print(classification_report(y_pred, y_test)) 
    
    #return dump(clf, "model.pkl") 

In [207]:
f(df, 0.2) #test_set 20%

Using a test_set size of 0.2
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00         0
         2.0       1.00      0.29      0.45      1141
         3.0       0.00      0.00      0.00         0
         4.0       0.00      0.00      0.00         0

    accuracy                           0.29      1141
   macro avg       0.25      0.07      0.11      1141
weighted avg       1.00      0.29      0.45      1141



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


['model.pkl']

In [None]:
f(df, 0.25) #test_set 25%

In [None]:
f(df, 0.3) #test_set 30%

In [None]:
f(df, 0.35) #test_set 35%

## Rocket

### 1. Method

In [None]:
def f(df, k):  #k is the dimension of the training set
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    id = df['id'].unique()
    
    # Creating the two datasets
    for element in id:
      df1 = df.loc[df['id'] == element]
      df1 = df1.sort_values('ts')
      split_point = int(k * len(df1))
      train_df = df1[:split_point]
      test_df = df1[split_point:]
      df_train = pd.concat([df_train, train_df])
      df_test = pd.concat([df_test, test_df])
        
     # Only numerical variables   
    df_train_n = df_train.select_dtypes(include = 'number') 
    df_test_n = df_test.select_dtypes(include = 'number') 
    
    target = 'exhaustion' 
    columns = [c for c in df_train_n.columns if c != target]
    
    X_train = df_train_n[columns].values
    y_train = df_train[target].values

    X_test = df_test_n[columns].values
    y_test = df_test[target].values
    
    clf = RocketClassifier(num_kernels=350, random_state=0)

    cv = StratifiedKFold(n_splits=5, shuffle = False)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('Using a training_set size of {}'.format(k))
    print(classification_report(y_pred, y_test)) 
    
    # return dump(clf, "model.pkl") 

In [None]:
f(df, 0.7)

In [None]:
f(df, 0.75)

In [None]:
f(df, 0.8)

In [None]:
f(df, 0.85)

In [None]:
f(df, 0.9)

### 2. Method

In [None]:
def f(df, k):
    
    df1 = df.copy()
    
    df1["t_norm"] = df1.groupby("id")["ts"].transform(lambda x: x - x.min()) # Normalization
    
    df1 = df1.sort_values('t_norm')
    
    df_n = df1.select_dtypes(include = 'float') 
    
    target = 'exhaustion' 
    columns = [c for c in df_n.columns if c != target]

    X = df_n[columns].values
    y = df[target].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = k, random_state = 0, shuffle = False)
    
    clf = RocketClassifier(num_kernels=350, random_state=0)

    cv = StratifiedKFold(n_splits=5, shuffle=False)
    scores = cross_val_score(clf, X_train, y_train, cv=cv, scoring='accuracy')

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('Using a test_set size of {}'.format(k))
    print(classification_report(y_pred, y_test)) 
    
    # return dump(clf, "model.pkl") 

In [None]:
f(df, 0.2)

In [None]:
f(df, 0.25)

In [None]:
f(df, 0.3)