# "Stati emozionali alla guida" ("Emotinal states while driving")

## Parte 1 - manipolazione dei dati

Questo codice importa diverse librerie e moduli comunemente utilizzati per la manipolazione dei dati, l'apprendimento automatico e le attività di valutazione. Queste librerie forniscono funzionalità per lavorare con file, dataframes, array, suddividere i dataset, implementare algoritmi di apprendimento automatico, calcolare l'accuratezza e ridimensionare le caratteristiche.

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
fp = open("results.txt", "w")

Questo codice può essere utile quando bisogno di leggere più file Excel e memorizzarli come DataFrames separati per ulteriori elaborazioni o analisi.

In [3]:
# Imposto il percorso
folder_path2 = '..\Dati\Dati_Luigi_Nuovo'

excel_files2 = glob.glob(os.path.join(folder_path2, "*.xlsx"))

# Creo lista per i dataframes
dfs2 = []

# Creo una lista di colonna per import dei file excel, non essendoci intestazione
column_headers = [chr(ord('A') + i) for i in range(32)]

In [4]:
print(column_headers)

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`']


Questo codice legge più file Excel, estrae colonne specifiche che sono quelle che mi servono, aggiunge una colonna 'File_Name' basata sul nome del file e unisce tutti i DataFrames risultanti in un singolo DataFrame.

In [5]:
# Per ogni file, salvo i dati su dataframe
for excel_file2 in excel_files2:
    # leggo i file e specifico le colonne
    df2 = pd.read_excel(excel_file2, header=None, names=column_headers[:pd.read_excel(excel_file2).shape[1]])
    df2['File_Name'] = os.path.basename(excel_file2)  # Aggiungo la colonna 'File_Name' utilizzando il nome di base del file
    
    df2 = df2[['File_Name', 'D', 'H', 'J', 'L']] # Nome File, Time, HR, RR, HRV solo queste colonne da salvare

    dfs2.append(df2)

# Unisco tutti i DataFrames in un unico DataFrame
merged_df2 = pd.concat(dfs2)

In [6]:
print(merged_df2)

                  File_Name         D   H   J     L
0     AVL_BD_20211104G.xlsx  10:07:43  73  22   872
1     AVL_BD_20211104G.xlsx  10:07:43  73  22   872
2     AVL_BD_20211104G.xlsx  10:07:44  72  22   849
3     AVL_BD_20211104G.xlsx  10:07:45  74  22   842
4     AVL_BD_20211104G.xlsx  10:07:46  71  24  1068
...                     ...       ...  ..  ..   ...
5587  AVL_SN_20211103G.xlsx  18:25:41  73  15   812
5588  AVL_SN_20211103G.xlsx  18:25:42  73  15   733
5589  AVL_SN_20211103G.xlsx  18:25:43  74  15   760
5590  AVL_SN_20211103G.xlsx  18:25:44  74  15   747
5591  AVL_SN_20211103G.xlsx  18:25:45  75  15   762

[105616 rows x 5 columns]


Lo scopo di questo codice è leggere più file Excel, aggiungere colonne aggiuntive a ciascun DataFrame e raccogliere tutti i DataFrames in una lista per ulteriori elaborazioni.

In [7]:
folder_path1 = "..\Dati\Dati_Luigi_arousal"

excel_files1 = glob.glob(os.path.join(folder_path1, "*.xlsx"))

dfs1 = []

for i, excel_file1 in enumerate(excel_files1):
    df1 = pd.read_excel(excel_file1)
    df1['File_Name'] = os.path.basename(excel_file1) 
    df1['Progressivo'] = i  # Aggiungo la colonna 'Progressivo' con il numero del file, magari può essere utile se si vuole studiare singolo file
    dfs1.append(df1)

In [8]:
# join tutti i DataFrames
merged_df1 = pd.concat(dfs1)

merged_df1.reset_index(drop=True, inplace=True)

In [9]:
print(merged_df1)

            Time  Valance  Arousal              File_Name  Progressivo
0       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0
1       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0
2       10:07:44       -1      201  AVL_BD_20211104G.xlsx            0
3       10:07:45       -1      201  AVL_BD_20211104G.xlsx            0
4       10:07:46       -1      201  AVL_BD_20211104G.xlsx            0
...          ...      ...      ...                    ...          ...
105599  18:25:41       84      116  AVL_SN_20211103G.xlsx           14
105600  18:25:42       83      117  AVL_SN_20211103G.xlsx           14
105601  18:25:43       82      118  AVL_SN_20211103G.xlsx           14
105602  18:25:44       81      119  AVL_SN_20211103G.xlsx           14
105603  18:25:45       80      120  AVL_SN_20211103G.xlsx           14

[105604 rows x 5 columns]


In [10]:
# rinomino colonne per essere più parlanti
merged_df2 = merged_df2.rename(columns={'D':'Time','H':'HR','J':'RR','L':'HRV'})

In [11]:
print(merged_df2)

                  File_Name      Time  HR  RR   HRV
0     AVL_BD_20211104G.xlsx  10:07:43  73  22   872
1     AVL_BD_20211104G.xlsx  10:07:43  73  22   872
2     AVL_BD_20211104G.xlsx  10:07:44  72  22   849
3     AVL_BD_20211104G.xlsx  10:07:45  74  22   842
4     AVL_BD_20211104G.xlsx  10:07:46  71  24  1068
...                     ...       ...  ..  ..   ...
5587  AVL_SN_20211103G.xlsx  18:25:41  73  15   812
5588  AVL_SN_20211103G.xlsx  18:25:42  73  15   733
5589  AVL_SN_20211103G.xlsx  18:25:43  74  15   760
5590  AVL_SN_20211103G.xlsx  18:25:44  74  15   747
5591  AVL_SN_20211103G.xlsx  18:25:45  75  15   762

[105616 rows x 5 columns]


In [12]:
# join dei due dataframe per nome file e time. Se time non esiste nel join, i dati sono inservibili e vengono scartati
joined_df = pd.merge(merged_df1, merged_df2, on=['File_Name','Time'], how='left')

print(joined_df)

            Time  Valance  Arousal              File_Name  Progressivo  HR   
0       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73  \
1       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
2       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
3       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
4       10:07:44       -1      201  AVL_BD_20211104G.xlsx            0  72   
...          ...      ...      ...                    ...          ...  ..   
169461  18:25:41       84      116  AVL_SN_20211103G.xlsx           14  73   
169462  18:25:42       83      117  AVL_SN_20211103G.xlsx           14  73   
169463  18:25:43       82      118  AVL_SN_20211103G.xlsx           14  74   
169464  18:25:44       81      119  AVL_SN_20211103G.xlsx           14  74   
169465  18:25:45       80      120  AVL_SN_20211103G.xlsx           14  75   

        RR  HRV  
0       22  872  
1       22  872  
2       2

## Parte 2 - Equal Width Binning

Taglio automaticamente i dati dell'arousal in 4 bin

In [13]:
joined_df['Arousal_Bin'] = pd.cut(joined_df['Arousal'], bins=4)

In [14]:
print(joined_df)

            Time  Valance  Arousal              File_Name  Progressivo  HR   
0       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73  \
1       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
2       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
3       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
4       10:07:44       -1      201  AVL_BD_20211104G.xlsx            0  72   
...          ...      ...      ...                    ...          ...  ..   
169461  18:25:41       84      116  AVL_SN_20211103G.xlsx           14  73   
169462  18:25:42       83      117  AVL_SN_20211103G.xlsx           14  73   
169463  18:25:43       82      118  AVL_SN_20211103G.xlsx           14  74   
169464  18:25:44       81      119  AVL_SN_20211103G.xlsx           14  74   
169465  18:25:45       80      120  AVL_SN_20211103G.xlsx           14  75   

        RR  HRV     Arousal_Bin  
0       22  872  (151.0, 201.

In [15]:
# ogni bin [50,100) avrà un numero progressivo da 0 a 3
bin_mapping = {bin_val: i for i, bin_val in enumerate(joined_df['Arousal_Bin'].unique())}

# sort dei bin
sorted_bins = sorted(joined_df['Arousal_Bin'].unique())

# mappo ogni bin al numero
bin_mapping = {bin_val: i for i, bin_val in enumerate(sorted_bins)}

# salvo il mapping in una nuova colonna
joined_df['Bin_Num'] = joined_df['Arousal_Bin'].map(bin_mapping)

In [16]:
print(joined_df)

            Time  Valance  Arousal              File_Name  Progressivo  HR   
0       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73  \
1       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
2       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
3       10:07:43       -1      201  AVL_BD_20211104G.xlsx            0  73   
4       10:07:44       -1      201  AVL_BD_20211104G.xlsx            0  72   
...          ...      ...      ...                    ...          ...  ..   
169461  18:25:41       84      116  AVL_SN_20211103G.xlsx           14  73   
169462  18:25:42       83      117  AVL_SN_20211103G.xlsx           14  73   
169463  18:25:43       82      118  AVL_SN_20211103G.xlsx           14  74   
169464  18:25:44       81      119  AVL_SN_20211103G.xlsx           14  74   
169465  18:25:45       80      120  AVL_SN_20211103G.xlsx           14  75   

        RR  HRV     Arousal_Bin Bin_Num  
0       22  872  (151

## Parte 3 - Random Forest

Scelto:
150 alberi, parametro trovato con grid search per migliorare accuracy. 100 albero erano pochi per il problema
30 foglie ad albero massimo
10 livelli per ogni albero
5 dati necessari per fare split
2 dati minimo per foglia

In [17]:
fp.write("Experiment for Random Forest\n")

max_depth=10
min_samples_split=5
min_samples_leaf=2
max_features="sqrt"
n_estimators=150
max_leaf_nodes=30

fp.write(f"Parameters:\n max_depth\t{max_depth}\n min_samples_split\t{min_samples_split}\n min_samples_leaf\t{min_samples_leaf}\n max_features\t{max_features}\n n_estimators\t{n_estimators}\n max_leaf_nodes\t{max_leaf_nodes} \n")
         

124

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# liste per salvare modelli e accuracies
rf_models = []
rf_accuracies = []

for file_name in joined_df['File_Name'].unique():

    file_data = joined_df[joined_df['File_Name'] == file_name]

    # ottengo i dati di input e output X->y dove y è la label
    X = file_data[['HR', 'RR', 'HRV']] 
    y = file_data['Bin_Num']

    # Converto la colonna 'Time' in ora minuto e secondo
    #X.loc[:, 'Hour'] = file_data['Time'].apply(lambda x: x.hour)
    #X.loc[:, 'Minute'] = file_data['Time'].apply(lambda x: x.minute)
    #X.loc[:, 'Second'] = file_data['Time'].apply(lambda x: x.second)

    X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
    #X = X.drop_duplicates(subset = "Timestamp")
    
    # scaling dei dati da 0 a 1
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # split in training e test (70% training, 30% testing)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    # creo modello RF
    # rf_model = RandomForestClassifier()
    rf_model = RandomForestClassifier(max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, max_features=max_features, n_estimators=n_estimators, max_leaf_nodes=max_leaf_nodes)


    # fit modello su training
    rf_model.fit(X_train, y_train)

    # evaluation modello su test
    y_pred = rf_model.predict(X_test)

    # calcolo accuracy
    accuracy = accuracy_score(y_test, y_pred)
    rf_accuracies.append(accuracy)

    # salvo modello
    rf_models.append(rf_model)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [19]:
for i, accuracy in enumerate(rf_accuracies):
    print(f"Accuracy for File {i+1}: {accuracy}")
    fp.write(f"Accuracy for File {i+1}: {accuracy}\n")

Accuracy for File 1: 0.9287479406919276
Accuracy for File 2: 0.7963752665245203
Accuracy for File 3: 0.9948096885813149
Accuracy for File 4: 0.9113887274840209
Accuracy for File 5: 0.8528846153846154
Accuracy for File 6: 0.7814003206841261
Accuracy for File 7: 0.8906964921199797
Accuracy for File 8: 0.8063591347406828
Accuracy for File 9: 0.8788706739526412
Accuracy for File 10: 0.8657543173672206
Accuracy for File 11: 0.7956448911222781
Accuracy for File 12: 0.9561808118081181
Accuracy for File 13: 0.9365131578947369
Accuracy for File 14: 0.9547991071428571
Accuracy for File 15: 0.8768412438625205


In [20]:
print("Max  accuracy for RF: ",max(rf_accuracies))
print("Min  accuracy for RF: ",min(rf_accuracies))
print("Mean accuracy for RF: ",sum(rf_accuracies)/len(rf_accuracies))

fp.write(f"\nMax  accuracy for RF: {max(rf_accuracies)}")
fp.write(f"\nMin  accuracy for RF: {min(rf_accuracies)}")
fp.write(f"\nMean accuracy for RF: {sum(rf_accuracies)/len(rf_accuracies)}")

Max  accuracy for RF:  0.9948096885813149
Min  accuracy for RF:  0.7814003206841261
Mean accuracy for RF:  0.8818177592907707


41

## Parte 4 - SVM

data are not linearly separable, kernel='linear' points to worst results
parameter C = 500 to set a soft svm

In [21]:
fp.write("\nExperiment for SVM\n")
C=500
fp.write(f"Parameters:\n C\t{C}\n")

19

In [22]:
from sklearn.svm import SVC

svm_models = []
svm_accuracies = []

for file_name in joined_df['File_Name'].unique():
    
    file_data = joined_df[joined_df['File_Name'] == file_name]

    X = file_data[['HR', 'RR', 'HRV']]  
    y = file_data['Bin_Num']

    #X.loc[:, 'Hour'] = file_data['Time'].apply(lambda x: x.hour)
    #X.loc[:, 'Minute'] = file_data['Time'].apply(lambda x: x.minute)
    #X.loc[:, 'Second'] = file_data['Time'].apply(lambda x: x.second)

    X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
    #X = X.drop_duplicates(subset = "Timestamp")
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    #svm_model = SVC()
    #svm_model = SVC(C=1.0, kernel='rbf', gamma='scale', shrinking=True)
    svm_model = SVC(C=C, kernel='rbf', gamma='scale', shrinking=True, decision_function_shape='ovr', random_state=42)

    svm_model.fit(X_train, y_train)

    y_pred = svm_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    svm_accuracies.append(accuracy)

    svm_models.append(svm_model)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [23]:
for i, accuracy in enumerate(svm_accuracies):
    print(f"Accuracy for File {i+1}: {accuracy}")
    fp.write(f"Accuracy for File {i+1}: {accuracy}\n")

Accuracy for File 1: 0.8179571663920923
Accuracy for File 2: 0.701137171286425
Accuracy for File 3: 0.9388696655132641
Accuracy for File 4: 0.7669959325973271
Accuracy for File 5: 0.7355769230769231
Accuracy for File 6: 0.7306253340459647
Accuracy for File 7: 0.7147941026944585
Accuracy for File 8: 0.7638780297107115
Accuracy for File 9: 0.8128415300546448
Accuracy for File 10: 0.7243401759530792
Accuracy for File 11: 0.7548408710217756
Accuracy for File 12: 0.7822878228782287
Accuracy for File 13: 0.8657894736842106
Accuracy for File 14: 0.7561383928571429
Accuracy for File 15: 0.8162847790507365


In [24]:
print("Max  accuracy per RF: ",max(svm_accuracies))
print("Min  accuracy per RF: ",min(svm_accuracies))
print("Mean accuracy per RF: ",sum(svm_accuracies)/len(svm_accuracies))

fp.write(f"\nMax  accuracy for RF: {max(svm_accuracies)}")
fp.write(f"\nMin  accuracy for RF: {min(svm_accuracies)}")
fp.write(f"\nMean accuracy for RF: {sum(svm_accuracies)/len(svm_accuracies)}")

Max  accuracy per RF:  0.9388696655132641
Min  accuracy per RF:  0.701137171286425
Mean accuracy per RF:  0.7788238247211322


41

# Part 5 

### Part with circular array implementation

In [25]:
fp.write("Experiment for Circular Array implementation\n")

45

In [26]:
def create_sequences(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

from sklearn.svm import SVC

svm_models = []
svm_accuracies = []

for file_name in joined_df['File_Name'].unique():
    
    file_data = joined_df[joined_df['File_Name'] == file_name]

    X = file_data[['HR', 'RR', 'HRV']]  
    y = file_data['Bin_Num']

    #X.loc[:, 'Hour'] = file_data['Time'].apply(lambda x: x.hour)
    #X.loc[:, 'Minute'] = file_data['Time'].apply(lambda x: x.minute)
    #X.loc[:, 'Second'] = file_data['Time'].apply(lambda x: x.second)

    X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
    #X = X.drop_duplicates(subset = "Timestamp")
    
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)
    
    time_steps = 10
    X_seq, y_seq = create_sequences(X, y, time_steps)

    # Reshape X_seq to 2D because SVM doesn't accept 3D input
    X_seq = X_seq.reshape(X_seq.shape[0], -1)

    X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.3, random_state=42)
    
    svm_model = SVC(C=1.0, kernel='rbf', gamma='scale', shrinking=True, decision_function_shape='ovr', random_state=42)
    #print(f"Shapes: X_train={X_train.shape}, y_train={y_train.shape}, X_test={X_test.shape}, y_test={y_test.shape}")
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)
    #print(f"y_pred shape: {y_pred.shape}")
    accuracy = accuracy_score(y_test, y_pred)
    #print(f"Accuracy: {accuracy}")

    svm_accuracies.append(accuracy)

    svm_models.append(svm_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See th

In [27]:
for i, accuracy in enumerate(svm_accuracies):
    print(f"Accuracy for File {i+1}: {accuracy}")

Accuracy for File 1: 0.6160824742268042
Accuracy for File 2: 0.472073995019566
Accuracy for File 3: 0.6302715193529752
Accuracy for File 4: 0.5010177377144519
Accuracy for File 5: 0.4991979467436638
Accuracy for File 6: 0.5251605995717344
Accuracy for File 7: 0.40376782077393075
Accuracy for File 8: 0.44835680751173707
Accuracy for File 9: 0.6712266301869585
Accuracy for File 10: 0.4523809523809524
Accuracy for File 11: 0.5805522047982844
Accuracy for File 12: 0.47575057736720555
Accuracy for File 13: 0.7579848534738228
Accuracy for File 14: 0.4074902179988821
Accuracy for File 15: 0.507988529291274


In [28]:
print("Max  accuracy per RF: ",max(svm_accuracies))
print("Min  accuracy per RF: ",min(svm_accuracies))
print("Mean accuracy per RF: ",sum(svm_accuracies)/len(svm_accuracies))

Max  accuracy per RF:  0.7579848534738228
Min  accuracy per RF:  0.40376782077393075
Mean accuracy per RF:  0.5299535244274829


# Part 6

### Grid search

In [29]:
fp.write("\nGrid Search:")
C_values = [0.1, 1, 10, 20,100,1000]
time_steps_values = [1,5,10,20,50]
fp.write(f"\nParameters:\nC_Values=\t{C_values}\ntime_steps_values=\t{time_steps_values}\n")
fp.write("Results:")

8

In [30]:
def create_sequences(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X.iloc[i:(i + time_steps)].values)
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

from sklearn.svm import SVC

# Define the parameter grid
C_values = [0.1, 1, 10, 20,50,100,500,1000]
time_steps_values = [1,5,10,20,50,100]
#time_steps_values = [5, 10, 15, 20, 25, 30]

best_accuracy = -1
best_params = {'C': None, 'time_steps': None}
best_accuracies = []

for C in C_values:
    for time_steps in time_steps_values:
        svm_accuracies = []
        
        for file_name in joined_df['File_Name'].unique():
            file_data = joined_df[joined_df['File_Name'] == file_name]
            X = file_data[['HR', 'RR', 'HRV']].copy()
            y = file_data['Bin_Num'].copy()
            X.loc[:, 'Timestamp'] = file_data['Time'].apply(lambda x: x.hour) * 3600 + file_data['Time'].apply(lambda x: x.minute) * 60 + file_data['Time'].apply(lambda x: x.second)
            
            X_seq, y_seq = create_sequences(X, y, time_steps)
            X_seq = X_seq.reshape(X_seq.shape[0], -1)
            
            scaler = MinMaxScaler()
            X_scaled = scaler.fit_transform(X_seq)
            
            X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_seq, test_size=0.3, random_state=42)
            
            svm_model = SVC(C=C, kernel='rbf', gamma='scale', shrinking=True, decision_function_shape='ovr', random_state=42)
            svm_model.fit(X_train, y_train)
            y_pred = svm_model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            svm_accuracies.append(accuracy)
        
        mean_accuracy = np.mean(svm_accuracies)
        fp.write(f"\nFor C {C} and time_steps {time_steps} the mean_accuracy is {mean_accuracy}")
        
        # Update the best params if current configuration has better accuracy
        if mean_accuracy > best_accuracy:
            best_accuracies = svm_accuracies.copy()
            best_accuracy = mean_accuracy
            best_params['C'] = C
            best_params['time_steps'] = time_steps
            print(best_params, mean_accuracy)


{'C': 0.1, 'time_steps': 1} 0.6450206473037841
{'C': 0.1, 'time_steps': 5} 0.6503825603742721
{'C': 0.1, 'time_steps': 10} 0.6580756299762686
{'C': 0.1, 'time_steps': 20} 0.6678221920240468
{'C': 0.1, 'time_steps': 50} 0.6777818471678769
{'C': 0.1, 'time_steps': 100} 0.6953480548975433
{'C': 1, 'time_steps': 1} 0.6954786356846971
{'C': 1, 'time_steps': 5} 0.7024879889280181
{'C': 1, 'time_steps': 10} 0.7111373486050161
{'C': 1, 'time_steps': 20} 0.7257872619200472
{'C': 1, 'time_steps': 50} 0.7593372484426203
{'C': 1, 'time_steps': 100} 0.8023622286073772
{'C': 10, 'time_steps': 50} 0.8351246640250714
{'C': 10, 'time_steps': 100} 0.9044091618676444
{'C': 20, 'time_steps': 100} 0.9282312098389519
{'C': 50, 'time_steps': 100} 0.954498340233825
{'C': 100, 'time_steps': 100} 0.9654982416462282
{'C': 500, 'time_steps': 100} 0.975413353825979
{'C': 1000, 'time_steps': 100} 0.9764304532622989


In [31]:
print("Best parameters: ", best_params)
print(best_accuracies)

Best parameters:  {'C': 1000, 'time_steps': 100}
[0.9666388657214345, 0.9112787356321839, 0.9988262910798122, 0.9756740914419695, 0.9702265372168285, 0.9804454101032047, 0.9850283944243676, 0.9831888626214867, 0.989843028624192, 0.9753208292201382, 0.9589123867069487, 0.990177736202058, 0.9787375415282392, 0.9937570942111237, 0.9884009942004971]


In [32]:
fp.write("\n\nConclusion of Grid Search.")
fp.write(f"\n\nBest parameters are {best_params} and corresponding accuracies are:")
for i, accuracy in enumerate(best_accuracies):
    print(f"Accuracy for File {i+1}: {accuracy}")
    fp.write(f"\nAccuracy for File {i+1}: {accuracy}\n")

Accuracy for File 1: 0.9666388657214345
Accuracy for File 2: 0.9112787356321839
Accuracy for File 3: 0.9988262910798122
Accuracy for File 4: 0.9756740914419695
Accuracy for File 5: 0.9702265372168285
Accuracy for File 6: 0.9804454101032047
Accuracy for File 7: 0.9850283944243676
Accuracy for File 8: 0.9831888626214867
Accuracy for File 9: 0.989843028624192
Accuracy for File 10: 0.9753208292201382
Accuracy for File 11: 0.9589123867069487
Accuracy for File 12: 0.990177736202058
Accuracy for File 13: 0.9787375415282392
Accuracy for File 14: 0.9937570942111237
Accuracy for File 15: 0.9884009942004971


In [33]:
fp.close()