# Machine Learning

In [35]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.ensemble import RUSBoostClassifier


In [31]:
X_resampled = pd.read_csv('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/train/X_train_A_P_1956-02-21_OFF_OFF_all_extraction_features_over100.csv')
y_resampled = pd.read_csv('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/train/y_train_A_P_1956-02-21_OFF_OFF_all_extraction_features_over100.csv')
X_test = pd.read_csv('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/test/X_test_A_P_1956-02-21_OFF_OFF_all_extraction_features_over100.csv')
y_test = pd.read_csv('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/test/y_test_A_P_1956-02-21_OFF_OFF_all_extraction_features_over100.csv')



In [32]:
# On transforme les étiquettes en Series
y_resampled = y_resampled.iloc[:, -1]
y_test = y_test.iloc[:, -1]

# Nb échantillons de chaque classe dans train et test

In [33]:
# Pour y_resampled
class_counts_resampled = y_resampled.value_counts()

# Pour y_test
class_counts_test = y_test.value_counts()

print("Nombre d'échantillons par classe dans y_resampled :")
print(class_counts_resampled)

print("Nombre d'échantillons par classe dans y_test :")
print(class_counts_test)


Nombre d'échantillons par classe dans y_resampled :
0    1524
1    1524
Name: label, dtype: int64
Nombre d'échantillons par classe dans y_test :
0    654
1    165
Name: label, dtype: int64


# Application du RandomForest

In [38]:
# Initialisation du RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Entraînement du modèle sur les données d'entraînement
rf.fit(X_resampled, y_resampled)

# Prédictions
y_pred = rf.predict(X_test)

# Évaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Matrice de confusion :\n", conf_matrix)
print("Rapport de classification :\n", class_report)

Matrice de confusion :
 [[629  25]
 [ 23 142]]
Rapport de classification :
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       654
           1       0.85      0.86      0.86       165

    accuracy                           0.94       819
   macro avg       0.91      0.91      0.91       819
weighted avg       0.94      0.94      0.94       819



True Negatives (TN): 487 - Le modèle a correctement prédit la classe négative (non-FOG) pour 487 échantillons.

False Positives (FP): 29 - Le modèle a incorrectement prédit la classe positive (FOG) pour 29 échantillons qui étaient en réalité négatifs.

False Negatives (FN): 32 - Le modèle a incorrectement prédit la classe négative pour 32 échantillons qui étaient en réalité positifs.

True Positives (TP): 182 - Le modèle a correctement prédit la classe positive pour 182 échantillons.

Rappel (également appelé sensibilité ou taux de vrais positifs) est la proportion des cas positifs réels que le modèle a correctement identifié

# Application du RUSBOOSTING

In [7]:
# Initialiser le modèle RUSBoost
rusboost = RUSBoostClassifier(n_estimators=100, random_state=42)

# Entraînement du modèle sur les données d'entraînement
rusboost.fit(X_resampled, y_resampled)

# Prédictions
y_pred = rusboost.predict(X_test)

# Évaluation
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Matrice de confusion :\n", conf_matrix)
print("Rapport de classification :\n", class_report)

Matrice de confusion :
 [[303  92]
 [ 54 116]]
Rapport de classification :
               precision    recall  f1-score   support

           0       0.85      0.77      0.81       395
           1       0.56      0.68      0.61       170

    accuracy                           0.74       565
   macro avg       0.70      0.72      0.71       565
weighted avg       0.76      0.74      0.75       565



# SVM

In [39]:
from sklearn import svm
# Pour la classification
model = svm.SVC(kernel='linear')  # Le noyau peut être 'linear', 'poly', 'rbf', 'sigmoid', etc.
model.fit(X_resampled, y_resampled)

# Prédiction
y_pred = model.predict(X_test)

# Évaluation
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))




[[632  22]
 [ 13 152]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       654
           1       0.87      0.92      0.90       165

    accuracy                           0.96       819
   macro avg       0.93      0.94      0.93       819
weighted avg       0.96      0.96      0.96       819



# Batch Process

In [None]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import svm

def process_folder(train_path, test_path):
    # Lister les fichiers dans les dossiers
    train_files = os.listdir(train_path)
    test_files = os.listdir(test_path)

    # Création de dictionnaires pour stocker les chemins des fichiers X et y
    train_files_x = {}
    train_files_y = {}
    test_files_x = {}
    test_files_y = {}

    # Organiser les fichiers par type
    for file in train_files:
        if 'X_train' in file:
            train_files_x[file.split('_')[5]] = os.path.join(train_path, file)
            print(train_files_x[file.split('_')[5]])
        elif 'y_train' in file:
            train_files_y[file.split('_')[5]] = os.path.join(train_path, file)

    for file in test_files:
        if 'X_test' in file:
            test_files_x[file.split('_')[5]] = os.path.join(test_path, file)
        elif 'y_test' in file:
            test_files_y[file.split('_')[5]] = os.path.join(test_path, file)

    # Traitement pour chaque paire de fichiers
    for key in train_files_x.keys():
        print(train_files_x.keys())
        if key in train_files_y and key in test_files_x and key in test_files_y:
            # Chargement des données
            X_resampled = pd.read_csv(train_files_x[key])
            y_resampled = pd.read_csv(train_files_y[key])
            X_test = pd.read_csv(test_files_x[key])
            y_test = pd.read_csv(test_files_y[key])

            # Transformation des étiquettes en Series
            y_resampled = y_resampled.iloc[:, -1]
            y_test = y_test.iloc[:, -1]

            # Entraînement RandomForest
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_resampled, y_resampled)
            y_pred_rf = rf.predict(X_test)
            conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
            class_report_rf = classification_report(y_test, y_pred_rf)

            # Entraînement SVM
            model_svm = svm.SVC(kernel='linear')
            model_svm.fit(X_resampled, y_resampled)
            y_pred_svm = model_svm.predict(X_test)
            conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
            class_report_svm = classification_report(y_test, y_pred_svm)

            # Affichage des résultats
            print(f"Résultats pour {key}:")
            print("RandomForest - Matrice de confusion:\n", conf_matrix_rf)
            print("RandomForest - Rapport de classification:\n", class_report_rf)
            print("SVM - Matrice de confusion:\n", conf_matrix_svm)
            print("SVM - Rapport de classification:\n", class_report_svm)

# Exemple d'appel à la fonction
# process_folder('chemin/vers/train', 'chemin/vers/test')
process_folder('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/train/', 'C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/test/')
process_folder('C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_optimise/train/', 'C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_optimise/test/')


In [None]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import svm

def train_and_evaluate_models(train_path, test_path):
    # Liste tous les fichiers dans le répertoire d'entraînement et de test
    train_files = os.listdir(train_path)
    test_files = os.listdir(test_path)
    
    # Dictionnaire pour stocker les résultats des évaluations
    results = {}
    
    # Parcours de tous les fichiers d'entraînement
    for file in train_files:
        if file.startswith('X_train'):
            # Identifier le nom de base pour associer X et y
            base_name = file.split('_X_train')[0]
            y_file = f"{base_name}_y_train.csv"
            
            # Chemins complets des fichiers X et y
            x_train_path = os.path.join(train_path, file)
            y_train_path = os.path.join(train_path, y_file)
            
            # Chargement des données d'entraînement
            X_train = pd.read_csv(x_train_path)
            y_train = pd.read_csv(y_train_path).iloc[:, -1]  # Supposant que la colonne cible est la dernière
            
            # Charger les données de test correspondantes
            x_test_file = f"{base_name}_X_test.csv"
            y_test_file = f"{base_name}_y_test.csv"
            x_test_path = os.path.join(test_path, x_test_file)
            y_test_path = os.path.join(test_path, y_test_file)
            
            X_test = pd.read_csv(x_test_path)
            y_test = pd.read_csv(y_test_path).iloc[:, -1]
            
            # Initialisation et entraînement de RandomForestClassifier
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_train, y_train)
            
            # Prédiction et évaluation pour RandomForest
            y_pred_rf = rf.predict(X_test)
            conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
            class_report_rf = classification_report(y_test, y_pred_rf)
            
            # Initialisation et entraînement de SVM
            svc = svm.SVC(kernel='linear')
            svc.fit(X_train, y_train)
            
            # Prédiction et évaluation pour SVM
            y_pred_svc = svc.predict(X_test)
            conf_matrix_svc = confusion_matrix(y_test, y_pred_svc)
            class_report_svc = classification_report(y_test, y_pred_svc)
            
            # Stockage des résultats
            results[base_name] = {
                'RandomForest': {
                    'Confusion Matrix': conf_matrix_rf,
                    'Classification Report': class_report_rf
                },
                'SVM': {
                    'Confusion Matrix': conf_matrix_svc,
                    'Classification Report': class_report_svc
                }
            }
    
    return results

# Commented out function call to adhere to GPT guideline of not executing code
# train_and_evaluate_models('train_folder_path', 'test_folder_path')


In [24]:
def load_train(train_folder):
    data_dict = {}
    for file in os.listdir(train_folder):
        file_path = os.path.join(train_folder, file)
        # Extract a common identifier by stripping off 'X_train_' or 'y_train_' and everything after '_over'
        if file.startswith('X_train') or file.startswith('y_train'):
            identifier = file.split('_over')[0].replace('X_train_', '').replace('y_train_', '')
            print("Processing:", identifier)
            if identifier not in data_dict:
                data_dict[identifier] = {}
            if 'X_train' in file:
                print(f"Ajout de X_train {identifier}")
                data_dict[identifier]['X_train'] = pd.read_csv(file_path)
                
            elif 'y_train' in file:
                print(f"Ajout de y_train {identifier}")
                data_dict[identifier]['y_train'] = pd.read_csv(file_path).squeeze()  # Convert DataFrame to Series directly
                
    print("Data loaded for identifiers:", list(data_dict.keys()))
    return data_dict

def load_test(test_folder):
    data_dict = {}
    for file in os.listdir(test_folder):
        file_path = os.path.join(test_folder, file)
        # Extract a common identifier by stripping off 'X_train_' or 'y_train_' and everything after '_over'
        if file.startswith('X_test') or file.startswith('y_test'):
            identifier = file.split('_over')[0].replace('X_test_', '').replace('y_test_', '')
            print("Processing:", identifier)
            if identifier not in data_dict:
                data_dict[identifier] = {}
            if 'X_test' in file:
                print(f"Ajout de X_test {identifier}")
                data_dict[identifier]['X_test'] = pd.read_csv(file_path)
            elif 'y_test' in file:
                print(f"Ajout de X_test {identifier}")
                data_dict[identifier]['y_test'] = pd.read_csv(file_path).squeeze()  # Convert DataFrame to Series directly
    print("Data loaded for identifiers:", list(data_dict.keys()))
    return data_dict

# from collections import defaultdict

# def group_data(data_dicts):
#     grouped_data = defaultdict(dict)
#     for data_dict in data_dicts:
#         for key, value in data_dict.items():
#             grouped_data[key].update(value)
#     return dict(grouped_data)


# def train_models(data_dict):
#     results = {}
#     for identifier, data in data_dict.items():
#         print("Training models for:", identifier)
#         X_train = data['X_train']
#         y_train = data['y_train']
#         X_test = data['X_test']
#         y_test = data['y_test']
#         # Standardize the data
#         scaler = StandardScaler()
#         X_train_scaled = scaler.fit_transform(X_train)
#         X_test_scaled = scaler.transform(X_test)
#         # Initialize models
#         models = {
#             "RandomForest": RandomForestClassifier(),
#             "SVM": SVC()
#         }
#         results[identifier] = {}
#         for model_name, model in models.items():
#             print(f"Training {model_name}...")
#             model.fit(X_train_scaled, y_train)
#             y_pred = model.predict(X_test_scaled)
#             accuracy = accuracy_score(y_test, y_pred)
#             conf_matrix = confusion_matrix(y_test, y_pred)
#             report = classification_report(y_test, y_pred)
#             results[identifier][model_name] = {
#                 "accuracy": accuracy,
#                 "confusion_matrix": conf_matrix,
#                 "classification_report": report
#             }
#     return results


In [25]:
base_train_path = "C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/train"
base_test_path = "C:/Users/antho/Documents/MEMOIRE_M2/c3d_audeline/ON_OFF_all_features_final_over100/test"

train_data = load_train(base_train_path)
test_data = load_test(base_test_path)

Processing: A_P_1956-02-21_OFF_OFF_all_extraction_features
Ajout de X_train A_P_1956-02-21_OFF_OFF_all_extraction_features
Processing: L_J_1942-01-17_OFF_OFF_all_extraction_features
Ajout de X_train L_J_1942-01-17_OFF_OFF_all_extraction_features
Processing: A_P_1956-02-21_OFF_OFF_all_extraction_features
Ajout de y_train A_P_1956-02-21_OFF_OFF_all_extraction_features
Processing: L_J_1942-01-17_OFF_OFF_all_extraction_features
Ajout de y_train L_J_1942-01-17_OFF_OFF_all_extraction_features
Data loaded for identifiers: ['A_P_1956-02-21_OFF_OFF_all_extraction_features', 'L_J_1942-01-17_OFF_OFF_all_extraction_features']
Processing: A_P_1956-02-21_OFF_OFF_all_extraction_features
Ajout de X_test A_P_1956-02-21_OFF_OFF_all_extraction_features
Processing: L_J_1942-01-17_OFF_OFF_all_extraction_features
Ajout de X_test L_J_1942-01-17_OFF_OFF_all_extraction_features
Processing: A_P_1956-02-21_OFF_OFF_all_extraction_features
Ajout de X_test A_P_1956-02-21_OFF_OFF_all_extraction_features
Processing: 

In [30]:
from collections import ChainMap
def merge_dicts(*dicts):
    merged_dict = {}
    for d in dicts:
        for key, value in d.items():
            if key not in merged_dict:
                merged_dict[key] = {}
            merged_dict[key].update(value)
    return merged_dict

train_merged = merge_dicts(train_data, test_data)
print(train_merged)

{'A_P_1956-02-21_OFF_OFF_all_extraction_features': {'X_train':       Foot_Left_ACC_X_Mean_Temporal  Foot_Left_ACC_X_Ecart_Type_Temporal  \
0                          1.547405                             0.938144   
1                          0.811107                            -2.841605   
2                          0.038937                            -1.857406   
3                          0.537374                             1.245338   
4                          0.097774                             0.732755   
...                             ...                                  ...   
3043                      -0.641692                             0.475596   
3044                      -1.054681                             0.764137   
3045                      -0.207475                             0.712694   
3046                       0.404954                            -1.107628   
3047                       1.498389                            -1.583578   

      Foot_Left_ACC_X_Va