In [None]:
import pandas as pd
import numpy as np

# Load the cleaned bank-review dataset from its CSV export.
csv_path = "../csv/scraping_commentaires_4_banques_nettoye.csv"
df = pd.read_csv(csv_path)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Column dtypes / non-null counts, followed by the (rows, columns) shape.
df.info()
print(f"Shape of DataFrame: {df.shape}")

In [None]:
# Count the missing values per column and show them, so the reader can see
# what is about to be discarded.
missing_values = df.isnull().sum()
print(missing_values)

# Drop every row that has at least one missing value (in any column).
df.dropna(inplace=True)

In [None]:
# Importation de librairies
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

In [None]:
# Predictors: the cleaned review text plus four engineered numeric columns.
# Target: the raw rating stored in 'Note'.
feature_columns = ['Avis_nettoyé', 'Longueur_commentaire', 'Nombre_ponctuations',
                   'Nombre_points_exclamation', 'Polarite']
X = df[feature_columns]
y = df['Note']

# 80 % train / 20 % test, shuffled and stratified on the target so both splits
# keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# StandardScaler / CountVectorizer 

In [None]:
# The predictors are handled in two groups: the numeric columns are
# standardised, the free-text column is turned into token counts.
text_feature = 'Avis_nettoyé'
numeric_features = ['Longueur_commentaire', 'Nombre_ponctuations', 'Nombre_points_exclamation', 'Polarite']

# --- Numeric columns -------------------------------------------------------
# The scaler is fitted on the training split only and then re-applied to the
# test split, so no test statistics leak into training.
scaler = StandardScaler()
numeric_features_train_array = scaler.fit_transform(X_train[numeric_features])
numeric_features_test_array = scaler.transform(X_test[numeric_features])

# Wrap the arrays back into DataFrames. No index is passed, so both frames get
# a fresh 0..n-1 index rather than the row labels of X_train / X_test.
numeric_features_train_scaled = pd.DataFrame(numeric_features_train_array, columns=numeric_features)
numeric_features_test_scaled = pd.DataFrame(numeric_features_test_array, columns=numeric_features)

# --- Text column -----------------------------------------------------------
# min_df=50 keeps only the tokens found in at least 50 training documents.
count_vectorizer = CountVectorizer(min_df=50)
text_feature_train_matrix = count_vectorizer.fit_transform(X_train[text_feature])
text_feature_test_matrix = count_vectorizer.transform(X_test[text_feature])

# Densify the sparse count matrices and label the columns with the vocabulary.
vocabulary_columns = count_vectorizer.get_feature_names_out()
text_feature_train_count = pd.DataFrame(text_feature_train_matrix.toarray(), columns=vocabulary_columns)
text_feature_test_count = pd.DataFrame(text_feature_test_matrix.toarray(), columns=vocabulary_columns)

# --- Final design matrices -------------------------------------------------
# Both halves share the same 0..n-1 index, so the join lines rows up one to one.
X_train_scaled_count = numeric_features_train_scaled.join(text_feature_train_count)
X_test_scaled_count = numeric_features_test_scaled.join(text_feature_test_count)

In [None]:
# Report how many columns remain after the min_df filter.
# NOTE(review): the "18.321" baseline is hard-coded in the message - confirm it
# still matches the unfiltered vocabulary size.
n_columns_after_min_df = X_train_scaled_count.shape[1]
print("L'argument \"min_df = [valeur]\" dans CountVectorizer() permet de passer "
      f"de 18.321 colonnes à {n_columns_after_min_df} colonnes.")

## RandomOverSampler

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Random over-sampler: duplicates rows of the under-represented classes.
# The seed is fixed (same value as the train/test split) so the resampled
# training set - and every score derived from it - is reproducible.
ros = RandomOverSampler(random_state=42)

# Resample the training design matrix only; the test split is left untouched.
X_train_ros, y_train_ros = ros.fit_resample(X_train_scaled_count, y_train)

# Class distribution after over-sampling.
print("Distribution des classes après l'application de RandomOverSampler:", Counter(y_train_ros))

## Tester les modèles avec (RandomOverSampler + StandardScaler + CountVectorizer)

### LogisticRegression

In [None]:
%%time

# Hyperparameter grid for the LogisticRegression search: three regularisation
# strengths x two penalties, all with the liblinear solver (which supports
# both "l1" and "l2"; "elasticnet" is not explored).
hyperparameters_LR1 = {"C" : [0.1, 1, 10],
                       "penalty" : ["l2", "l1"],
                       "solver" : ["liblinear"]}
# Other hyperparameters one could explore: solver = "lbfgs",
# class_weight = "balanced", max_iter = 1000, tol, ...

# liblinear shuffles the data internally, so the estimator is seeded to make
# the search reproducible.
clf_LR1 = LogisticRegression(random_state = 42)

# 2-fold grid search on the over-sampled training set.
# NOTE(review): the folds are cut from already over-sampled data, so duplicated
# rows can sit on both sides of a fold and inflate the CV scores - confirm this
# is acceptable or resample inside an imblearn Pipeline.
grid_LR1 = GridSearchCV(estimator = clf_LR1, param_grid = hyperparameters_LR1, cv = 2)
grid_LR1.fit(X_train_ros, y_train_ros)

In [None]:
# Optional: uncomment to inspect the full cross-validation table.
# print("\033[1mRésultats de la GridSearchCV pour la LogisticRegression :\033[0m \n")
# display(pd.DataFrame(grid_LR1.cv_results_))

# Bold heading followed by the winning hyperparameter combination.
print("\033[1mMeilleurs hyperparamètres de la GridSearchCV pour la LogisticRegression :\033[0m \n",
      grid_LR1.best_params_,
      sep = "\n")

In [None]:
# Score the held-out test split with the refitted best LogisticRegression.
y_pred_LR1 = grid_LR1.predict(X_test_scaled_count)

# Per-class precision / recall / F1 under a bold heading.
print("\033[1mRapport de classification de la GridSearchCV pour la LogisticRegression :\033[0m \n",
      classification_report(y_test, y_pred_LR1),
      sep = "\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de la GridSearchCV pour la LogisticRegression :\033[0m \n")
confusion_LR1 = pd.crosstab(y_test, y_pred_LR1,
                            rownames = ['Classes réelles'],
                            colnames = ['Classes prédites'])
display(confusion_LR1)

### SVC

In [None]:
%%time

# Support-vector classifier trained on the over-sampled training set.
clf_SVC1 = SVC(gamma = 'scale').fit(X_train_ros, y_train_ros)

# Echo the fitted estimator as the cell output.
clf_SVC1

In [None]:
# Predict the test split with the SVC trained on over-sampled data.
y_pred_SVC1 = clf_SVC1.predict(X_test_scaled_count)

# Compute both evaluation artefacts first, then print them under their headings.
report_SVC1 = classification_report(y_test, y_pred_SVC1)
confusion_SVC1 = pd.crosstab(y_test, y_pred_SVC1,
                             rownames = ['Classes réelles'],
                             colnames = ['Classes prédites'])

print("\033[1mRapport de classification de SVC :\033[0m \n")
print(report_SVC1)

print("\033[1mMatrice de confusion de SVC :\033[0m \n")
display(confusion_SVC1)

### KNN

In [None]:
pip install --upgrade numpy scikit-learn threadpoolctl

In [None]:
%%time

# Search space for the k-nearest-neighbours classifier: neighbourhood size and
# distance metric.
# NOTE(review): "minkowski" with the default p looks equivalent to "euclidean",
# so two grid entries may be duplicates - confirm.
# Other hyperparameters one could explore: weights = "uniform" or "distance", ...
hyperparameters_KNN1 = {
    "n_neighbors": [3, 5, 7],
    "metric": ["minkowski", "manhattan", "chebyshev", "euclidean"],
}

# 2-fold grid search on the over-sampled training set.
clf_KNN1 = KNeighborsClassifier()
grid_KNN1 = GridSearchCV(clf_KNN1, hyperparameters_KNN1, cv=2)
grid_KNN1.fit(X_train_ros, y_train_ros)

In [None]:
# Optional: uncomment to inspect the full cross-validation table.
# print("\033[1mRésultats de la GridSearchCV du KNeighborsClassifier :\033[0m \n")
# display(pd.DataFrame(grid_KNN1.cv_results_))

# Bold heading followed by the winning hyperparameter combination.
best_params_KNN1 = grid_KNN1.best_params_
print("\033[1mMeilleurs hyperparamètres de la GridSearchCV du KNeighborsClassifier :\033[0m \n")
print(best_params_KNN1)

In [None]:
# Predict with the best KNN found by the grid search. The test features are
# passed as a bare NumPy array (.values) rather than as a DataFrame.
test_matrix_KNN1 = X_test_scaled_count.values
y_pred_KNN1 = grid_KNN1.predict(test_matrix_KNN1)

# Per-class metrics under a bold heading.
print("\033[1mRapport de classification de la GridSearchCV du KNeighborsClassifier :\033[0m \n",
      classification_report(y_test, y_pred_KNN1),
      sep="\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de la GridSearchCV du KNeighborsClassifier :\033[0m \n")
display(pd.crosstab(y_test, y_pred_KNN1,
                    rownames=['Classes réelles'],
                    colnames=['Classes prédites']))

### DecisionTreeClassifier

In [None]:
%%time

# Hyperparameter grid for the decision tree: split criterion and maximum depth
# (None lets the tree grow until its leaves are pure).
hyperparameters_DT1 = {"criterion" : ["gini", "entropy"],
                       "max_depth" : [None, 3, 5, 7]}

# The tree breaks ties between equally good splits at random, so the estimator
# is seeded to make the search and the refitted model reproducible.
clf_DT1 = DecisionTreeClassifier(random_state = 42)

# 2-fold grid search on the over-sampled training set.
grid_DT1 = GridSearchCV(estimator = clf_DT1, param_grid = hyperparameters_DT1, cv = 2)
grid_DT1.fit(X_train_ros, y_train_ros)

In [None]:
# Bold heading followed by the winning hyperparameter combination.
print("\033[1mMeilleurs hyperparamètres de la GridSearchCV du DecisionTreeClassifier :\033[0m \n",
      grid_DT1.best_params_,
      sep = "\n")

In [None]:
# Predict the test split with the best decision tree from the grid search.
y_pred_DT1 = grid_DT1.predict(X_test_scaled_count)

# Compute both evaluation artefacts first, then print them under their headings.
report_DT1 = classification_report(y_test, y_pred_DT1)
confusion_DT1 = pd.crosstab(y_test, y_pred_DT1,
                            rownames = ['Classes réelles'],
                            colnames = ['Classes prédites'])

print("\033[1mRapport de classification de la GridSearchCV du DecisionTreeClassifier :\033[0m \n")
print(report_DT1)

print("\033[1mMatrice de confusion de la GridSearchCV du DecisionTreeClassifier :\033[0m \n")
display(confusion_DT1)

### GradientBoostingClassifier

In [None]:
%%time
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 shallow (depth-2) trees and a learning rate of 0.5,
# trained on the over-sampled training set. Seeded so repeated runs build the
# same ensemble.
clf_GB1 = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.5, max_depth = 2,
                                     random_state = 42)
clf_GB1.fit(X_train_ros, y_train_ros)

In [None]:
# Predict the test split with the gradient-boosting classifier.
y_pred_GB1 = clf_GB1.predict(X_test_scaled_count)

# clf_GB1 is fitted directly (no grid search), so the headings name the
# classifier only.
print("\033[1mRapport de classification du GradientBoostingClassifier :\033[0m \n")
print(classification_report(y_test, y_pred_GB1))

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion du GradientBoostingClassifier :\033[0m \n")
display(pd.crosstab(y_test, y_pred_GB1, rownames = ['Classes réelles'], colnames = ['Classes prédites']))

### MultinomialNB

In [None]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Seeded random over-sampler so the resampled training set is reproducible.
ros = RandomOverSampler(random_state=42)

# Only the token-count columns are resampled here: the scaled numeric columns
# are left out of the Naive Bayes model.
# NOTE(review): presumably because MultinomialNB rejects negative feature
# values, which standardised columns contain - confirm.
X_train_ros_NB, y_train_ros_NB = ros.fit_resample(text_feature_train_count, y_train)

# Class distribution of the labels that were actually resampled in this cell.
print("Distribution des classes après l'application de RandomOverSampler:", Counter(y_train_ros_NB))

# Multinomial Naive Bayes on the over-sampled token counts.
clf_NB1 = MultinomialNB()
clf_NB1.fit(X_train_ros_NB, y_train_ros_NB)

In [None]:
# Predict the test split from its token counts (same columns the model was
# trained on).
y_pred_NB1 = clf_NB1.predict(text_feature_test_count)

# clf_NB1 is fitted directly (no grid search), so the headings name the
# classifier only.
print("\033[1mRapport de classification du MultinomialNB :\033[0m \n")
print(classification_report(y_test, y_pred_NB1))

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion du MultinomialNB :\033[0m \n")
display(pd.crosstab(y_test, y_pred_NB1, rownames = ['Classes réelles'], colnames = ['Classes prédites']))

## RandomUnderSampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Random under-sampler: discards rows of the over-represented classes.
# Seeded so the retained subset - and every score derived from it - is
# reproducible.
rus = RandomUnderSampler(random_state=42)

# Resample the training design matrix only; the test split is left untouched.
X_train_rus, y_train_rus = rus.fit_resample(X_train_scaled_count, y_train)

# Class distribution after under-sampling.
print("Distribution des classes après l'application de RandomUnderSampler:", Counter(y_train_rus))

### LogisticRegression

In [None]:
%%time

# Hyperparameter grid for the LogisticRegression search: three regularisation
# strengths x two penalties, all with the liblinear solver.
hyperparameters_LR2 = {
    "C": [0.1, 1, 10],
    "penalty": ["l2", "l1"],
    "solver": ["liblinear"]
}

# liblinear shuffles the data internally, so the estimator is seeded to make
# the search reproducible.
clf_LR2 = LogisticRegression(random_state=42)

# 2-fold grid search on the under-sampled training set.
grid_LR2 = GridSearchCV(estimator=clf_LR2, param_grid=hyperparameters_LR2, cv=2)
grid_LR2.fit(X_train_rus, y_train_rus)

In [None]:
# Predict the test split with the best LogisticRegression trained on
# under-sampled data.
y_pred_LR2 = grid_LR2.predict(X_test_scaled_count)

# Per-class metrics under a bold heading.
print("\033[1mRapport de classification de la GridSearchCV pour la LogisticRegression :\033[0m \n",
      classification_report(y_test, y_pred_LR2),
      sep = "\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de la GridSearchCV pour la LogisticRegression :\033[0m \n")
confusion_LR2 = pd.crosstab(y_test, y_pred_LR2,
                            rownames = ['Classes réelles'],
                            colnames = ['Classes prédites'])
display(confusion_LR2)

### SVC

In [None]:
%%time

# Support-vector classifier trained on the under-sampled training set.
clf_SVC2 = SVC(gamma = 'scale').fit(X_train_rus, y_train_rus)

# Echo the fitted estimator as the cell output.
clf_SVC2

In [None]:
# Predict the test split with the SVC trained on under-sampled data.
y_pred_SVC2 = clf_SVC2.predict(X_test_scaled_count)

# Compute both evaluation artefacts first, then print them under their headings.
report_SVC2 = classification_report(y_test, y_pred_SVC2)
confusion_SVC2 = pd.crosstab(y_test, y_pred_SVC2,
                             rownames = ['Classes réelles'],
                             colnames = ['Classes prédites'])

print("\033[1mRapport de classification de SVC :\033[0m \n")
print(report_SVC2)

print("\033[1mMatrice de confusion de SVC :\033[0m \n")
display(confusion_SVC2)

## SMOTE (Synthetic Minority Over-sampling Technique)

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE with default settings apart from a fixed seed, so the synthetic
# minority samples are the same on every run.
smote = SMOTE(random_state=42)

# Resample the training design matrix only; the test split is left untouched.
# NOTE(review): SMOTE interpolates between rows, so the synthetic token
# "counts" are presumably no longer integers - confirm this is acceptable.
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled_count, y_train)

# Class distribution after SMOTE.
print("Distribution des classes après l'application de SMOTE:", Counter(y_train_smote))

In [None]:
%%time

# Hyperparameter grid for the LogisticRegression search: three regularisation
# strengths x two penalties, all with the liblinear solver.
hyperparameters_LR3 = {
    "C": [0.1, 1, 10],
    "penalty": ["l2", "l1"],
    "solver": ["liblinear"]
}

# liblinear shuffles the data internally, so the estimator is seeded to make
# the search reproducible.
clf_LR3 = LogisticRegression(random_state=42)

# 2-fold grid search on the SMOTE-resampled training set.
grid_LR3 = GridSearchCV(estimator=clf_LR3, param_grid=hyperparameters_LR3, cv=2)
grid_LR3.fit(X_train_smote, y_train_smote)

In [None]:
# Predict the test split with the best LogisticRegression trained on
# SMOTE-resampled data.
y_pred_LR3 = grid_LR3.predict(X_test_scaled_count)

# Compute both evaluation artefacts first, then print them under their headings.
report_LR3 = classification_report(y_test, y_pred_LR3)
confusion_LR3 = pd.crosstab(y_test, y_pred_LR3,
                            rownames = ['Classes réelles'],
                            colnames = ['Classes prédites'])

print("\033[1mRapport de classification de la GridSearchCV pour la LogisticRegression :\033[0m \n")
print(report_LR3)

print("\033[1mMatrice de confusion de la GridSearchCV pour la LogisticRegression :\033[0m \n")
display(confusion_LR3)

# Regrouper la variable cible

In [None]:
# Number of reviews per rating value, listed in rating order.
note_counts = df['Note'].value_counts()
note_distribution = note_counts.sort_index()
print(note_distribution)

In [None]:
# Binary sentiment categories derived from the rating.
def categorize_rating(rating):
    """Collapse a rating into a binary sentiment label.

    Returns 1 (positive) when ``rating`` equals 4 or 5, and 0 (negative) for
    every other value.
    """
    return 1 if rating in (4, 5) else 0

# Derive the binary target by running categorize_rating on every rating.
df['Note_cat'] = df['Note'].apply(categorize_rating)

# Class balance of the new binary target.
note_cat_distribution = df['Note_cat'].value_counts()
print(note_cat_distribution)

In [None]:
# Same predictors as for the 5-class task; the target is now the binary
# 'Note_cat' label. X and y are rebound here.
feature_columns_cat = ['Avis_nettoyé', 'Longueur_commentaire', 'Nombre_ponctuations',
                       'Nombre_points_exclamation', 'Polarite']
X = df[feature_columns_cat]
y = df['Note_cat']

# 80 % train / 20 % test, shuffled and stratified on the binary target; the
# seed makes the split repeatable.
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# The predictors are handled in two groups: the numeric columns are
# standardised, the free-text column is turned into token counts.
text_feature_cat = 'Avis_nettoyé'
numeric_features_cat = ['Longueur_commentaire', 'Nombre_ponctuations', 'Nombre_points_exclamation', 'Polarite']

# Output column names for the scaled numeric block: original name + "_cat".
scaled_column_names_cat = [f"{feature}_cat" for feature in numeric_features_cat]

# --- Numeric columns -------------------------------------------------------
# The scaler is fitted on the training split only and then re-applied to the
# test split.
scaler_cat = StandardScaler()
numeric_features_train_array_cat = scaler_cat.fit_transform(X_train_cat[numeric_features_cat])
numeric_features_test_array_cat = scaler_cat.transform(X_test_cat[numeric_features_cat])

# No index is passed, so both frames get a fresh 0..n-1 index.
numeric_features_train_scaled_cat = pd.DataFrame(numeric_features_train_array_cat,
                                                 columns=scaled_column_names_cat)
numeric_features_test_scaled_cat = pd.DataFrame(numeric_features_test_array_cat,
                                                columns=list(scaled_column_names_cat))

# --- Text column -----------------------------------------------------------
# min_df=50 keeps only the tokens found in at least 50 training documents.
count_vectorizer_cat = CountVectorizer(min_df=50)
text_feature_train_matrix_cat = count_vectorizer_cat.fit_transform(X_train_cat[text_feature_cat])
text_feature_test_matrix_cat = count_vectorizer_cat.transform(X_test_cat[text_feature_cat])

# Densify the sparse count matrices and label the columns with the vocabulary.
vocabulary_columns_cat = count_vectorizer_cat.get_feature_names_out()
text_feature_train_count_cat = pd.DataFrame(text_feature_train_matrix_cat.toarray(),
                                            columns=vocabulary_columns_cat)
text_feature_test_count_cat = pd.DataFrame(text_feature_test_matrix_cat.toarray(),
                                           columns=vocabulary_columns_cat)

# --- Final design matrices -------------------------------------------------
# Both halves share the same 0..n-1 index, so the join lines rows up one to one.
X_train_scaled_count_cat = numeric_features_train_scaled_cat.join(text_feature_train_count_cat)
X_test_scaled_count_cat = numeric_features_test_scaled_cat.join(text_feature_test_count_cat)

In [None]:
# Report how many columns remain after the min_df filter (binary task).
# NOTE(review): the "18.321" baseline is hard-coded in the message - confirm it
# still matches the unfiltered vocabulary size.
n_columns_after_min_df_cat = X_train_scaled_count_cat.shape[1]
print("L'argument \"min_df = [valeur]\" dans CountVectorizer() permet de passer "
      f"de 18.321 colonnes à {n_columns_after_min_df_cat} colonnes.")

### LogisticRegression

In [None]:
%%time

# Hyperparameter grid for the LogisticRegression search: three regularisation
# strengths x two penalties, all with the liblinear solver ("elasticnet" is
# not explored).
hyperparameters_LR = {"C" : [0.1, 1, 10],
                       "penalty" : ["l2", "l1"],
                       "solver" : ["liblinear"]}

# liblinear shuffles the data internally, so the estimator is seeded to make
# the search reproducible.
clf_LR_cat = LogisticRegression(random_state = 42)

# 2-fold grid search on the binary-target training set (no resampling here).
grid_LR_cat = GridSearchCV(estimator = clf_LR_cat, param_grid = hyperparameters_LR, cv = 2)
grid_LR_cat.fit(X_train_scaled_count_cat, y_train_cat)

# Predict the test split with the refitted best estimator.
y_pred_LR_cat = grid_LR_cat.predict(X_test_scaled_count_cat)

print("\033[1mRapport de classification de la GridSearchCV pour la LogisticRegression :\033[0m \n")
print(classification_report(y_test_cat, y_pred_LR_cat))

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de la GridSearchCV pour la LogisticRegression :\033[0m \n")
display(pd.crosstab(y_test_cat, y_pred_LR_cat, rownames=['Classes réelles'], colnames=['Classes prédites']))

### SVC

In [None]:
%%time

# Support-vector classifier on the binary target, fitted and then used to
# predict the test split.
clf_SVC_cat = SVC(gamma='scale')
clf_SVC_cat.fit(X_train_scaled_count_cat, y_train_cat)
y_pred_SVC_cat = clf_SVC_cat.predict(X_test_scaled_count_cat)

# Per-class metrics under a bold heading.
print("\033[1mRapport de classification de SVC :\033[0m \n",
      classification_report(y_test_cat, y_pred_SVC_cat),
      sep="\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de SVC :\033[0m \n")
confusion_SVC_cat = pd.crosstab(y_test_cat, y_pred_SVC_cat,
                                rownames=['Classes réelles'],
                                colnames=['Classes prédites'])
display(confusion_SVC_cat)

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# k-nearest-neighbours with default settings on the binary target.
clf_KNN_cat = KNeighborsClassifier()

# Features and labels are passed as bare NumPy arrays (.values) for both
# fitting and prediction.
train_matrix_KNN_cat = X_train_scaled_count_cat.values
test_matrix_KNN_cat = X_test_scaled_count_cat.values
clf_KNN_cat.fit(train_matrix_KNN_cat, y_train_cat.values)
y_pred_KNN_cat = clf_KNN_cat.predict(test_matrix_KNN_cat)

# Per-class metrics under a bold heading.
print("\033[1mRapport de classification du KNeighborsClassifier :\033[0m \n",
      classification_report(y_test_cat, y_pred_KNN_cat),
      sep="\n")

### DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter grid for the decision tree: split criterion and maximum depth
# (None lets the tree grow until its leaves are pure).
hyperparameters_DT_cat = {"criterion": ["gini", "entropy"],
                       "max_depth": [None, 3, 5, 7]}

# The tree breaks ties between equally good splits at random, so the estimator
# is seeded to make the search and the refitted model reproducible.
clf_DT_cat = DecisionTreeClassifier(random_state=42)

# 2-fold grid search on the binary-target training set.
grid_DT_cat = GridSearchCV(estimator=clf_DT_cat, param_grid=hyperparameters_DT_cat, cv=2)
grid_DT_cat.fit(X_train_scaled_count_cat, y_train_cat)

# Predict the test split with the refitted best tree.
y_pred_DT_cat = grid_DT_cat.predict(X_test_scaled_count_cat)

print("\033[1mRapport de classification de la GridSearchCV du DecisionTreeClassifier :\033[0m \n")
print(classification_report(y_test_cat, y_pred_DT_cat))

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion de la GridSearchCV du DecisionTreeClassifier :\033[0m \n")
display(pd.crosstab(y_test_cat, y_pred_DT_cat, rownames=['Classes réelles'], colnames=['Classes prédites']))

### GradientBoostingClassifier

In [None]:
%%time
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 50 shallow (depth-2) trees and a learning rate of 0.5
# on the binary target. Seeded so repeated runs build the same ensemble.
clf_GB_cat = GradientBoostingClassifier(n_estimators=50, learning_rate=0.5, max_depth=2,
                                        random_state=42)
clf_GB_cat.fit(X_train_scaled_count_cat, y_train_cat)

# Predict the test split.
y_pred_GB_cat = clf_GB_cat.predict(X_test_scaled_count_cat)

# clf_GB_cat is fitted directly (no grid search), so the headings name the
# classifier only.
print("\033[1mRapport de classification du GradientBoostingClassifier :\033[0m \n")
print(classification_report(y_test_cat, y_pred_GB_cat))

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion du GradientBoostingClassifier :\033[0m \n")
display(pd.crosstab(y_test_cat, y_pred_GB_cat, rownames=['Classes réelles'], colnames=['Classes prédites']))

### MultinomialNB

In [None]:
#%%time

#clf_NB1_cat = MultinomialNB()
#clf_NB1_cat.fit(text_feature_train_count_cat, y_train_cat)

In [None]:
# Prédiction du MultinomialNB
#y_pred_NB1_cat = clf_NB1_cat.predict(text_feature_test_count_cat)

#print("\033[1mRapport de classification de la GridSearchCV du MultinomialNB :\033[0m \n")
#print(classification_report(y_test_cat, y_pred_NB1_cat))

#print("\033[1mMatrice de confusion de la GridSearchCV du MultinomialNB :\033[0m \n")
#display(pd.crosstab(y_test_cat, y_pred_NB1_cat, rownames = ['Classes réelles'], colnames = ['Classes prédites']))

## Analyse des coefficients : 

In [None]:
# Pull the fitted weights out of the best binary LogisticRegression: coef_ has
# one row for the binary problem, hence the [0].
best_lr_cat = grid_LR_cat.best_estimator_
coefficients = best_lr_cat.coef_[0]
feature_names = X_train_scaled_count_cat.columns

# One row per feature, ordered from the most positive to the most negative
# coefficient.
coefficients_df = (
    pd.DataFrame({'Variable': feature_names, 'Coefficient': coefficients})
    .sort_values(by='Coefficient', ascending=False)
)

# Plain-text dump of the table.
print(coefficients_df)

In [None]:
import matplotlib.pyplot as plt

# Keep the 10 strongest coefficients on each side of zero. nlargest/nsmallest
# pick them by value, so the selection does not depend on how coefficients_df
# happens to be sorted (slicing a descending frame with tail/head on each sign
# would return the coefficients closest to zero instead).
positive_coefficients = coefficients_df[coefficients_df['Coefficient'] > 0].nlargest(10, 'Coefficient')
negative_coefficients = coefficients_df[coefficients_df['Coefficient'] < 0].nsmallest(10, 'Coefficient')

# Ascending order: barh draws the first row at the bottom, so the most negative
# coefficient ends up at the bottom and the most positive at the top.
top_coefficients = pd.concat([positive_coefficients, negative_coefficients]).sort_values(by='Coefficient')

# Horizontal bar chart: red for negative weights, blue for positive ones.
plt.figure(figsize=(12, 8))
colors = ['red' if c < 0 else 'blue' for c in top_coefficients['Coefficient']]
plt.barh(top_coefficients['Variable'], top_coefficients['Coefficient'], color=colors)
plt.xlabel('Coefficient')
plt.ylabel('Variable')
plt.title('Top 10 des Coefficients de la Régression Logistique')
plt.axvline(x=0, color='black', linewidth=0.5)

# Annotate each bar with its rounded value; bar i sits at y position i.
for i, v in enumerate(top_coefficients['Coefficient']):
    plt.text(v, i, str(round(v, 2)), color='black', va='center')
plt.show()

## Deep Learning "Note" de 1 à 5

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Input width = number of columns of the 5-class design matrix.
inputs = Input(shape = (X_train_scaled_count.shape[1],), name = "Input")

# Three tanh hidden layers of decreasing width (10, 8, 6), created in order,
# followed by a 5-unit softmax output layer.
dense1, dense2, dense3 = (
    Dense(units = width, activation = "tanh", name = f"Dense_{position}")
    for position, width in enumerate((10, 8, 6), start = 1)
)
dense4 = Dense(units = 5, activation = "softmax", name = "Dense_4")

# Wire the layers sequentially with the functional API.
x = inputs
for hidden_layer in (dense1, dense2, dense3):
    x = hidden_layer(x)
outputs = dense4(x)

model = Model(inputs = inputs, outputs = outputs)
model.summary()

model.compile(optimizer = "adam",
              loss = "sparse_categorical_crossentropy",
              metrics = ["accuracy"])

# Shift the labels down by one for training; the +1 applied to the predictions
# below undoes this shift.
y_train_dl = y_train - 1

# 40 epochs, mini-batches of 512, 20 % of the training data held out for
# validation.
model.fit(X_train_scaled_count, y_train_dl, epochs = 40, batch_size = 512, validation_split = 0.2)

# Class probabilities -> most likely class index -> back to the original
# label scale.
y_pred_prob = model.predict(X_test_scaled_count)
y_pred_class = y_pred_prob.argmax(axis = 1) + 1

# Classification report under a bold heading, followed by blank lines.
print("\033[1mRapport de classification du model :\033[0m \n",
      classification_report(y_test, y_pred_class),
      "\n",
      sep = "\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion du model :\033[0m \n")
display(pd.crosstab(y_test, y_pred_class, rownames = ['Classes réelles'], colnames = ['Classes prédites']))


## Deep Learning "Note" 2 classes 

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report, confusion_matrix

# Input width = number of columns of the binary-task design matrix.
inputs = Input(shape=(X_train_scaled_count_cat.shape[1],), name="Input")

# Three tanh hidden layers of decreasing width (10, 8, 6), created in order,
# followed by a single sigmoid output unit.
dense1, dense2, dense3 = (
    Dense(units=width, activation="tanh", name=f"Dense_{position}")
    for position, width in enumerate((10, 8, 6), start=1)
)
dense4 = Dense(units=1, activation="sigmoid", name="Dense_4")

# Wire the layers sequentially with the functional API.
x = inputs
for hidden_layer in (dense1, dense2, dense3):
    x = hidden_layer(x)
outputs = dense4(x)

# `model` is rebound here to the binary network.
model = Model(inputs=inputs, outputs=outputs)
model.summary()

model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

# 40 epochs, mini-batches of 512, 20 % of the training data held out for
# validation.
model.fit(X_train_scaled_count_cat, y_train_cat, epochs=40, batch_size=512, validation_split=0.2)

# Threshold the predicted probabilities at 0.5 and flatten the result to one
# dimension.
y_pred_prob_cat = model.predict(X_test_scaled_count_cat)
y_pred_class_cat = (y_pred_prob_cat > 0.5).astype(int).flatten()

# Classification report under a bold heading, followed by blank lines.
print("\033[1mClassification Report:\033[0m \n",
      classification_report(y_test_cat, y_pred_class_cat),
      "\n",
      sep="\n")

# Confusion matrix: true classes in rows, predicted classes in columns.
print("\033[1mMatrice de confusion du model :\033[0m \n")
display(pd.crosstab(y_test_cat, y_pred_class_cat, rownames = ['Classes réelles'], colnames = ['Classes prédites']))