In [40]:
# Import Libs

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Normalizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error, root_mean_squared_error
import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd 
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, Perceptron, LinearRegression
from sklearn.ensemble import RandomForestClassifier


# Load the data
filename = './AI_Project_Data/cleaned_data.csv'
employee_data = pd.read_csv(filename)

# Displaying data.head() to see the first 5 rows of the data
employee_data.head()

Unnamed: 0,Age,Attrition,DistanceFromHome,Education,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,...,MaritalStatus_Married,MaritalStatus_Single,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,mean_worked_hours,total_worked_hours,worked_days
0,51,False,6,2,1,1,131160,1,11,0,...,1,0,3,4,2,3,3,7.373651,1710.686944,232
1,31,True,10,1,2,1,41890,0,23,1,...,0,1,3,2,4,2,4,7.718969,1821.676667,236
2,32,False,17,4,3,4,193280,1,15,3,...,1,0,2,2,1,3,3,7.01324,1697.204167,242
3,38,False,2,5,4,3,83210,3,11,3,...,1,0,4,4,3,2,3,7.193678,1690.514444,235
4,32,False,10,1,5,1,23420,4,12,2,...,0,1,4,1,3,3,3,8.006175,1961.512778,245


In [41]:
from sklearn.pipeline import Pipeline

# Create a pipeline for the numerical data
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

num_attribs = employee_data.select_dtypes(include=[np.number])
cat_attribs = employee_data.drop(num_attribs, axis=1)

num_data = numerical_pipeline.fit_transform(num_attribs)
employee_data = pd.concat([pd.DataFrame(num_data, columns=num_attribs.columns), cat_attribs], axis=1)

employee_data.head()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,...,MaritalStatus_Single,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,mean_worked_hours,total_worked_hours,worked_days,Attrition
0,1.541369,-0.393938,-0.891688,-1.731658,-0.961486,1.405136,-0.678464,-1.150554,-0.932014,-1.320847,...,-0.685565,0.2522,1.156302,-1.085336,0.379672,-0.42623,-0.244123,-0.33378,-0.775066,False
1,-0.648668,0.099639,-1.868426,-1.730873,-0.961486,-0.491661,-1.079486,2.129306,0.241988,-0.678145,...,1.45865,0.2522,-0.663899,1.757749,-1.026167,2.346151,0.013564,0.001208,-0.04821,True
2,-0.539166,0.963398,1.061787,-1.730087,1.74961,2.725053,-0.678464,-0.057267,2.589994,-0.806686,...,-0.685565,-0.66546,-0.663899,-2.506879,0.379672,-0.42623,-0.513073,-0.374474,1.042074,False
3,0.117845,-0.887515,2.038524,-1.729302,0.845911,0.386301,0.12358,-1.150554,2.589994,0.221637,...,-0.685565,1.169861,1.156302,0.336206,-1.026167,-0.42623,-0.378424,-0.394665,-0.229924,False
4,-0.539166,0.099639,-1.868426,-1.728516,-0.961486,-0.884109,0.524602,-0.877232,1.415991,-0.292524,...,1.45865,1.169861,-1.573999,0.336206,0.379672,-0.42623,0.227886,0.423261,1.587216,False


# Predict Attrition values

In [42]:
from sklearn.model_selection import StratifiedShuffleSplit
employee_df = employee_data.copy()

y = employee_df["Attrition"]
X = employee_df.drop(columns=["Attrition"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) # TODO : shuffle ? cross validation
split = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)

models = {
    "LogisticRegression": LogisticRegression(random_state=42),
    "SVC": SVC(probability=True, random_state=42),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "NaiveBayes": GaussianNB(),
    "RandomForest": RandomForestClassifier(random_state=42), 
    "Perceptron": Perceptron(random_state=42),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42),
}

In [43]:
# Initialisation des dictionnaires pour stocker les scores
scores = []
predictions = {}
metrics = {
    'precision': {name: [] for name in models.keys()},
    'recall': {name: [] for name in models.keys()},
    'f1': {name: [] for name in models.keys()},
    'auc': {name: [] for name in models.keys()},
    'accuracy': {name: [] for name in models.keys()}
}

In [44]:
# Évaluation des modèles
for train_index, test_index in split.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    for name, model in models.items():
        # Prédictions
        model.fit(X_train, y_train)
        predictions[name] = model.predict(X_test)    
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        
        # Calcul des métriques
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba) if y_proba is not None else np.nan
        accuracy = accuracy_score(y_test, y_pred)
        
        # Stockage des métriques
        metrics['precision'][name].append(precision)
        metrics['recall'][name].append(recall)
        metrics['f1'][name].append(f1)
        metrics['auc'][name].append(auc)
        metrics['accuracy'][name].append(accuracy)
        # Ajout des scores dans le DataFrame final
        scores.append({
            'Model': name,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1,
            'Accuracy': accuracy,
            'AUC': auc
        })
        

scores_df = pd.DataFrame(scores)
scores_df_mean = scores_df.groupby('Model').mean()
scores_df_std = scores_df.groupby('Model').std()
scores_df_mean = scores_df_mean.sort_values(['F1 Score', 'Precision', 'Accuracy', 'Recall'], ascending=False) # TODO : change order
scores_df_std = scores_df_std.sort_values(['F1 Score', 'Precision', 'Accuracy', 'Recall'], ascending=True) # TODO : change order

print('Mean Scores')
print(scores_df_mean)
print('_'*50)
print('Standard Deviation')
print(scores_df_std)

Mean Scores
                        Precision    Recall  F1 Score  Accuracy       AUC
Model                                                                    
RandomForest             1.000000  0.892958  0.943084  0.982766  0.998539
DecisionTreeClassifier   0.925981  0.908451  0.916931  0.973469  0.947198
SVC                      0.939498  0.542254  0.687158  0.920635  0.940581
KNeighborsClassifier     0.562715  0.366197  0.443489  0.851927  0.906291
NaiveBayes               0.427518  0.418310  0.421449  0.814739  0.739398
LogisticRegression       0.589011  0.232394  0.332717  0.850340  0.802187
Perceptron               0.344597  0.283099  0.303800  0.796372       NaN
__________________________________________________
Standard Deviation
                        Precision    Recall  F1 Score  Accuracy       AUC
Model                                                                    
KNeighborsClassifier     0.037907  0.014085  0.020641  0.007417  0.009059
DecisionTreeClassifier   0.029

In [45]:
"""
predictions = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    predictions[model_name] = model.predict(X_test)    
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        precision, recall, _ = precision_recall_curve(y_test, y_proba)
        plt.plot(recall, precision, label=f"{model_name}")

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend()
plt.show()

for name, y_pred in predictions.items():
    cm = confusion_matrix(y_test, y_pred)
    plt.figure()
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title(f"{name} Matrice de Confusion :")
    plt.show()


plt.figure(figsize=(10, 8)) 
for name, model in models.items():
    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_proba)
        auc = roc_auc_score(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{name} (AUC = {auc:.2f})")

plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

scores = []
for name, y_pred in predictions.items():
    scores.append({
        'Model': name,
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred),
    })

scores_df = pd.DataFrame(scores)
print(scores_df)
print('_'*50)
print('Sorted Dataframe')
print('_'*50)
scores_df = scores_df.sort_values(['F1 Score', 'Precision', 'Accuracy', 'Recall'], ascending=False) # TODO : change order
print(scores_df)
"""

'\npredictions = {}\nfor model_name, model in models.items():\n    print(f"Training {model_name}...")\n    model.fit(X_train, y_train)\n    predictions[model_name] = model.predict(X_test)    \n    if hasattr(model, "predict_proba"):\n        y_proba = model.predict_proba(X_test)[:, 1]\n        precision, recall, _ = precision_recall_curve(y_test, y_proba)\n        plt.plot(recall, precision, label=f"{model_name}")\n\nplt.xlabel(\'Recall\')\nplt.ylabel(\'Precision\')\nplt.title(\'Precision-Recall Curve\')\nplt.legend()\nplt.show()\n\nfor name, y_pred in predictions.items():\n    cm = confusion_matrix(y_test, y_pred)\n    plt.figure()\n    sns.heatmap(cm, annot=True, fmt="d")\n    plt.title(f"{name} Matrice de Confusion :")\n    plt.show()\n\n\nplt.figure(figsize=(10, 8)) \nfor name, model in models.items():\n    if hasattr(model, "predict_proba"):\n        y_proba = model.predict_proba(X_test)[:, 1]\n        fpr, tpr, _ = roc_curve(y_test, y_proba)\n        auc = roc_auc_score(y_test, y

## Optimize model

full params grids : 

svc_params_grid = {
    # general parameters
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'class_weight': [None, 'balanced'],
    'decision_function_shape': ['ovo', 'ovr'],
    'random_state': [42],
    'shrinking': [True, False],
    'probability': [True],
    'cache_size': [200, 500, 1000],
    'verbose': [False],
    
    # specific parameters for the 'rbf' and 'sigmoid' kernels
    'gamma': ['scale', 'auto', 0.0001, 0.001, 0.01, 0.1, 1],
    
    # Specific polyinomial kernel parameters
    'degree': [2, 3, 4, 5],
    'coef0': [0.0, 0.1, 0.5, 1.0],
    
    # tolerance and maximum number of iterations
    'tol': [1e-5, 1e-4, 1e-3],
    'max_iter': [-1, 1000, 2000, 5000]
}


rf_params_grid = {
    # Paramètres de la forêt
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    
    # Paramètres des arbres individuels
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_leaf_nodes': [None, 50, 100, 200],
    
    # Paramètres de randomisation
    'random_state': [42],
    
    # Paramètres pour données déséquilibrées
    'class_weight': [None, 'balanced', 'balanced_subsample'],
    
    # Paramètres de parallélisation
    'n_jobs': [-1],
    
    # Paramètres de critère de split
    'criterion': ['gini', 'entropy', 'log_loss'],
    
    # Paramètres de régularisation
    'max_samples': [0.5, 0.7, 0.9, None],
    'min_impurity_decrease': [0.0, 0.01, 0.1]
}


In [46]:
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

svc_params_grid = {
    # Most important parameters
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto', 0.01],
    
    # Basic settings
    'class_weight': [None, 'balanced'],
    'random_state': [42],
    'probability': [True]
}

# creating model using GridSearchCV
SVC_model = GridSearchCV(
    estimator=SVC(probability=True),
    param_grid=svc_params_grid,
    cv=5,
    n_jobs=-1,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
    refit='accuracy',
    verbose=0,
    error_score='raise',
    return_train_score=True
)

# Définition d'une grille simplifiée de paramètres
rf_params_grid = {
    # Paramètres essentiels
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2'],
    
    # Paramètres de base
    'random_state': [42],
    'n_jobs': [-1],
    'class_weight': [None, 'balanced']
}

# Configuration du GridSearchCV avec des options avancées
RandomForest_model = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_params_grid,
    cv=5,  # Validation croisée à 5 plis
    n_jobs=-1,  # Utilise tous les cœurs disponibles
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
    refit='accuracy',  # Réentraîne sur la meilleure métrique accuracy
    verbose=2,
    return_train_score=True,
    error_score='raise'
)

# model fitting
RandomForest_model.fit(X_train, y_train)
SVC_model.fit(X_train, y_train)

# getting best parameters
best_rf_model = RandomForest_model.best_estimator_
best_svc_model = SVC_model.best_estimator_

# displaying results
print("\nMeilleurs paramètres trouvés :")
print(f"\nRandomForest {RandomForest_model.best_params_}")
print(f"\nSVC {SVC_model.best_params_}")
print("\nMeilleur score de validation croisée:")
print(f"\nRandomForest {RandomForest_model.best_score_}")
print(f"\nSVC {SVC_model.best_score_}")


# displaying scores for metrics
rf_results = pd.DataFrame(RandomForest_model.cv_results_)
svc_results = pd.DataFrame(SVC_model.cv_results_)
print("\nRésultats détaillés pour la meilleure configuration:")
metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']
for metric in metrics:
    mean_score = rf_results[f'mean_test_{metric}'].iloc[rf_results['rank_test_accuracy'].argmin()]
    std_score = rf_results[f'std_test_{metric}'].iloc[rf_results['rank_test_accuracy'].argmin()]
    print(f"RandomForest - {metric}: {mean_score:.3f} (+/- {std_score*2:.3f})")
    
    mean_score = svc_results[f'mean_test_{metric}'].iloc[svc_results['rank_test_accuracy'].argmin()]
    std_score = svc_results[f'std_test_{metric}'].iloc[svc_results['rank_test_accuracy'].argmin()]
    print(f"SVC - {metric}: {mean_score:.3f} (+/- {std_score*2:.3f})")

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.5s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=100, n_jobs=-1, random_state=42; total time=   0.4s
[CV] END class_weight=None, max_depth=None, max_features=sqrt, min_samples_split=2, n_estimators=200, n_jobs=-1, random_state=42; total time=   0.8s
[CV] END class_weight=None, max_depth=None, 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize


Meilleurs paramètres trouvés :

RandomForest {'class_weight': None, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200, 'n_jobs': -1, 'random_state': 42}

SVC {'C': 10, 'class_weight': None, 'gamma': 'scale', 'kernel': 'rbf', 'probability': True, 'random_state': 42}

Meilleur score de validation croisée:

RandomForest 0.9668382456351837

SVC 0.9736403270849658

Résultats détaillés pour la meilleure configuration:
RandomForest - accuracy: 0.967 (+/- 0.011)
SVC - accuracy: 0.974 (+/- 0.010)
RandomForest - f1: 0.887 (+/- 0.045)
SVC - f1: 0.916 (+/- 0.034)
RandomForest - precision: 0.980 (+/- 0.040)
SVC - precision: 0.941 (+/- 0.034)
RandomForest - recall: 0.812 (+/- 0.089)
SVC - recall: 0.893 (+/- 0.064)
RandomForest - roc_auc: 0.990 (+/- 0.009)
SVC - roc_auc: 0.978 (+/- 0.033)


In [49]:
from sklearn.ensemble import VotingClassifier

classifier = VotingClassifier(
    estimators=[
        ('RandomForest', best_rf_model),
        ('SVC', best_svc_model), 
        ('NaivesBayes', GaussianNB())
    ],
)

predictions = {}
metrics = {
    'precision': [],
    'recall': [],
    'f1': [],
    'accuracy': []
}
scores = []

for train_index, test_index in split.split(X, y):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    metrics['precision'].append(precision)
    metrics['recall'].append(recall)
    metrics['f1'].append(f1)
    metrics['accuracy'].append(accuracy)
    
    scores.append({
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'Accuracy': accuracy
    })
    
scores_df = pd.DataFrame(scores)
print(scores_df.mean())
print(scores_df.std())

    


Precision    0.996970
Recall       0.907042
F1 Score     0.949372
Accuracy     0.984580
dtype: float64
Precision    0.006776
Recall       0.045476
F1 Score     0.025432
Accuracy     0.007286
dtype: float64
