In [1]:
# imports
import numpy as np
import os

from numpy.random import default_rng
# stabilité du notebook d'une exécution à l'autre
random=default_rng(42) 

# jolies figures directement dans le notebook
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# où sauver les figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "workflowDS"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID) # le dossier doit exister

## Import csv

In [2]:
import pandas as pd

#Récupération des données
general_data = pd.read_csv('Donnees/general_data.csv')
employee_survey_data = pd.read_csv('Donnees/employee_survey_data.csv')
manager_survey_data = pd.read_csv('Donnees/manager_survey_data.csv')
in_time = pd.read_csv('Donnees/in_time.csv')
out_time = pd.read_csv('Donnees/out_time.csv')

## Combinaison DataFrame

In [3]:
#préparation du fichier in_time
def rename_column(dataset, name, position):
    tbl = dataset.columns.to_list()
    tbl[position] = name
    dataset.columns = tbl

rename_column(in_time, 'EmployeeID', 0)

#calculer les absences de chaque employé
def get_absence(dataset):
    absence = []
    for i in range(len(dataset)):
        absence.append(dataset.iloc[i].isna().sum())
    return absence

absence = get_absence(in_time)

# ajouter la colonne absence au dataset in_time
in_time['absence'] = absence
in_time

#fusion des données
general = pd.merge(employee_survey_data, general_data, on='EmployeeID', how='inner')
general_manager = pd.merge(general, manager_survey_data, on='EmployeeID', how='inner')
general_employee_manager = pd.merge(general_manager, in_time[['EmployeeID', 'absence']], on='EmployeeID')

#suppression des lignes avec des valeurs manquantes
#general_employee_manager=general_employee_manager.dropna().reset_index(drop=True)

#suppression des colonnes inutiles
general_employee_manager = general_employee_manager.drop('Over18', axis=1)
general_employee_manager = general_employee_manager.drop('EmployeeID', axis=1)
#general_data = general_data.drop(['Education', 'EducationField', 'Over18', 'TrainingTimesLastYear'], axis=1)

#affichage des données
general_employee_manager

Unnamed: 0,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,absence
0,3.0,4.0,2.0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,...,8,0,1.0,6,1,0,0,3,3,29
1,3.0,2.0,4.0,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,...,8,1,6.0,3,5,1,4,2,4,25
2,2.0,2.0,1.0,32,No,Travel_Frequently,Research & Development,17,4,Other,...,8,3,5.0,2,5,0,3,3,3,19
3,4.0,4.0,3.0,38,No,Non-Travel,Research & Development,2,5,Life Sciences,...,8,3,13.0,5,8,7,5,2,3,26
4,4.0,1.0,3.0,32,No,Travel_Rarely,Research & Development,10,1,Medical,...,8,2,9.0,2,6,0,4,3,3,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,4.0,1.0,3.0,42,No,Travel_Rarely,Research & Development,5,4,Medical,...,8,1,10.0,5,3,0,2,3,3,18
4406,4.0,4.0,3.0,29,No,Travel_Rarely,Research & Development,2,4,Medical,...,8,0,10.0,2,3,0,2,2,3,20
4407,1.0,3.0,3.0,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,...,8,0,5.0,4,4,1,2,3,4,30
4408,4.0,1.0,3.0,42,No,Travel_Rarely,Sales,18,2,Medical,...,8,1,10.0,2,9,7,8,2,3,20


In [4]:
#affichage des données manquantes
general_employee_manager.isna().sum()

EnvironmentSatisfaction    25
JobSatisfaction            20
WorkLifeBalance            38
Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
JobInvolvement              0
PerformanceRating           0
absence                     0
dtype: int64

## Version Thib

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(general_employee_manager, general_employee_manager["Attrition"]):
    stratified_train_set = general_employee_manager.loc[train_index]
    stratified_test_set = general_employee_manager.loc[test_index]

general_employee_manager_labels = stratified_train_set["Attrition"].copy()

for set_ in (stratified_train_set, stratified_test_set):
    set_.drop("Attrition", axis=1, inplace=True)

general_employee_manager = stratified_train_set.copy()

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier 
# max_samples: maximum size 0.5=50% of each sample taken from the full dataset
# max_features: maximum of features 1=100% taken here all 10K 
# n_estimators: number of decision trees

#Utilisation de l'arbre de décision
bg=BaggingClassifier(DecisionTreeClassifier(),max_samples=0.5,max_features=1.0,n_estimators=10)
bg.fit(X_train, y_train)

#affichage de la précision
print("score on test: " + str(bg.score(X_test, y_test)))
print("score on train: "+ str(bg.score(X_train, y_train)))

score on test: 0.92015503875969
score on train: 0.9754152823920266


In [None]:
from sklearn.ensemble import RandomForestClassifier
# n_estimators = number of decision trees

#uttilisation de la forêt aléatoire
rf = RandomForestClassifier(n_estimators=30, max_depth=9)
rf.fit(X_train, y_train)

#affichage de la précision
print("score on test: " + str(rf.score(X_test, y_test)))
print("score on train: "+ str(rf.score(X_train, y_train)))

score on test: 0.8953488372093024
score on train: 0.9332225913621263


## Pipeline

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

#remplacer les valeurs manquantes
def remplacer_valeurs_nulles(df):
    for columns in df.columns:
        i = df[columns].isna().sum()
        if i > 0:
        # Vérifier le type de la colonne
            if df[columns].dtype == 'object':
                # Si la colonne est de type "object" (string), remplacer les valeurs nulles par la valeur la plus fréquente
                valeur_frequente = df[columns].mode()[0]
                df[columns].fillna(valeur_frequente, inplace=True)
            else:
                # Si la colonne est de type numérique, remplacer les valeurs nulles par la moyenne
                moyenne = np.mean(df[columns])
                df[columns].fillna(moyenne, inplace=True)

remplacer_valeurs_nulles(general_employee_manager)

#création du pipeline
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()), #normalisation des données
    ])

#récupération des données numériques
general_employer_manager_num = general_employee_manager.select_dtypes(include=[np.number])
num_attribs = list(general_employer_manager_num)

#récupération des données catégorielles
cat_attribs = ["BusinessTravel", "Department", "EducationField","Gender","JobRole","MaritalStatus", ]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

general_employee_manager_prepared = full_pipeline.fit_transform(general_employee_manager)

general_employee_manager_prepared

array([[-0.66056485,  0.26539858,  0.34042198, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.2594514 ,  1.17131648,  0.34042198, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.2594514 , -0.64051933,  0.34042198, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-1.58058111, -0.64051933,  0.34042198, ...,  0.        ,
         1.        ,  0.        ],
       [-1.58058111,  0.26539858, -2.52534192, ...,  0.        ,
         0.        ,  1.        ],
       [-1.58058111, -0.64051933,  0.34042198, ...,  0.        ,
         1.        ,  0.        ]])

## Test

In [7]:
from sklearn.model_selection import train_test_split

#suppression des colonnes inutiles
general_employee_manager = general_employee_manager.drop('BusinessTravel', axis=1)
general_employee_manager = general_employee_manager.drop('Department', axis=1)
general_employee_manager = general_employee_manager.drop('EducationField', axis=1)
general_employee_manager = general_employee_manager.drop('Gender', axis=1)
general_employee_manager = general_employee_manager.drop('JobRole', axis=1)
general_employee_manager = general_employee_manager.drop('MaritalStatus', axis=1)

#création des données d'entrainement et de test
X_train, X_test, y_train, y_test = train_test_split(general_employee_manager.drop('Attrition',axis=1), 
                                                    general_employee_manager['Attrition'], test_size=0.30, 
                                                    random_state=101)

KeyError: "['Attrition'] not found in axis"

In [None]:
from sklearn.linear_model import LogisticRegression

#Utilisation de la régression logistique
lr=LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

#affichage de la précision
print("score on test: " + str(lr.score(X_test, y_test)))
print("score on train: "+ str(lr.score(X_train, y_train)))

score on test: 0.8302325581395349
score on train: 0.8418604651162791
