# Attrition Prediction IA Project Yoann

## Import

In [232]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import zipfile

# Import sklearn

from sklearn.compose import make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

## Chargement et exploration des données

In [233]:
# Path to the dataset
src_path = "src/"
general_data_file_path = os.path.join(src_path, "general_data.csv")
employee_survey_data_file_path = os.path.join(src_path, "employee_survey_data.csv")
manager_survey_data_file_path = os.path.join(src_path, "manager_survey_data.csv")
in_time_file_path = os.path.join(src_path + "in_time.csv")
out_time_file_path = os.path.join(src_path + "out_time.csv")

# Load the dataset
general_data = pd.read_csv(general_data_file_path)
employee_survey_data = pd.read_csv(employee_survey_data_file_path)
manager_survey_data = pd.read_csv(manager_survey_data_file_path)
in_time_data = pd.read_csv(os.path.join(src_path, "in_time.csv"))
out_time_data = pd.read_csv(os.path.join(src_path, "out_time.csv"))

## Correction des noms de colonnes

Nous constatons que la première colonne des jeux de données in_time_data et out_time_data est nommée Unnamed: 0. Nous allons la renommer en EmployeeID pour une meilleure lisibilité.

In [234]:
# Replace Unnamed header by EmployeeID
in_time_data.rename(columns={"Unnamed: 0": "EmployeeID"}, inplace=True)
out_time_data.rename(columns={"Unnamed: 0": "EmployeeID"}, inplace=True)

## Calcul de travail journalier en heure

In [235]:
# Création de copie avec uniquement des inputs utilisable en ignorant la colonne EmployeeID

in_time_data_copy = in_time_data.iloc[:, 1:]
out_time_data_copy = out_time_data.iloc[:, 1:]

# Transformation en format datetime

in_time_data_copy = in_time_data_copy.apply(pd.to_datetime, errors='coerce')
out_time_data_copy = out_time_data_copy.apply(pd.to_datetime, errors='coerce')

# Calcul du temps de travail

time_work_data = out_time_data_copy - in_time_data_copy

# Optionnel : Conversion en heures
time_work_data_in_hours = time_work_data.map(lambda x: x.total_seconds() / 3600 if pd.notnull(x) else None)

# Affichage des résultats

time_work_data_in_hours.head()

Unnamed: 0,2015-01-01,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,2015-01-14,...,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,,7.208333,7.189722,7.410833,7.006667,7.289722,7.484444,7.262778,7.831111,,...,,7.339167,7.395833,6.504722,7.596389,,7.773889,7.315,7.778889,7.080278
1,,8.109167,7.454722,,7.396944,7.416667,7.150833,7.611389,7.278889,,...,7.903056,7.753889,7.712222,7.435556,,,7.614722,7.9825,7.986111,8.227222
2,,6.6925,7.265556,6.405278,6.765,7.345,6.861389,7.418611,6.999722,,...,6.785833,7.163611,6.801667,6.730278,6.849722,,7.023889,7.438889,7.538889,6.786389
3,,7.338333,7.291944,6.943056,6.919444,6.850833,7.193056,6.998611,7.306389,,...,7.629167,6.846667,7.326389,7.413611,7.085,,7.447222,7.416667,7.366389,7.133056
4,,8.055556,7.988056,7.6825,7.806111,7.662222,7.721667,8.365,8.257222,,...,7.903611,7.665,7.9575,7.786944,8.249444,,7.662222,8.268611,7.953333,8.018056


In [236]:
# Calculs pour chaque employé
summary_work_data = pd.DataFrame()
summary_work_data['EmployeeID'] = in_time_data['EmployeeID']

# Moyenne d'heures travaillées par jour
summary_work_data['Avg_Hours_Per_Day'] = time_work_data_in_hours.mean(axis=1, skipna=True)

# Nombre de jours travaillés dans l'année
summary_work_data['Days_Worked'] = time_work_data_in_hours.notnull().sum(axis=1)

# Total d'heures travaillées dans l'année
summary_work_data['Total_Hours_Worked'] = summary_work_data['Avg_Hours_Per_Day'] * summary_work_data['Days_Worked']

# Affichage des résultats
summary_work_data.head()

Unnamed: 0,EmployeeID,Avg_Hours_Per_Day,Days_Worked,Total_Hours_Worked
0,1,7.373651,232,1710.686944
1,2,7.718969,236,1821.676667
2,3,7.01324,242,1697.204167
3,4,7.193678,235,1690.514444
4,5,8.006175,245,1961.512778


## Création du dataframe merged_data

Nous allons maintenant créer un dataframe qui contient toute les données des autres dataframes afin de manipuler plus facilement les données et de créer plus facilement des dataframe contenant uniquement les données que nous voulons tester.

In [237]:
# Ajout de prefix pour différencier les heures de sortie et celle d'entrée

in_time_data.columns = [
    f"In_{col}" if col != "EmployeeID" else col
    for col in in_time_data.columns
]

out_time_data.columns = [
    f"Out_{col}" if col != "EmployeeID" else col
    for col in out_time_data.columns
]

in_time_data.head()

Unnamed: 0,EmployeeID,In_2015-01-01,In_2015-01-02,In_2015-01-05,In_2015-01-06,In_2015-01-07,In_2015-01-08,In_2015-01-09,In_2015-01-12,In_2015-01-13,...,In_2015-12-18,In_2015-12-21,In_2015-12-22,In_2015-12-23,In_2015-12-24,In_2015-12-25,In_2015-12-28,In_2015-12-29,In_2015-12-30,In_2015-12-31
0,1,,2015-01-02 09:43:45,2015-01-05 10:08:48,2015-01-06 09:54:26,2015-01-07 09:34:31,2015-01-08 09:51:09,2015-01-09 10:09:25,2015-01-12 09:42:53,2015-01-13 10:13:06,...,,2015-12-21 09:55:29,2015-12-22 10:04:06,2015-12-23 10:14:27,2015-12-24 10:11:35,,2015-12-28 10:13:41,2015-12-29 10:03:36,2015-12-30 09:54:12,2015-12-31 10:12:44
1,2,,2015-01-02 10:15:44,2015-01-05 10:21:05,,2015-01-07 09:45:17,2015-01-08 10:09:04,2015-01-09 09:43:26,2015-01-12 10:00:07,2015-01-13 10:43:29,...,2015-12-18 10:37:17,2015-12-21 09:49:02,2015-12-22 10:33:51,2015-12-23 10:12:10,,,2015-12-28 09:31:45,2015-12-29 09:55:49,2015-12-30 10:32:25,2015-12-31 09:27:20
2,3,,2015-01-02 10:17:41,2015-01-05 09:50:50,2015-01-06 10:14:13,2015-01-07 09:47:27,2015-01-08 10:03:40,2015-01-09 10:05:49,2015-01-12 10:03:47,2015-01-13 10:21:26,...,2015-12-18 10:15:14,2015-12-21 10:10:28,2015-12-22 09:44:44,2015-12-23 10:15:54,2015-12-24 10:07:26,,2015-12-28 09:42:05,2015-12-29 09:43:36,2015-12-30 09:34:05,2015-12-31 10:28:39
3,4,,2015-01-02 10:05:06,2015-01-05 09:56:32,2015-01-06 10:11:07,2015-01-07 09:37:30,2015-01-08 10:02:08,2015-01-09 10:08:12,2015-01-12 10:13:42,2015-01-13 09:53:22,...,2015-12-18 10:17:38,2015-12-21 09:58:21,2015-12-22 10:04:25,2015-12-23 10:11:46,2015-12-24 09:43:15,,2015-12-28 09:52:44,2015-12-29 09:33:16,2015-12-30 10:18:12,2015-12-31 10:01:15
4,5,,2015-01-02 10:28:17,2015-01-05 09:49:58,2015-01-06 09:45:28,2015-01-07 09:49:37,2015-01-08 10:19:44,2015-01-09 10:00:50,2015-01-12 10:29:27,2015-01-13 09:59:32,...,2015-12-18 09:58:35,2015-12-21 10:03:41,2015-12-22 10:10:30,2015-12-23 10:13:36,2015-12-24 09:44:24,,2015-12-28 10:05:15,2015-12-29 10:30:53,2015-12-30 09:18:21,2015-12-31 09:41:09


In [238]:
# set into a dictionary
data = {
    'general_data': general_data,
    'employee_survey_data': employee_survey_data,
    'manager_survey_data': manager_survey_data,
    'summary_work_data': summary_work_data,
    'in_time_data': in_time_data,
    'out_time_data': out_time_data
}

# Fusion de tous les DataFrames sur la clé "EmployeeID"
merged_data = data['general_data']
for key, df in data.items():
    if key != 'general_data':
        merged_data = pd.merge(merged_data, df, on='EmployeeID', how='inner')

merged_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,Out_2015-12-18,Out_2015-12-21,Out_2015-12-22,Out_2015-12-23,Out_2015-12-24,Out_2015-12-25,Out_2015-12-28,Out_2015-12-29,Out_2015-12-30,Out_2015-12-31
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,,2015-12-21 17:15:50,2015-12-22 17:27:51,2015-12-23 16:44:44,2015-12-24 17:47:22,,2015-12-28 18:00:07,2015-12-29 17:22:30,2015-12-30 17:40:56,2015-12-31 17:17:33
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,2015-12-18 18:31:28,2015-12-21 17:34:16,2015-12-22 18:16:35,2015-12-23 17:38:18,,,2015-12-28 17:08:38,2015-12-29 17:54:46,2015-12-30 18:31:35,2015-12-31 17:40:58
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,2015-12-18 17:02:23,2015-12-21 17:20:17,2015-12-22 16:32:50,2015-12-23 16:59:43,2015-12-24 16:58:25,,2015-12-28 16:43:31,2015-12-29 17:09:56,2015-12-30 17:06:25,2015-12-31 17:15:50
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,2015-12-18 17:55:23,2015-12-21 16:49:09,2015-12-22 17:24:00,2015-12-23 17:36:35,2015-12-24 16:48:21,,2015-12-28 17:19:34,2015-12-29 16:58:16,2015-12-30 17:40:11,2015-12-31 17:09:14
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,2015-12-18 17:52:48,2015-12-21 17:43:35,2015-12-22 18:07:57,2015-12-23 18:00:49,2015-12-24 17:59:22,,2015-12-28 17:44:59,2015-12-29 18:47:00,2015-12-30 17:15:33,2015-12-31 17:42:14


In [239]:
# Vérifier la taille pour s'assurer qu'il y a bien le bon nombre d'employés
print(f"Nombre de lignes dans le DataFrame fusionné : {merged_data.shape[0]}")

Nombre de lignes dans le DataFrame fusionné : 4410


### Création du fichier csv qui contient le dataframe merged_data

In [240]:
#merged_data.to_csv('./src/merged_data.csv')

## Définition des features et de la target

In [241]:
y = merged_data['Attrition']
X = merged_data.drop('Attrition', axis=1)

## Nettoyage des données

In [242]:
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

numerical_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

X = pipeline.fit_transform(X)

print(X.shape)

(4410, 1556824)


## Preprocessing

### Encodage

In [243]:
encoder = OrdinalEncoder()
encoder.fit_transform(numerical_features)

ValueError: Expected 2D array, got scalar array instead:
array=<sklearn.compose._column_transformer.make_column_selector object at 0x00000275D7D0D2D0>.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

### Normalisation