# 0. Context

L’entreprise pharmaceutique HumanForYou, située en Inde, emploie environ 4 000 personnes mais doit faire face à un taux de turn-over annuel de 15%, impactant son fonctionnement de plusieurs manières :

- Retards de projets nuisibles à la réputation de l’entreprise auprès des clients et partenaires.
- Charge importante sur les ressources humaines, nécessitant un service conséquent pour recruter de nouveaux talents.
- Perte de productivité en raison du temps nécessaire à la formation et à l’intégration des nouveaux employés.
- L’objectif est de comprendre les facteurs clés influençant ce taux de turn-over et de proposer des modèles prédictifs et des stratégies concrètes pour réduire ce phénomène.

Les étapes du projet : Données brutes > Données structurées (catégorielles & numériques) > Input > Deep learning / AI > Produit (ŷ)

# I. Environment setup

In [15]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import nbformat
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import warnings

# Install a global "ignore" filter: no Python warning is displayed from here on.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above — confirm that silencing everything is intended.
warnings.filterwarnings('ignore')

# II. Data loading

In [16]:
# Folder that holds the raw CSV exports (relative to the working directory)
csv_path = "./csv/"

# Read every source table into its own DataFrame
in_time = pd.read_csv(f"{csv_path}in_time.csv")
out_time = pd.read_csv(f"{csv_path}out_time.csv")
general_data = pd.read_csv(f"{csv_path}general_data.csv")
manager_survey_data = pd.read_csv(f"{csv_path}manager_survey_data.csv")
employee_survey_data = pd.read_csv(f"{csv_path}employee_survey_data.csv")

# III. Data mixing

In [None]:
# Time mixing: build `total_time`, one row per employee with
#   - EmployeeID : identifier read from the first column of in_time
#   - hrs        : mean daily presence in hours (clock-out minus clock-in)
#
# in_time and out_time are expected to share the same layout: first column is
# the identifier, every other column is one calendar day, rows in the same
# order in both files.

# Days removed from the average.
# NOTE(review): presumably company-wide days off — confirm against the data.
excluded_days = [
    '2015-01-01', '2015-01-14', '2015-01-26', '2015-03-05',
    '2015-05-01', '2015-07-17', '2015-09-17', '2015-10-02',
    '2015-11-09', '2015-11-10', '2015-11-11', '2015-12-25',
]

# Use the identifier stored in the file rather than the 0-based row position,
# so that the later merge on 'EmployeeID' pairs each employee with their own
# hours.
# NOTE(review): assumes the first column of in_time.csv holds the same
# EmployeeID as the other tables — confirm.
employee_ids = in_time.iloc[:, 0]

# Parse the per-day timestamps; anything unparseable or missing becomes NaT.
clock_in = in_time.iloc[:, 1:].apply(pd.to_datetime, errors='coerce')
clock_out = out_time.iloc[:, 1:].apply(pd.to_datetime, errors='coerce')

# Both frames are aligned on row position and day column, so a direct
# subtraction gives the time spent at work without relying on a hard-coded
# row count.
worked = (clock_out - clock_in).drop(columns=excluded_days)

# A day without a complete clock-in/clock-out pair counts as zero time, so
# absences lower the employee's average.
worked = worked.fillna(pd.Timedelta(0))

# Convert to float hours before averaging: the row-wise mean then runs on
# plain numbers instead of timedeltas.
hours = worked / np.timedelta64(1, 'h')

total_time = pd.DataFrame({
    'EmployeeID': employee_ids.to_numpy(),
    'hrs': hours.mean(axis=1).to_numpy(),
})

total_time.head()

In [None]:
# Build the global frame: start from the general data and inner-join each
# additional table on the employee identifier, in this order.
global_data = general_data
for extra_table in (total_time, manager_survey_data, employee_survey_data):
    global_data = global_data.merge(extra_table, on='EmployeeID', how='inner')

global_data.head()

# IV. Remove unusable data

In [None]:
# Columns holding one single distinct value (NaN counted as a value) carry no
# information: list them first, then drop them one by one.
constant_columns = [
    name for name in global_data.columns
    if global_data[name].nunique(dropna=False) == 1
]
for name in constant_columns:
    global_data.drop(columns=[name], inplace=True)
    print("Deleted : " + name + " (Only one value)")

# Drop the columns excluded on ethical grounds, together with the identifier
ethical_columns = ['Age', 'EmployeeID', 'Gender', 'MaritalStatus']
global_data.drop(columns=ethical_columns, inplace=True)

global_data.head()

# V. Change column type for categorical data

In [None]:
# Print a summary of the frame (column names, dtypes and non-null counts).
global_data.info()

In [None]:
# Columns stored as numbers that are handled as categories from here on:
# cast each of them to the generic 'object' dtype.
coded_columns = [
    'Education',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobSatisfaction',
    'PerformanceRating',
    'JobLevel',
    'WorkLifeBalance',
]
for name in coded_columns:
    global_data[name] = global_data[name].astype('object')

global_data.head()

# VI. Replace data

In [None]:
# Impute missing values: every column that contains NaN is filled with its
# own median.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: an in-place call on a column
# selection is not guaranteed to write through to the parent frame (it never
# does under pandas Copy-on-Write).
# NOTE(review): the median is also applied to the coded survey columns, where
# it can produce a value (e.g. 2.5) that has no label in the mappings below —
# confirm the median is the intended strategy for them.
for col in global_data.columns:
    if global_data[col].isna().any():
        global_data[col] = global_data[col].fillna(global_data[col].median())

# Replace the numeric codes of the categorical columns with readable labels.
# Codes absent from a mapping are left unchanged. Same write-back rule as
# above: assign the replaced column instead of mutating a selection in place.
category_labels = {
    'Education': {1: 'BAC', 2: 'BAC+2', 3: 'BAC+3', 4: 'BAC+5', 5: 'Doctorat'},
    'JobInvolvement': {1: 'Faible', 2: 'Moyenne', 3: 'Importante', 4: 'Très importante'},
    'PerformanceRating': {1: 'Faible', 2: 'Bon', 3: 'Excellent', 4: 'Au delà des attentes'},
    'EnvironmentSatisfaction': {1: 'Faible', 2: 'Moyen', 3: 'Élevé', 4: 'Très élevé'},
    'JobSatisfaction': {1: 'Faible', 2: 'Moyen', 3: 'Élevé', 4: 'Très élevé'},
    'WorkLifeBalance': {1: 'Mauvais', 2: 'Satisfaisant', 3: 'Très satisfaisant', 4: 'Excellent'},
}
for col, labels in category_labels.items():
    global_data[col] = global_data[col].replace(labels)

global_data.head()

# VII. Preparing data for the model

In [None]:
# Collect the object-typed columns, leaving out 'Attrition', and keep their
# names for the one-hot encoding step.
object_columns = global_data.select_dtypes(include='object')
categorical_column_data = object_columns.drop(columns=['Attrition'])
categorical_column_name = categorical_column_data.columns.tolist()
categorical_column_name

In [None]:
# One-hot encode every categorical column, then turn the Yes/No 'Attrition'
# flag into 1/0.
attrition_codes = {'Yes': 1, 'No': 0}
global_data_dummy = pd.get_dummies(data=global_data, columns=categorical_column_name)
global_data_dummy['Attrition'] = global_data_dummy['Attrition'].map(attrition_codes)

global_data_dummy.head()

In [25]:
# Separate the regression target from the features, then hold out 20% of the
# rows for evaluation (fixed seed so the split is reproducible).
# NOTE(review): the target is 'YearsAtCompany' and 'Attrition' stays among the
# features — confirm this is the intended setup.
target_column = 'YearsAtCompany'
X = global_data_dummy.drop(columns=[target_column])
y = global_data_dummy[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# VIII. Model application

In [None]:
# Correlation matrix of the encoded frame, drawn as an annotated heatmap.
# The matrix is computed once and reused for both the mask and the plot.
correlations = global_data_dummy.corr()

# Boolean mask hiding the upper triangle (diagonal included), which only
# mirrors the lower triangle of the symmetric matrix.
upper_triangle = np.triu(np.ones_like(correlations, dtype=bool))

plt.figure(figsize=(60, 40))
sns.heatmap(correlations, mask=upper_triangle, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')

In [None]:
# Feature selection: recursive feature elimination with 5-fold
# cross-validation, removing one feature per step, around a linear regression.
model = LinearRegression()
selector = RFECV(estimator=model, step=1, cv=5)
selector.fit(X_train, y_train)

# Names of the columns the selector decided to keep
selected_features = X_train.columns[selector.support_]
print("Variables sélectionnées :", selected_features)

# Refit the regression on the retained columns only and predict the test set
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]
model.fit(X_train_selected, y_train)
y_test_pred = model.predict(X_test_selected)

# Report the fitted parameters when the estimator exposes them
exposes_linear_params = all(
    hasattr(model, attribute) for attribute in ("intercept_", "coef_")
)
if not exposes_linear_params:
    print("Ce modèle n'expose pas d'intercept ou de coefficients (par exemple, Random Forest).")
else:
    intercept = model.intercept_
    coefficients = model.coef_

    print("Intercept :", intercept)
    print("Coefficients :", coefficients)

# Evaluate on the held-out rows
mse = mean_squared_error(y_test, y_test_pred)
r2 = r2_score(y_test, y_test_pred)

print("MSE :", mse)
print("R² :", r2)

X.head()