# Machine Learning and IA Orange Summer Challenge 2024

## Préparation et Exploration des Données

### Importing necessary libraries

In [None]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import scipy.stats as sc
from scipy.stats import pearsonr
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from tabulate import tabulate
from sklearn.model_selection import train_test_split
import seaborn as sns
%matplotlib inline

### Loading Dataset

In [None]:
# Load the employee-attrition CSV from the local ./data directory into `df`,
# the DataFrame that every later cell reads and mutates.
df = pd.read_csv("./data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the raw data.
df.info()

### Display of descriptive statistics

In [None]:
# Describing Numerical Values
# (count, mean, std, min, quartiles and max for each numeric column)
df.describe()

D'après la description numérique ci-dessus, les colonnes telles que l'âge, le taux journalier et l'éducation ont une distribution équilibrée des valeurs. Il n'y a pas d'asymétrie significative (valeurs extrêmes ou données très élevées/faibles).


In [None]:
# Flag YearsAtCompany outliers with Tukey's 1.5 * IQR rule.
tenure = df['YearsAtCompany']
q1, q3 = tenure.quantile(0.25), tenure.quantile(0.75)
iqr = q3 - q1

# Fences: anything outside [lower_bound, upper_bound] counts as an outlier.
lower_bound, upper_bound = q1 - (1.5 * iqr), q3 + (1.5 * iqr)

# Show the rows whose tenure falls outside the fences.
is_tenure_outlier = (tenure < lower_bound) | (tenure > upper_bound)
df[is_tenure_outlier]

Toutefois, les colonnes présentant de grandes différences entre la moyenne et la médiane, telles que TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion et YearsWithCurrManager, présentent des valeurs aberrantes, comme l'illustre le code ci-dessus pour YearsAtCompany.

In [None]:
# Describing Categorical Values
# (count, number of unique values, most frequent value and its frequency
# for each object-dtype column)
df.describe(include = 'object')

From the categorical description, we can conclude points such as:

Most of the values in Attrition are No, meaning most employees did not resign
Most employees are Sales Executives
There are more Male employees compared to Female employees
Most employees are Married

In [None]:
# Check for Null Data: number of missing values per column.
df.isnull().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()
# Author's reading of the output: there are no duplicated rows in the dataset.

In [None]:
# Missing values per column. `isna` and `isnull` are the same check in pandas,
# so this repeats the null count computed two cells earlier.
df.isna().sum()
# Author's reading of the output: there are no null values.

This means the dataset has no null values.

## EDA

### Préparation et prétraitement des données

### Data Cleaning

In [None]:
# One box plot per integer column, to eyeball the spread and outliers.
ColsBox = df.select_dtypes('int64')
for col in ColsBox:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(df[col], ax=ax)
    ax.set_title('box plot of ' + col)
    plt.show()

# Drop Outlier Datas
# Each column gets its own Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR).
# The previous version reused the fences computed for YearsAtCompany for every
# column, which is only correct for YearsAtCompany itself.
outlier_cols = [
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
quartiles = df[outlier_cols].quantile([0.25, 0.75])
fence_width = 1.5 * (quartiles.loc[0.75] - quartiles.loc[0.25])
lower_fences = quartiles.loc[0.25] - fence_width
upper_fences = quartiles.loc[0.75] + fence_width

# All fences are computed on the frame as it is before any row is removed, and
# a row is dropped if ANY of the listed columns falls outside its own fences.
# NOTE: re-running this cell recomputes the fences on the already-filtered
# frame and may drop more rows; use Restart & Run All for reproducible results.
tenure_values = df[outlier_cols]
outlier_rows = (
    tenure_values.lt(lower_fences, axis='columns')
    | tenure_values.gt(upper_fences, axis='columns')
).any(axis='columns')
df.drop(index=df.index[outlier_rows], inplace=True)

Additionally, columns 'EmployeeCount', 'Over18', 'StandardHours' contain only one unique value (all rows have the same values), which will not be useful later on. These columns will also be dropped, together with 'EmployeeNumber', which is an employee identifier rather than a predictive feature.

In [None]:
# Remove columns that carry no signal for modelling: the single-valued
# EmployeeCount / Over18 / StandardHours, plus EmployeeNumber (presumably a
# per-employee identifier rather than a constant column; verify against the data).
df.drop(
    columns=['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'],
    inplace=True,
)

### Data Transformation

In [None]:
# Encode the two Yes/No columns (Attrition, OverTime) as 1/0 integers.

mapping = {"Yes": 1, "No": 0}
for yes_no_col in ("Attrition", "OverTime"):
    df[yes_no_col] = df[yes_no_col].replace(mapping).astype("int64")

In [None]:
# BusinessTravel -> integer code, ordered from no travel to frequent travel.
mapping2 = {
    "Non-Travel": 0,
    "Travel_Rarely": 1,
    "Travel_Frequently": 2,
}
df["BusinessTravel"] = df["BusinessTravel"].replace(mapping2).astype("int64")

In [None]:
# Department -> integer code. The numbers are arbitrary labels: the categories
# have no natural order.
mapping3 = {
    "Research & Development": 0,
    "Sales": 1,
    "Human Resources": 2,
}
df["Department"] = df["Department"].replace(mapping3).astype("int64")

In [None]:
# EducationField -> integer code (arbitrary labels, no implied order).
mapping4 = {
    "Life Sciences": 0,
    "Medical": 1,
    "Marketing": 2,
    "Technical Degree": 3,
    "Human Resources": 4,
    "Other": 5,
}
df["EducationField"] = df["EducationField"].replace(mapping4).astype("int64")

In [None]:
# Gender -> binary integer flag (Male = 1, Female = 0).
mapping5 = {"Male": 1, "Female": 0}
df["Gender"] = df["Gender"].replace(mapping5).astype("int64")

In [None]:
# JobRole -> integer code (arbitrary labels, no implied order).
mapping6 = {
    "Human Resources": 8,
    "Manager": 7,
    "Healthcare Representative": 6,
    "Manufacturing Director": 5,
    "Laboratory Technician": 4,
    "Sales Representative": 3,
    "Sales Executive": 2,
    "Research Director": 1,
    "Research Scientist": 0,
}
df["JobRole"] = df["JobRole"].replace(mapping6).astype("int64")

In [None]:
# MaritalStatus -> integer code (arbitrary labels, no implied order).
mapping7 = {
    "Divorced": 2,
    "Married": 1,
    "Single": 0,
}
df["MaritalStatus"] = df["MaritalStatus"].replace(mapping7).astype("int64")

In [None]:
# Preview the frame after the categorical columns were replaced by integer codes.
df.head()

In [None]:
# Confirm the dtypes after encoding (the encoded columns were cast to int64).
df.info()

### Data Visualization

Which columns have the top 5 highest correlations with each other?
What factor influences employee's rate of attrition the most?

In [None]:
# Pairwise correlation between all columns of the encoded frame.
df.corr()

In [None]:
# Heatmap restricted to the columns most positively correlated with Attrition.

# Keep the 10 columns with the largest correlation to Attrition
# (Attrition itself is among them, with correlation 1).
corr_matrix = df.corr()
col = corr_matrix.nlargest(10, "Attrition")["Attrition"].index

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    df[col].corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    annot_kws={"size": 15},
    ax=ax,
)

In [None]:
# Annotated heatmap of the correlation matrix over every column.
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(
    df.corr(),
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    annot_kws={"size": 15},
    ax=ax,
)

Top 5 Highest Correlations

- JobLevel is strongly correlated with MonthlyIncome (0.88)
- YearsInCurrentRole is strongly correlated with YearsAtCompany (0.86)
- YearsAtCompany is strongly correlated with YearsWithCurrManager (0.85)
- PerformanceRating is strongly correlated with PercentSalaryHike (0.77)
- YearsInCurrentRole is strongly correlated with YearsWithCurrManager (0.74)

## Attrition predictions

To predict Attrition we should use a classification model, because this is a binary variable.

1. LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, precision_recall_curve

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
features = df.drop(columns='Attrition')
target = df['Attrition']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=33
)

# 'lbfgs' is only the optimisation solver; any regularisation comes from the
# estimator's penalty setting, not from the solver choice. max_iter is raised,
# presumably so the solver converges on the unscaled features.
logreg = LogisticRegression(solver='lbfgs', max_iter=10000).fit(x_train, y_train)

# Accuracy on the held-out test set
logreg.score(x_test, y_test)

A confusion matrix is used to evaluate the performance of a classification model. It breaks the predictions down into four counts:

- True Positive (TP): Correct positive prediction
- False Positive (FP): Incorrect positive prediction, real label is negative
- False Negative (FN): Incorrect negative prediction, real label is positive
- True Negative (TN): Correct negative prediction

It looks like this:



In [None]:

# Evaluate the fitted model on the held-out test set: confusion matrix plus
# precision, recall and F1 for the positive (Attrition = 1) class.

y_pred = logreg.predict(x_test)

conf_matrix = confusion_matrix(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:", conf_matrix, sep="\n")
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

2. LogisticRegressionCV

This model is a subclass of LogisticRegression, but includes cross-validation.



In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
# Same 80/20 split and seed as the other models, so the scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(df.drop('Attrition', axis=1), df['Attrition'], test_size=0.2, random_state=33)

# LogisticRegressionCV tunes the regularisation strength by cross-validation
# on the training data only; the test set stays untouched during fitting.
logreg = LogisticRegressionCV(max_iter=10000)
logreg.fit(x_train, y_train)

# Checking the score on the held-out test set. The previous version scored on
# the training data, which overstates performance and is not comparable with
# the test-set accuracy reported for the other models.
logreg.score(x_test, y_test)

Confusion matrix

In [None]:
# Test-set predictions of the current `logreg` model and the resulting
# confusion matrix, precision, recall and F1.
y_pred = logreg.predict(x_test)
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:")
print(conf_matrix)
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1:", f1),
):
    print(metric_label, metric_value)

3. KNeighborsClassifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Imported here because the visible import cells do not provide make_pipeline.
from sklearn.pipeline import make_pipeline

# Same 80/20 split and seed as the other models, so the scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(df.drop('Attrition', axis=1), df['Attrition'], test_size=0.2, random_state=33)

# KNN is distance-based, so features with large numeric ranges would dominate
# the neighbour search if left unscaled. Standardise inside a pipeline: the
# scaler is fitted on the training data only and re-applied automatically by
# predict/score, so `knn` still accepts the raw feature frames.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn.fit(x_train, y_train)

# Checking the score on the held-out test set
knn.score(x_test, y_test)