In [23]:
import os
import pandas as pd
import ydata_profiling

extract_path = './AI_Project_Data'

# Import des données

In [24]:
csv_employee_survey_data = os.path.join(extract_path, 'employee_survey_data.csv')
csv_manager_survey_data = os.path.join(extract_path, 'manager_survey_data.csv')
csv_general_data = os.path.join(extract_path, 'general_data.csv')
csv_out_time = os.path.join(extract_path, 'out_time.csv')
csv_in_time = os.path.join(extract_path, 'in_time.csv')

employee_survey_data = pd.read_csv(csv_employee_survey_data)
manager_survey_data = pd.read_csv(csv_manager_survey_data)
general_data = pd.read_csv(csv_general_data)
out_time_raw = pd.read_csv(csv_out_time)
in_time_raw = pd.read_csv(csv_in_time)

# Traitement des données

## Données global

### Merge

In [25]:
# Merge employee_survey_data, general_data et manager_survey_data on EmployeeID

full_data = pd.merge(employee_survey_data, general_data, on='EmployeeID')
full_data = pd.merge(full_data, manager_survey_data, on='EmployeeID')

### Valeurs constantes

In [26]:
# EmployeeCount has constant value "1"
# Over18 has constant value "True"
# StandardHours has constant value "8"
full_data.drop(columns=['EmployeeCount', 'Over18', 'StandardHours'], inplace=True)

### Valeurs manquantes

#### Catégories

In [27]:
# EnvironmentSatisfaction Missing 25 float
# JobSatisfaction Missing 20 float
# WorkLifeBalance Missing 38 float

# Replace missing values with the mode
mode_replace_missing = ['EnvironmentSatisfaction', 'JobSatisfaction', 'WorkLifeBalance']
full_data[mode_replace_missing] = full_data[mode_replace_missing].fillna(full_data[mode_replace_missing].mode().iloc[0])

#### Numérique

In [28]:
# NumCompaniesWorked Missing 19 float
# TotalWorkingYears Missing	9 float

# Replace missing values with the median rounded to the nearest integer
median_replace_missing = ['NumCompaniesWorked', 'TotalWorkingYears']
full_data[median_replace_missing] = full_data[median_replace_missing].fillna(full_data[median_replace_missing].median().round())

### Type des valeurs

#### Simplification des types

In [29]:
# EnvironmentSatisfaction fake float
full_data['EnvironmentSatisfaction'] = full_data['EnvironmentSatisfaction'].astype(int)

# JobSatisfaction fake float
full_data['JobSatisfaction'] = full_data['JobSatisfaction'].astype(int)

# WorkLifeBalance fake float
full_data['WorkLifeBalance'] = full_data['WorkLifeBalance'].astype(int)

# NumCompaniesWorked fake float
full_data['NumCompaniesWorked'] = full_data['NumCompaniesWorked'].astype(int)

# TotalWorkingYears fake float
full_data['TotalWorkingYears'] = full_data['TotalWorkingYears'].astype(int)


#### Conversion des type object en valeur numérique

In [30]:
# # Attrition object to int
full_data['Attrition'] = full_data['Attrition'].map({'No': 0, 'Yes': 1})

# BusinessTravel object to int
full_data['BusinessTravel'] = full_data['BusinessTravel'].map({'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2})

# Department object to int
full_data['Department'] = full_data['Department'].map({'Research & Development': 0, 'Sales': 1, 'Human Resources': 2})

# EducationField object to int
full_data['EducationField'] = full_data['EducationField'].map({'Life Sciences': 0, 'Medical': 1, 'Marketing': 2, 'Technical Degree': 3, 'Human Resources': 4, 'Other': 5})

# Gender object to int
full_data['Gender'] = full_data['Gender'].map({'Male': 0, 'Female': 1})

# JobRole object to int
full_data['JobRole'] = full_data['JobRole'].map({'Healthcare Representative': 0, 'Research Scientist': 1, 'Sales Executive': 2, 'Human Resources': 3, 'Research Director': 4, 'Laboratory Technician': 5, 'Manufacturing Director': 6, 'Sales Representative': 7, 'Manager': 8})

# MaritalStatus object to int
full_data['MaritalStatus'] = full_data['MaritalStatus'].map({'Single': 0, 'Married': 1, 'Divorced': 2})

## in_time et out_time

In [31]:
# Transformer les dataframes (passer les colonnes de dates en lignes)
in_time_melted = in_time_raw.melt(id_vars=['Unnamed: 0'], var_name='date', value_name='arrival_time')
out_time_melted = out_time_raw.melt(id_vars=['Unnamed: 0'], var_name='date', value_name='departure_time')

# Renommer la colonne EmployeeID
in_time_melted.rename(columns={'Unnamed: 0': 'EmployeeID'}, inplace=True)
out_time_melted.rename(columns={'Unnamed: 0': 'EmployeeID'}, inplace=True)

# Fusionner les deux DataFrames sur 'id' et 'date'
merged_clock_in = pd.merge(in_time_melted, out_time_melted, on=['EmployeeID', 'date'], how='outer')

merged_clock_in['arrival_time'] = pd.to_datetime(merged_clock_in['arrival_time'])
merged_clock_in['departure_time'] = pd.to_datetime(merged_clock_in['departure_time'])

# Calculer le temps travaillé (différence entre départ et arrivée)
merged_clock_in['worked_time'] = merged_clock_in['departure_time'] - merged_clock_in['arrival_time']

# Convertir en heures pour avoir un format lisible
merged_clock_in['worked_hours'] = merged_clock_in['worked_time'].dt.total_seconds() / 3600

# Trier par id et date
merged_clock_in.sort_values(by=['EmployeeID', 'date'], inplace=True)

# Analyse

## Données global

In [32]:
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   EmployeeID               4410 non-null   int64
 1   EnvironmentSatisfaction  4410 non-null   int64
 2   JobSatisfaction          4410 non-null   int64
 3   WorkLifeBalance          4410 non-null   int64
 4   Age                      4410 non-null   int64
 5   Attrition                4410 non-null   int64
 6   BusinessTravel           4410 non-null   int64
 7   Department               4410 non-null   int64
 8   DistanceFromHome         4410 non-null   int64
 9   Education                4410 non-null   int64
 10  EducationField           4410 non-null   int64
 11  Gender                   4410 non-null   int64
 12  JobLevel                 4410 non-null   int64
 13  JobRole                  4410 non-null   int64
 14  MaritalStatus            4410 non-null   int64
 15  Mont

In [33]:
#full_data_report = ydata_profiling.ProfileReport(full_data, title='Full Data')
#full_data_report.to_notebook_iframe()

## in_time and out_time data

In [34]:
#merged_clock_in_report = ydata_profiling.ProfileReport(merged_clock_in, title='Merged Data')
#merged_clock_in_report.to_notebook_iframe()

## Quitte l'entreprise selon chaque critère  

### DataFrame pour chaque critère

In [35]:
environment_satisfaction_data = full_data[["EmployeeID", "EnvironmentSatisfaction", "Attrition"]]
job_satisfaction_data = full_data[["EmployeeID", "JobSatisfaction", "Attrition"]]
workLife_balance_data = full_data[["EmployeeID", "WorkLifeBalance", "Attrition"]]
age_data = full_data[["EmployeeID", "Age", "Attrition"]]
business_travel_data = full_data[["EmployeeID", "BusinessTravel", "Attrition"]]
department_data = full_data[["EmployeeID", "Department", "Attrition"]]
distance_from_home_data = full_data[["EmployeeID", "DistanceFromHome", "Attrition"]]
education_data = full_data[["EmployeeID", "Education", "Attrition"]]
education_field_data = full_data[["EmployeeID", "EducationField", "Attrition"]]
gender_data = full_data[["EmployeeID", "Gender", "Attrition"]]
job_level_data = full_data[["EmployeeID", "JobLevel", "Attrition"]]
job_role_data = full_data[["EmployeeID", "JobRole", "Attrition"]]
marital_status_data = full_data[["EmployeeID", "MaritalStatus", "Attrition"]]
monthly_income_data = full_data[["EmployeeID", "MonthlyIncome", "Attrition"]]
num_companies_worked_data = full_data[["EmployeeID", "NumCompaniesWorked", "Attrition"]]
percent_salary_hike_data = full_data[["EmployeeID", "PercentSalaryHike", "Attrition"]]
stock_option_level_data = full_data[["EmployeeID", "StockOptionLevel", "Attrition"]]
total_working_years_data = full_data[["EmployeeID", "TotalWorkingYears", "Attrition"]]
training_times_last_year_data = full_data[["EmployeeID", "TrainingTimesLastYear", "Attrition"]]
years_at_company_data = full_data[["EmployeeID", "YearsAtCompany", "Attrition"]]
years_since_last_promotion_data = full_data[["EmployeeID", "YearsSinceLastPromotion", "Attrition"]]
years_with_curr_manager_data = full_data[["EmployeeID", "YearsWithCurrManager", "Attrition"]]
job_involvement_data = full_data[["EmployeeID", "JobInvolvement", "Attrition"]]
performance_rating_data = full_data[["EmployeeID", "PerformanceRating", "Attrition"]]

### Calcule le % pour Oui et Non

In [36]:
def calculate_percentages(df, column_filter, column_target, filter_values, target_values,):
    filtered_df = df[df[column_filter].isin(filter_values)]
    total_filtered = filtered_df.shape[0]

    percentages = {}
    for target_value in target_values:
        count = filtered_df[filtered_df[column_target] == target_value].shape[0]
        percentages[target_value] = round((count / total_filtered * 100), 2) if total_filtered > 0 else 0

    return percentages, total_filtered

def calculate_percentages_values(df, column_target, filter_condition, target_values):
    filtered_df = df[filter_condition]
    total_filtered = filtered_df.shape[0]

    percentages = {}
    for target_value in target_values:
        count = filtered_df[filtered_df[column_target] == target_value].shape[0]
        percentages[target_value] = round((count / total_filtered * 100), 2) if total_filtered > 0 else 0

    return percentages, total_filtered

### Pour EnvironmentSatisfaction

In [56]:
environment_satisfaction_data = full_data[["EmployeeID", "EnvironmentSatisfaction", "Attrition"]]
# Calcul des pourcentages pour EnvironmentSatisfaction = [1, 2]
percentages_poor, count_poor = calculate_percentages(environment_satisfaction_data, "EnvironmentSatisfaction", "Attrition", [1, 2], [1, 0])

# Calcul des pourcentages pour EnvironmentSatisfaction = [3, 4]
percentages_great, count_great = calculate_percentages(environment_satisfaction_data, "EnvironmentSatisfaction", "Attrition", [3, 4], [1, 0])

print("Pourcentages pour EnvironmentSatisfaction = [1, 2]:", percentages_poor, "(Nombre de lignes :", count_poor, ")")
print("Pourcentages pour EnvironmentSatisfaction = [3, 4]:", percentages_great, "(Nombre de lignes :", count_great, ")")

Pourcentages pour EnvironmentSatisfaction = [1, 2]: {1: 20.05, 0: 79.95} (Nombre de lignes : 1701 )
Pourcentages pour EnvironmentSatisfaction = [3, 4]: {1: 13.66, 0: 86.34} (Nombre de lignes : 2709 )


### Pour JobSatisfaction

In [38]:
# Calcul des pourcentages pour JobSatisfaction = [1, 2]
percentages_poor, count_poor = calculate_percentages(job_satisfaction_data, "JobSatisfaction", "Attrition", [1, 2], [1, 0])

# Calcul des pourcentages pour JobSatisfaction = [3, 4]
percentages_great, count_great = calculate_percentages(job_satisfaction_data, "JobSatisfaction","Attrition", [3, 4], [1, 0])

print("Pourcentages pour JobSatisfaction = [1, 2]:", percentages_poor, "(Nombre de lignes :", count_poor, ")")
print("Pourcentages pour JobSatisfaction = [3, 4]:", percentages_great, "(Nombre de lignes :", count_great, ")")

Pourcentages pour JobSatisfaction = [1, 2]: {1: 19.71, 0: 80.29} (Nombre de lignes : 1700 )
Pourcentages pour JobSatisfaction = [3, 4]: {1: 13.87, 0: 86.13} (Nombre de lignes : 2710 )


### Pour WorkLifeBalance

In [39]:
# Calcul des pourcentages pour WorkLifeBalance = [1, 2]
percentages_poor, count_poor = calculate_percentages(workLife_balance_data, "WorkLifeBalance", "Attrition", [1, 2], [1, 0])

# Calcul des pourcentages pour WorkLifeBalance = [3, 4]
percentages_great, count_great = calculate_percentages(workLife_balance_data, "WorkLifeBalance", "Attrition", [3, 4], [1, 0])

print("Pourcentages pour WorkLifeBalance = [1, 2]:", "(Nombre de lignes :", count_poor, ")")
print("Pourcentages pour WorkLifeBalance = [3, 4]:", "(Nombre de lignes :", count_great, ")")

Pourcentages pour WorkLifeBalance = [1, 2]: (Nombre de lignes : 1258 )
Pourcentages pour WorkLifeBalance = [3, 4]: (Nombre de lignes : 3152 )


### Pour Age

In [40]:
# Calcul des pourcentages pour Age <= 25
filter_young = age_data["Age"] <= 25
percentages_young, count_young = calculate_percentages_values(age_data, "Attrition", filter_young, [1, 0])

# Calcul des pourcentages pour 25 < Age <= 45
filter_middle = (age_data["Age"] > 25) & (age_data["Age"] <= 45)
percentages_middle, count_middle = calculate_percentages_values(age_data, "Attrition", filter_middle, [1, 0])

# Calcul des pourcentages pour Age > 45
filter_old = age_data["Age"] > 45
percentages_old, count_old = calculate_percentages_values(age_data, "Attrition", filter_old, [1, 0])


print("Pourcentages pour Age <= 25 :", percentages_young, "(Nombre de lignes :", count_young, ")")
print("Pourcentages pour 25 < Age <= 45 :", percentages_middle, "(Nombre de lignes :", count_middle, ")")
print("Pourcentages pour Age > 45 :", percentages_old, "(Nombre de lignes :", count_old, ")")

Pourcentages pour Age <= 25 : {1: 35.77, 0: 64.23} (Nombre de lignes : 369 )
Pourcentages pour 25 < Age <= 45 : {1: 14.8, 0: 85.2} (Nombre de lignes : 3222 )
Pourcentages pour Age > 45 : {1: 12.45, 0: 87.55} (Nombre de lignes : 819 )


### Pour BusinessTravel

In [41]:
# Calcul des pourcentages pour BusinessTravel = 0 = Voyage jamais
percentages_never, count_never = calculate_percentages(business_travel_data,"BusinessTravel", "Attrition", [0], [1, 0])

# Calcul des pourcentages pour BusinessTravel = 1 = Voyage rarement
percentages_rarely, count_rarely = calculate_percentages(business_travel_data,"BusinessTravel", "Attrition", [1], [1, 0])

# Calcul des pourcentages pour BusinessTravel = 2 = Voyage fréquemment
percentages_frequently, count_frequently = calculate_percentages(business_travel_data,"BusinessTravel", "Attrition", [2], [1, 0])


print("Pourcentages pour BusinessTravel = Voyage jamais :", percentages_never, "(Nombre de lignes :", count_never, ")")
print("Pourcentages pour BusinessTravel = Voyage rarement :", percentages_rarely, "(Nombre de lignes :", count_rarely, ")")
print("Pourcentages pour BusinessTravel = Voyage fréquemment :", percentages_frequently, "(Nombre de lignes :", count_frequently, ")")

Pourcentages pour BusinessTravel = Voyage jamais : {1: 8.0, 0: 92.0} (Nombre de lignes : 450 )
Pourcentages pour BusinessTravel = Voyage rarement : {1: 14.96, 0: 85.04} (Nombre de lignes : 3129 )
Pourcentages pour BusinessTravel = Voyage fréquemment : {1: 24.91, 0: 75.09} (Nombre de lignes : 831 )


### Pour Department

In [42]:
# Calcul des pourcentages pour Department = 0 = Research & Development
percentages_rd, count_rd = calculate_percentages(department_data,"Department", "Attrition", [0], [1, 0])

# Calcul des pourcentages pour Department = 1 = Sales
percentages_sales, count_sales = calculate_percentages(department_data,"Department", "Attrition", [1], [1, 0])

# Calcul des pourcentages pour Department = 2 = Human Resources
percentages_hr, count_hr = calculate_percentages(department_data,"Department", "Attrition", [2], [1, 0])


print("Pourcentages pour Department = Research & Development :", percentages_rd, "(Nombre de lignes :", count_rd, ")")
print("Pourcentages pour Department = Sales :", percentages_sales, "(Nombre de lignes :", count_sales, ")")
print("Pourcentages pour Department = Human Resources :", percentages_hr, "(Nombre de lignes :", count_hr, ")")

Pourcentages pour Department = Research & Development : {1: 15.71, 0: 84.29} (Nombre de lignes : 2883 )
Pourcentages pour Department = Sales : {1: 15.02, 0: 84.98} (Nombre de lignes : 1338 )
Pourcentages pour Department = Human Resources : {1: 30.16, 0: 69.84} (Nombre de lignes : 189 )


### Pour DistanceFromHome

In [43]:
distance_from_home_data.head()

# Calcul des pourcentages pour DistanceFromHome <= 10
filter_little = distance_from_home_data["DistanceFromHome"] <= 10
percentages_little,count_little = calculate_percentages_values(distance_from_home_data, "Attrition", filter_little, [1, 0])

# Calcul des pourcentages pour 10 < DistanceFromHome <= 20
filter_middle = (distance_from_home_data["DistanceFromHome"] > 10) & (distance_from_home_data["DistanceFromHome"] <= 20)
percentages_middle,count_middle = calculate_percentages_values(distance_from_home_data, "Attrition", filter_middle, [1, 0])

# Calcul des pourcentages pour DistanceFromHome > 20
filter_alot = distance_from_home_data["DistanceFromHome"] > 20
percentages_alot,count_alot = calculate_percentages_values(distance_from_home_data, "Attrition", filter_alot, [1, 0])


print("Pourcentages pour DistanceFromHome <= 10 :", percentages_young, "(Nombre de lignes :", count_little, ")")
print("Pourcentages pour 10 < DistanceFromHome <= 20 :", percentages_middle, "(Nombre de lignes :", count_middle, ")")
print("Pourcentages pour 20 < DistanceFromHome :", percentages_old, "(Nombre de lignes :", count_alot, ")")

Pourcentages pour DistanceFromHome <= 10 : {1: 35.77, 0: 64.23} (Nombre de lignes : 3078 )
Pourcentages pour 10 < DistanceFromHome <= 20 : {1: 18.33, 0: 81.67} (Nombre de lignes : 720 )
Pourcentages pour 20 < DistanceFromHome : {1: 12.45, 0: 87.55} (Nombre de lignes : 612 )
