In [3]:
import os
import pandas as pd
import ydata_profiling

# Relative directory that the CSV file names below are joined onto.
extract_path = './AI_Project_Data'

  from .autonotebook import tqdm as notebook_tqdm


# Import des données

In [29]:
# Resolve the path of every CSV export inside the data directory.
(
    csv_employee_survey_data,
    csv_manager_survey_data,
    csv_general_data,
    csv_out_time,
    csv_in_time,
) = (
    os.path.join(extract_path, file_name)
    for file_name in (
        'employee_survey_data.csv',
        'manager_survey_data.csv',
        'general_data.csv',
        'out_time.csv',
        'in_time.csv',
    )
)

# Read the files in the same order as the paths above, one DataFrame per file.
(
    employee_survey_data,
    manager_survey_data,
    general_data,
    out_time_raw,
    in_time_raw,
) = map(
    pd.read_csv,
    (
        csv_employee_survey_data,
        csv_manager_survey_data,
        csv_general_data,
        csv_out_time,
        csv_in_time,
    ),
)

# Traitement des données

## Données globales

### Merge

In [30]:
# Join the employee survey, the general data and the manager survey on EmployeeID
# (default inner join, applied left to right).
full_data = (
    employee_survey_data
    .merge(general_data, on='EmployeeID')
    .merge(manager_survey_data, on='EmployeeID')
)

### Valeurs constantes

In [6]:
# Columns noted as constant during exploration, hence dropped:
#   EmployeeCount has constant value "1"
#   Over18 has constant value "True"
#   StandardHours has constant value "8"
# Rebind instead of mutating in place, and ignore already-missing columns, so
# that re-running this cell does not raise KeyError after the columns are gone.
full_data = full_data.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours'],
    errors='ignore',
)

### Valeurs manquantes

In [7]:
# EnvironmentSatisfaction Missing	25
# JobSatisfaction Missing	20
# WorkLifeBalance Missing	38
# NumCompaniesWorked Missing 19
# TotalWorkingYears Missing	9



### Type des valeurs

In [8]:
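# EnvironmentSatisfaction: integer-valued score stored as float64 (the column contains missing values)
# JobSatisfaction: integer-valued score stored as float64 (the column contains missing values)
# WorkLifeBalance: integer-valued score stored as float64 (the column contains missing values)
# NOTE: .astype(int) raises while NaN values remain; handle missing values first (or use the nullable 'Int64' dtype)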
# full_data['EnvironmentSatisfaction'] = full_data['EnvironmentSatisfaction'].astype(int)
# full_data['JobSatisfaction'] = full_data['JobSatisfaction'].astype(int)
# full_data['WorkLifeBalance'] = full_data['WorkLifeBalance'].astype(int)

# Attrition object to int
# BusinessTravel object to int
# Department object to int
# EducationField object to int
# Gender object to int
# JobLevel is already int64 (see full_data.info()) - no conversion needed
# JobRole object to int
# MaritalStatus object to int

## in_time et out_time

In [9]:
# Reshape the wide clock-in / clock-out tables (one column per date) into long
# format (one row per employee and date), naming the id column EmployeeID.
in_time_melted = (
    in_time_raw
    .melt(id_vars=['Unnamed: 0'], var_name='date', value_name='arrival_time')
    .rename(columns={'Unnamed: 0': 'EmployeeID'})
)
out_time_melted = (
    out_time_raw
    .melt(id_vars=['Unnamed: 0'], var_name='date', value_name='departure_time')
    .rename(columns={'Unnamed: 0': 'EmployeeID'})
)

# Pair arrivals with departures per (EmployeeID, date), parse the timestamps,
# derive the time worked (as a timedelta and in hours), then order the rows by
# employee and date.
merged_clock_in = (
    in_time_melted
    .merge(out_time_melted, on=['EmployeeID', 'date'], how='outer')
    .assign(
        arrival_time=lambda frame: pd.to_datetime(frame['arrival_time']),
        departure_time=lambda frame: pd.to_datetime(frame['departure_time']),
        worked_time=lambda frame: frame['departure_time'] - frame['arrival_time'],
        worked_hours=lambda frame: frame['worked_time'].dt.total_seconds() / 3600,
    )
    .sort_values(by=['EmployeeID', 'date'])
)

# Analyse

## Données globales

In [31]:
# Print the column names, non-null counts and dtypes of the merged frame.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EmployeeID               4410 non-null   int64  
 1   EnvironmentSatisfaction  4385 non-null   float64
 2   JobSatisfaction          4390 non-null   float64
 3   WorkLifeBalance          4372 non-null   float64
 4   Age                      4410 non-null   int64  
 5   Attrition                4410 non-null   object 
 6   BusinessTravel           4410 non-null   object 
 7   Department               4410 non-null   object 
 8   DistanceFromHome         4410 non-null   int64  
 9   Education                4410 non-null   int64  
 10  EducationField           4410 non-null   object 
 11  EmployeeCount            4410 non-null   int64  
 12  Gender                   4410 non-null   object 
 13  JobLevel                 4410 non-null   int64  
 14  JobRole                 

In [11]:
# Build a profiling report for full_data and render it inline in the notebook.
# NOTE(review): the recorded run of this cell failed with
# "No module named 'ipywidgets'" - presumably that package has to be installed
# in the kernel environment for the report to render; confirm.
full_data_report = ydata_profiling.ProfileReport(full_data, title='Full Data')
full_data_report.to_notebook_iframe()

ModuleNotFoundError: No module named 'ipywidgets'

## in_time and out_time data

In [12]:
# Build a profiling report for the merged clock-in/clock-out frame and render it inline.
# NOTE(review): the recorded run of this cell failed with
# "No module named 'ipywidgets'" - presumably that package has to be installed
# in the kernel environment for the report to render; confirm.
merged_clock_in_report = ydata_profiling.ProfileReport(merged_clock_in, title='Merged Data')
merged_clock_in_report.to_notebook_iframe()

ModuleNotFoundError: No module named 'ipywidgets'

## Départs de l'entreprise selon chaque critère

In [32]:
# Re-check the columns available in full_data before slicing it per criterion.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EmployeeID               4410 non-null   int64  
 1   EnvironmentSatisfaction  4385 non-null   float64
 2   JobSatisfaction          4390 non-null   float64
 3   WorkLifeBalance          4372 non-null   float64
 4   Age                      4410 non-null   int64  
 5   Attrition                4410 non-null   object 
 6   BusinessTravel           4410 non-null   object 
 7   Department               4410 non-null   object 
 8   DistanceFromHome         4410 non-null   int64  
 9   Education                4410 non-null   int64  
 10  EducationField           4410 non-null   object 
 11  EmployeeCount            4410 non-null   int64  
 12  Gender                   4410 non-null   object 
 13  JobLevel                 4410 non-null   int64  
 14  JobRole                 

In [33]:
def _attrition_slice(feature):
    """Return the EmployeeID, `feature` and Attrition columns of full_data."""
    return full_data[["EmployeeID", feature, "Attrition"]]


# One three-column frame per criterion, each paired with the Attrition label.
environment_satisfaction_data = _attrition_slice("EnvironmentSatisfaction")
job_satisfaction_data = _attrition_slice("JobSatisfaction")
workLife_balance_data = _attrition_slice("WorkLifeBalance")
age_data = _attrition_slice("Age")
business_travel_data = _attrition_slice("BusinessTravel")
department_data = _attrition_slice("Department")
distance_from_home_data = _attrition_slice("DistanceFromHome")
education_data = _attrition_slice("Education")
education_field_data = _attrition_slice("EducationField")
gender_data = _attrition_slice("Gender")
job_level_data = _attrition_slice("JobLevel")
job_role_data = _attrition_slice("JobRole")
marital_status_data = _attrition_slice("MaritalStatus")
monthly_income_data = _attrition_slice("MonthlyIncome")
num_companies_worked_data = _attrition_slice("NumCompaniesWorked")
percent_salary_hike_data = _attrition_slice("PercentSalaryHike")
stock_option_level_data = _attrition_slice("StockOptionLevel")
total_working_years_data = _attrition_slice("TotalWorkingYears")
training_times_last_year_data = _attrition_slice("TrainingTimesLastYear")
years_at_company_data = _attrition_slice("YearsAtCompany")
years_since_last_promotion_data = _attrition_slice("YearsSinceLastPromotion")
years_with_curr_manager_data = _attrition_slice("YearsWithCurrManager")
job_involvement_data = _attrition_slice("JobInvolvement")
performance_rating_data = _attrition_slice("PerformanceRating")

In [55]:
def calculate_percentages(df, column_filter, column_target, filter_values, target_values):
    """Percentage of each target value among rows whose filter column matches.

    Args:
        df: DataFrame holding both `column_filter` and `column_target`.
        column_filter: Name of the column used to select rows.
        column_target: Name of the column whose values are counted.
        filter_values: Values of `column_filter` that select a row (via `isin`).
        target_values: Values of `column_target` to report on.

    Returns:
        dict mapping each entry of `target_values` to its share of the selected
        rows, in percent rounded to 2 decimals; 0 when no row is selected.
    """
    selected = df[df[column_filter].isin(filter_values)]
    n_selected = len(selected)

    def share_of(value):
        # Count the selected rows carrying this target value.
        hits = len(selected[selected[column_target] == value])
        if n_selected > 0:
            return round((hits / n_selected * 100), 2)
        return 0

    return {value: share_of(value) for value in target_values}

def calculate_percentages_values(df, column_target, filter_condition, target_values):
    """Percentage of each target value among rows selected by a precomputed filter.

    Args:
        df: DataFrame holding `column_target`.
        column_target: Name of the column whose values are counted.
        filter_condition: Indexer applied as `df[filter_condition]` to select
            rows (the notebook passes boolean Series).
        target_values: Values of `column_target` to report on.

    Returns:
        dict mapping each entry of `target_values` to its share of the selected
        rows, in percent rounded to 2 decimals; 0 when no row is selected.
    """
    matching_rows = df[filter_condition]
    row_total = len(matching_rows)

    def percentage_of(value):
        # Count the selected rows carrying this target value.
        hits = len(matching_rows[matching_rows[column_target] == value])
        if row_total > 0:
            return round((hits / row_total * 100), 2)
        return 0

    return {value: percentage_of(value) for value in target_values}

In [56]:
# Attrition percentages for low (1.0, 2.0) and then high (3.0, 4.0)
# EnvironmentSatisfaction scores.
percentages_2_1, percentages_3_4 = (
    calculate_percentages(
        environment_satisfaction_data,
        "EnvironmentSatisfaction",
        "Attrition",
        score_group,
        ["Yes", "No"],
    )
    for score_group in ([1.0, 2.0], [3.0, 4.0])
)

print("Pourcentages pour EnvironmentSatisfaction = [1.0, 2.0]:", percentages_2_1)
print("Pourcentages pour EnvironmentSatisfaction = [3.0, 4.0]:", percentages_3_4)

Pourcentages pour EnvironmentSatisfaction = [1.0, 2.0]: {'Yes': 20.05, 'No': 79.95}
Pourcentages pour EnvironmentSatisfaction = [3.0, 4.0]: {'Yes': 13.6, 'No': 86.4}


In [37]:
# Attrition percentages for low (1.0, 2.0) and then high (3.0, 4.0)
# JobSatisfaction scores.
percentages_2_1, percentages_3_4 = (
    calculate_percentages(
        job_satisfaction_data,
        "JobSatisfaction",
        "Attrition",
        score_group,
        ["Yes", "No"],
    )
    for score_group in ([1.0, 2.0], [3.0, 4.0])
)

print("Pourcentages pour JobSatisfaction = [1.0, 2.0]:", percentages_2_1)
print("Pourcentages pour JobSatisfaction = [3.0, 4.0]:", percentages_3_4)

Pourcentages pour JobSatisfaction = [1.0, 2.0]: {'Yes': 19.71, 'No': 80.29}
Pourcentages pour JobSatisfaction = [3.0, 4.0]: {'Yes': 13.94, 'No': 86.06}


In [40]:
# Attrition percentages for low (1.0, 2.0) and then high (3.0, 4.0)
# WorkLifeBalance scores.
percentages_2_1, percentages_3_4 = (
    calculate_percentages(
        workLife_balance_data,
        "WorkLifeBalance",
        "Attrition",
        score_group,
        ["Yes", "No"],
    )
    for score_group in ([1.0, 2.0], [3.0, 4.0])
)

print("Pourcentages pour WorkLifeBalance = [1.0, 2.0]:", percentages_2_1)
print("Pourcentages pour WorkLifeBalance = [3.0, 4.0]:", percentages_3_4)

Pourcentages pour WorkLifeBalance = [1.0, 2.0]: {'Yes': 19.55, 'No': 80.45}
Pourcentages pour WorkLifeBalance = [3.0, 4.0]: {'Yes': 14.8, 'No': 85.2}


In [57]:
# Boolean masks for the three age buckets: Age <= 25, 25 < Age <= 45, Age > 45.
filter_young = age_data["Age"].le(25)
filter_middle = age_data["Age"].gt(25) & age_data["Age"].le(45)
filter_old = age_data["Age"].gt(45)

# Attrition percentages within each bucket, in the same order as the masks.
percentages_young, percentages_middle, percentages_old = (
    calculate_percentages_values(age_data, "Attrition", age_filter, ["Yes", "No"])
    for age_filter in (filter_young, filter_middle, filter_old)
)

print("Pourcentages pour Age <= 25 :", percentages_young)
print("Pourcentages pour 25 < Age <= 45 :", percentages_middle)
print("Pourcentages pour 45 < Age :", percentages_old)

Pourcentages pour Age <= 25 : {'Yes': 35.77, 'No': 64.23}
Pourcentages pour 25 < Age <= 45 : {'Yes': 14.8, 'No': 85.2}
Pourcentages pour 45 < Age : {'Yes': 12.45, 'No': 87.55}


In [47]:
# Distinct ages in order of first appearance.
unique_ages = pd.unique(age_data['Age'])

# Show the distinct values.
print("Unique ages:", unique_ages)

Unique ages: [51 31 32 38 46 28 29 25 45 36 55 47 37 21 35 26 50 53 42 44 49 18 41 39
 58 33 43 52 27 30 54 40 23 48 57 34 24 22 56 60 19 20 59]
