In [14]:
import os
import pandas as pd
import ydata_profiling

# Directory (relative to the notebook's working directory) that holds the
# project's CSV files; every path built in the loading cell is joined onto it.
extract_path = './AI_Project_Data'

# Import des données

In [15]:
# Build the path of every source CSV inside the data directory.
(
    csv_employee_survey_data,
    csv_manager_survey_data,
    csv_general_data,
    csv_out_time,
    csv_in_time,
) = (
    os.path.join(extract_path, csv_name)
    for csv_name in (
        'employee_survey_data.csv',
        'manager_survey_data.csv',
        'general_data.csv',
        'out_time.csv',
        'in_time.csv',
    )
)

# Load each file into its own DataFrame, in the same order as the paths above.
(
    employee_survey_data,
    manager_survey_data,
    general_data,
    out_time_raw,
    in_time_raw,
) = map(
    pd.read_csv,
    (
        csv_employee_survey_data,
        csv_manager_survey_data,
        csv_general_data,
        csv_out_time,
        csv_in_time,
    ),
)

# Traitement des données

## Données globales

### Merge

In [16]:
# Join the employee survey, the general data and the manager survey into one
# table keyed on EmployeeID. Both joins use pandas' default inner join.
full_data = (
    employee_survey_data
    .merge(general_data, on='EmployeeID')
    .merge(manager_survey_data, on='EmployeeID')
)

### Valeurs constantes

In [17]:
# Remove the columns that hold a single constant value for every row:
#   EmployeeCount has constant value "1"
#   Over18 has constant value "True"
#   StandardHours has constant value "8"
# The result is rebound to full_data (no inplace=True) and errors='ignore' is
# passed so that re-running this cell is harmless: when the columns are already
# gone the frame is left unchanged instead of raising KeyError.
full_data = full_data.drop(
    columns=['EmployeeCount', 'Over18', 'StandardHours'],
    errors='ignore',
)

### Valeurs manquantes

In [18]:
# EnvironmentSatisfaction Missing	25
# JobSatisfaction Missing	20
# WorkLifeBalance Missing	38
# NumCompaniesWorked Missing 19
# TotalWorkingYears Missing	9



### Type des valeurs

In [None]:
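# "Fake float" columns: EnvironmentSatisfaction, JobSatisfaction and WorkLifeBalance
# are loaded as float64 and each of them contains missing values.
# NOTE: astype(int) raises while NaN values remain, so handle the missing values first
# (or cast to the nullable 'Int64' dtype) before enabling the conversions below.
# full_data['EnvironmentSatisfaction'] = full_data['EnvironmentSatisfaction'].astype(int)
# full_data['JobSatisfaction'] = full_data['JobSatisfaction'].astype(int)
# full_data['WorkLifeBalance'] = full_data['WorkLifeBalance'].astype(int)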

# Attrition object to int
# BusinessTravel object to int
# Department object to int
# EducationField object to int
# Gender object to int
# JobLevel is already int64 (see full_data.info()) - no conversion needed
# JobRole object to int
# MaritalStatus object to int

## in_time et out_time

In [None]:
# Reshape both clock tables from wide (one column per date) to long (one row
# per employee and date), then give the identifier column its real name.
in_time_melted = (
    in_time_raw
    .melt(id_vars=['Unnamed: 0'], var_name='date', value_name='arrival_time')
    .rename(columns={'Unnamed: 0': 'EmployeeID'})
)
out_time_melted = (
    out_time_raw
    .melt(id_vars=['Unnamed: 0'], var_name='date', value_name='departure_time')
    .rename(columns={'Unnamed: 0': 'EmployeeID'})
)

merged_clock_in = (
    in_time_melted
    # Pair arrivals with departures on 'EmployeeID' and 'date'; the outer join
    # keeps rows that exist in only one of the two tables.
    .merge(out_time_melted, on=['EmployeeID', 'date'], how='outer')
    # Parse the clock-in / clock-out values as datetimes.
    .assign(
        arrival_time=lambda clock: pd.to_datetime(clock['arrival_time']),
        departure_time=lambda clock: pd.to_datetime(clock['departure_time']),
    )
    # Time worked is the difference between departure and arrival.
    .assign(worked_time=lambda clock: clock['departure_time'] - clock['arrival_time'])
    # Express the duration in hours so it is easier to read.
    .assign(worked_hours=lambda clock: clock['worked_time'].dt.total_seconds() / 3600)
    # Order the rows by employee, then by date.
    .sort_values(by=['EmployeeID', 'date'])
)

# Analyse

## Données globales

In [21]:
# Print a summary of the merged table: index range, column names,
# non-null counts and dtypes.
full_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   EmployeeID               4410 non-null   int64  
 1   EnvironmentSatisfaction  4385 non-null   float64
 2   JobSatisfaction          4390 non-null   float64
 3   WorkLifeBalance          4372 non-null   float64
 4   Age                      4410 non-null   int64  
 5   Attrition                4410 non-null   object 
 6   BusinessTravel           4410 non-null   object 
 7   Department               4410 non-null   object 
 8   DistanceFromHome         4410 non-null   int64  
 9   Education                4410 non-null   int64  
 10  EducationField           4410 non-null   object 
 11  Gender                   4410 non-null   object 
 12  JobLevel                 4410 non-null   int64  
 13  JobRole                  4410 non-null   object 
 14  MaritalStatus           

In [None]:
# Build a ydata_profiling report over the merged employee table, titled
# 'Full Data', and display it inside the notebook.
full_data_report = ydata_profiling.ProfileReport(full_data, title='Full Data')
full_data_report.to_notebook_iframe()

## in_time and out_time data

In [None]:
# Build a ydata_profiling report over the long-format clock-in/clock-out table,
# titled 'Merged Data', and display it inside the notebook.
merged_clock_in_report = ydata_profiling.ProfileReport(merged_clock_in, title='Merged Data')
merged_clock_in_report.to_notebook_iframe()