In [1]:
import os
import pandas as pd
import seaborn as sns
import ydata_profiling
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler, Normalizer

# Directory holding the project's CSV files; every input path and the exported
# dataset path in this notebook is built from it with os.path.join.
extract_path = './AI_Project_Data'

# Import des données

In [2]:
# Build the path of every input CSV relative to the data directory.
(
    csv_employee_survey_data,
    csv_manager_survey_data,
    csv_general_data,
    csv_out_time,
    csv_in_time,
) = (
    os.path.join(extract_path, file_name)
    for file_name in (
        'employee_survey_data.csv',
        'manager_survey_data.csv',
        'general_data.csv',
        'out_time.csv',
        'in_time.csv',
    )
)

# Load each file into its own raw DataFrame (same order as the paths above).
employee_survey_df, manager_survey_df, general_df, out_time_df, in_time_df = map(
    pd.read_csv,
    (
        csv_employee_survey_data,
        csv_manager_survey_data,
        csv_general_data,
        csv_out_time,
        csv_in_time,
    ),
)

# Empty placeholder; the clock-in summary is assigned to this name in the pipeline section.
work_info = pd.DataFrame()

# Analyse des données

## Overviews

In [3]:
# Build one profiling report per raw table...
employee_survey_report = ydata_profiling.ProfileReport(employee_survey_df, title='Employee Survey Data')
manager_survey_report = ydata_profiling.ProfileReport(manager_survey_df, title='Manager Survey Data')
general_data_report = ydata_profiling.ProfileReport(general_df, title='General Data')
out_time_report = ydata_profiling.ProfileReport(out_time_df, title='Out Time')
in_time_report = ydata_profiling.ProfileReport(in_time_df, title='In Time')

# ...then render them inline, in the same order as they were declared.
for report in (
    employee_survey_report,
    manager_survey_report,
    general_data_report,
    out_time_report,
    in_time_report,
):
    report.to_notebook_iframe()

# Nettoyage des données

## Valeurs constantes

In [4]:
def remove_constant_columns(df):
    """Return `df` restricted to the columns that hold more than one distinct value.

    Distinct values are counted with `DataFrame.nunique()`, which ignores
    missing values, so a column made of a single value plus NaN is dropped too.
    """
    varies = df.nunique().gt(1)
    return df.loc[:, varies]

## Valeurs manquantes

In [5]:
def fill_categorical_na(df):
    """Fill missing values of object/category columns with their most frequent value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is never modified; a filled copy is returned.

    Returns
    -------
    pandas.DataFrame
        `df` itself when it has no object/category column, otherwise a copy in
        which each such column has its missing values replaced by that column's
        mode. On ties the first value returned by `Series.mode()` is used.
        A column with no observed value at all is left untouched, because
        there is nothing to impute from.
    """
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    if len(categorical_cols) == 0:
        return df

    # Work on a copy so the caller's (possibly raw) frame is not altered.
    filled = df.copy()
    for col in categorical_cols:
        modes = filled[col].mode(dropna=True)
        if modes.empty:
            # Entirely missing column: no most-frequent value exists.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled


In [6]:
def fill_numeric_na(df):
    """Fill missing values of numeric columns with the column median, rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is never modified; a filled copy is returned.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` in which every numeric column has its missing values
        replaced by `round(median)` of that column. A numeric column without
        any observed value has no median and is left as is.
    """
    filled = df.copy()
    for col in filled.select_dtypes(include='number').columns:
        median_value = filled[col].median()
        if pd.isna(median_value):
            # All values are missing: round(NaN) would raise, and there is
            # nothing sensible to impute anyway.
            continue
        filled[col] = filled[col].fillna(round(median_value))
    return filled

Lorsque les années totales de travail sont manquantes, utiliser les années passées dans l’entreprise comme proxy (borne inférieure de l’expérience totale) est une stratégie logique.

In [7]:
def fill_total_working_years(df):
    """Fill missing `TotalWorkingYears` with the row's `YearsAtCompany`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is never modified.

    Returns
    -------
    pandas.DataFrame
        `df` itself when either column is absent, otherwise a copy whose
        `TotalWorkingYears` gaps are filled from `YearsAtCompany`.
    """
    if 'TotalWorkingYears' not in df.columns or 'YearsAtCompany' not in df.columns:
        return df

    # Copy first: this step is applied directly to the raw general data frame.
    filled = df.copy()
    filled['TotalWorkingYears'] = filled['TotalWorkingYears'].fillna(filled['YearsAtCompany'])
    return filled

## Type des valeurs

### Simplification des types

In [8]:
def simplify_numeric_columns(df):
    """Cast float64 columns that only contain whole numbers to integers.

    A column is converted only when it is float64, has no missing value and
    every value has a zero fractional part. Columns that still contain NaN are
    left as float64, since NaN cannot be represented by `astype(int)`.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        New frame with the eligible columns cast to int; other columns unchanged.
    """
    def _to_int_if_whole(col):
        is_whole = (
            col.dtype == 'float64'
            and col.notna().all()
            and col.mod(1).eq(0).all()
        )
        return col.astype(int) if is_whole else col

    return df.apply(_to_int_if_whole)

### Conversion des types object en valeurs numériques

In [9]:
def transform_attrition_to_bool(df):
    """Map the `Attrition` column from 'Yes'/'No' to True/False.

    Only the `Attrition` column is handled. Values other than 'Yes' and 'No'
    become NaN (behaviour of `Series.map` with a dict).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is never modified.

    Returns
    -------
    pandas.DataFrame
        `df` itself when it has no `Attrition` column, otherwise a converted copy.
    """
    if 'Attrition' not in df.columns:
        return df

    # Copy first: mapping the caller's frame in place would turn a second run
    # over the same frame into all-NaN (True/False are not keys of the mapping).
    converted = df.copy()
    # Keep the `.loc` assignment form of the original so the resulting column
    # dtype is exactly what the rest of the notebook was written against.
    converted.loc[:, 'Attrition'] = converted['Attrition'].map({'Yes': True, 'No': False})
    return converted

## in_time et out_time

In [10]:
def reverse_and_merge(df):
    """Reshape the clock-in / clock-out tables to long format and join them.

    The `df` argument is not used: the function reads the module-level
    `in_time_df` and `out_time_df` frames. Each of them is melted so that every
    date column becomes a row, the 'Unnamed: 0' identifier column is renamed to
    'EmployeeID', and the two long tables are outer-joined on
    ('EmployeeID', 'date').

    Returns
    -------
    pandas.DataFrame
        Columns: EmployeeID, date, arrival_time, departure_time.
    """
    id_renaming = {'Unnamed: 0': 'EmployeeID'}

    # One row per (employee, date) with the recorded arrival time.
    arrivals = in_time_df.melt(
        id_vars=['Unnamed: 0'], var_name='date', value_name='arrival_time'
    ).rename(columns=id_renaming)

    # One row per (employee, date) with the recorded departure time.
    departures = out_time_df.melt(
        id_vars=['Unnamed: 0'], var_name='date', value_name='departure_time'
    ).rename(columns=id_renaming)

    # Outer join keeps pairs present in only one of the two tables.
    return arrivals.merge(departures, on=['EmployeeID', 'date'], how='outer')

In [11]:
def generate_work_column(df):
    """Summarise worked time per employee from a long clock-in / clock-out table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (EmployeeID, date) with `arrival_time` and `departure_time`
        values parseable by `pd.to_datetime`. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by EmployeeID, with columns:
        - mean_worked_hours: mean of the daily worked hours (missing days skipped);
        - total_worked_hours: sum of the daily worked hours;
        - worked_days: number of days with a strictly positive worked duration
          (0 when the employee has none).
    """
    arrival = pd.to_datetime(df['arrival_time'])
    departure = pd.to_datetime(df['departure_time'])

    # Daily worked duration in hours; NaN when either timestamp is missing.
    worked_hours = (departure - arrival).dt.total_seconds() / 3600

    employee_ids = df['EmployeeID']
    hours_by_employee = worked_hours.groupby(employee_ids)

    # NaN > 0 is False, so days without a complete record are not counted.
    worked_days = worked_hours.gt(0).groupby(employee_ids).sum()

    work_info = pd.DataFrame({
        'mean_worked_hours': hours_by_employee.mean(),
        'total_worked_hours': hours_by_employee.sum(),
        'worked_days': worked_days,
    })
    return work_info


In [12]:
def employee_survey_to_cat(df):
    """Replace the 1-4 codes of the employee survey columns with text labels.

    Columns handled: EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance.
    A column that is absent from `df` is skipped, and values outside the
    mapping are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the labelled columns.
    """
    labels = {
        'EnvironmentSatisfaction': {1: 'Poor', 2: 'Fair', 3: 'Good', 4: 'Excellent'},
        'JobSatisfaction': {1: 'Dissatisfied', 2: 'Neutral', 3: 'Satisfied', 4: 'Very Satisfied'},
        'WorkLifeBalance': {1: 'Poor', 2: 'Average', 3: 'Good', 4: 'Excellent'},
    }

    labelled = df.copy()
    for column, mapping in labels.items():
        # Same presence guard as the other cleaning helpers of this notebook.
        if column in labelled.columns:
            labelled[column] = labelled[column].replace(mapping)
    return labelled

In [13]:
def manager_survey_to_cat(df):
    """Replace the 1-4 codes of the manager survey columns with text labels.

    Columns handled: JobInvolvement, PerformanceRating. A column that is absent
    from `df` is skipped, and values outside the mapping are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the labelled columns.
    """
    labels = {
        'JobInvolvement': {1: 'Not Engaged', 2: 'Moderately Engaged', 3: 'Highly Engaged', 4: 'Fully Committed'},
        'PerformanceRating': {1: 'Poor', 2: 'Fair', 3: 'Good', 4: 'Outstanding'},
    }

    labelled = df.copy()
    for column, mapping in labels.items():
        # Same presence guard as the other cleaning helpers of this notebook.
        if column in labelled.columns:
            labelled[column] = labelled[column].replace(mapping)
    return labelled

In [14]:
def general_to_cat(df):
    """Replace the numeric codes of `Education` and `StockOptionLevel` with labels.

    A column that is absent from `df` is skipped, and values outside the
    mapping are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the labelled columns.
    """
    labels = {
        'Education': {
            1: 'HighSchool',
            2: 'Associate',
            3: 'Bachelor',
            4: 'Master',
            5: 'PhD',
        },
        'StockOptionLevel': {
            0: 'None',
            1: 'Low',
            2: 'Medium',
            3: 'High',
        },
    }

    # Copy first: this step runs on the general data before any other step
    # has produced a new frame.
    labelled = df.copy()
    for column, mapping in labels.items():
        if column in labelled.columns:
            labelled[column] = labelled[column].replace(mapping)
    return labelled

## Pipeline

In [15]:
# Generic cleaning shared by every table: drop constant columns, impute
# categorical then numeric gaps, and downcast whole-number floats to int.
default_preprocessor = Pipeline([
    ('remove_constants', FunctionTransformer(remove_constant_columns, validate=False)),
    ('fill_categorical_na', FunctionTransformer(fill_categorical_na, validate=False)),
    ('fill_numeric_na', FunctionTransformer(fill_numeric_na, validate=False)),
    ('simplify_numeric_columns', FunctionTransformer(simplify_numeric_columns, validate=False)),
])

# Survey tables: generic cleaning first, then turn the rating codes into labels.
employee_survey_preprocessor = Pipeline([
    ('default_preprocessor', default_preprocessor),
    ('employee_survey_to_cat', FunctionTransformer(employee_survey_to_cat, validate=False)),
])

manager_survey_preprocessor = Pipeline([
    ('default_preprocessor', default_preprocessor),
    ('manager_survey_to_cat', FunctionTransformer(manager_survey_to_cat, validate=False)),
])

# General table: table-specific fixes run before the generic cleaning.
general_preprocessor = Pipeline([
    ('fill_total_working_years', FunctionTransformer(fill_total_working_years, validate=False)),
    ('transform_attrition_to_bool', FunctionTransformer(transform_attrition_to_bool, validate=False)),
    ('general_to_cat', FunctionTransformer(general_to_cat, validate=False)),
    ('default_preprocessor', default_preprocessor),
])

# Clock-in data: reshape/merge the in/out tables, then aggregate per employee.
work_info_preprocessor = Pipeline([
    ('reverse_and_merge', FunctionTransformer(reverse_and_merge, validate=False)),
    ('generate_work_column', FunctionTransformer(generate_work_column, validate=False)),
])

# Data cleaning.
# With validate=False the transformers receive the very object passed in, so
# copies are handed over: the raw frames stay as loaded and this cell gives the
# same result every time it is run.
employee_survey_data = employee_survey_preprocessor.fit_transform(employee_survey_df.copy())
manager_survey_data = manager_survey_preprocessor.fit_transform(manager_survey_df.copy())
general_data = general_preprocessor.fit_transform(general_df.copy())
# reverse_and_merge ignores its argument and reads in_time_df / out_time_df,
# so an empty frame is passed as an explicit placeholder.
work_info = work_info_preprocessor.fit_transform(pd.DataFrame())

## Merge

In [16]:
# Join the four cleaned tables on the employee identifier (default inner joins).
clean_data = (
    general_data
    .merge(employee_survey_data, on='EmployeeID')
    .merge(manager_survey_data, on='EmployeeID')
    .merge(work_info, on='EmployeeID')
)

In [17]:
# Labels of the numeric columns of the merged table.
clean_data_num = clean_data.select_dtypes(include='number').columns

In [18]:
# Pairwise scatter plots of the numeric features; the identifier is left out.
sns.pairplot(data=clean_data.loc[:, clean_data_num].drop(columns='EmployeeID'))
plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn on a wide figure.
plt.figure(figsize=(20, 10))
sns.heatmap(data=clean_data.loc[:, clean_data_num].corr(), cmap='coolwarm')

plt.show()

In [20]:
# Categorical (object dtype) features. The target is excluded from the one-hot
# encoding; errors='ignore' keeps this cell working when 'Attrition' is not
# among the object columns (e.g. when it is stored with a real bool dtype).
cat_columns = clean_data.select_dtypes(include=[object]).drop(columns=['Attrition'], errors='ignore')

# One indicator column (as float) per category value.
clean_data_cat = pd.get_dummies(cat_columns, dtype=float)

In [21]:
scaler = StandardScaler()

# Recompute the numeric column labels from clean_data instead of reading them
# from clean_data_num: that name is rebound to the scaled DataFrame below, so
# indexing with it would fail the second time this cell is run.
numeric_columns = clean_data.select_dtypes(include=['number']).columns

# Keep clean_data's index so the later column-wise concat aligns row by row.
# NOTE(review): EmployeeID is numeric, so it is scaled and kept as a feature
# here (it is only dropped for the pairplot) — confirm this is intended.
clean_data_num = pd.DataFrame(
    scaler.fit_transform(clean_data[numeric_columns]),
    columns=numeric_columns,
    index=clean_data.index,
)

In [27]:
# Side-by-side assembly: scaled numeric features, one-hot features, then the target.
conact_data = pd.concat(
    [clean_data_num, clean_data_cat, clean_data['Attrition']],
    axis='columns',
)

### Gestion de la proportion des données

In [None]:
# Fixed seed so the down-sampled subset (and therefore the exported CSV) is
# identical on every run of the notebook.
BALANCE_SEED = 42

# Split the rows by target value.
df_true = conact_data[conact_data['Attrition'] == True]
df_false = conact_data[conact_data['Attrition'] == False]

# Size of the smaller class: both classes are down-sampled to it.
min_count = min(len(df_true), len(df_false))

df_true_sampled = df_true.sample(n=min_count, random_state=BALANCE_SEED)
df_false_sampled = df_false.sample(n=min_count, random_state=BALANCE_SEED)

# Balanced dataset; the last expression displays the resulting class counts.
final_data = pd.concat([df_true_sampled, df_false_sampled])
final_data['Attrition'].value_counts()

# Analyse

In [23]:
# Export the cleaned data.
# Note: the cell in the "Export" section writes this same file again.
final_data_path = os.path.join(extract_path, 'final_data.csv')
final_data.to_csv(final_data_path, index=False)

# Export

In [24]:
# Export the cleaned, balanced dataset as CSV in the data directory (row index not written).
final_data.to_csv(
    path_or_buf=os.path.join(extract_path, 'final_data.csv'),
    index=False,
)