# Projet IA

## Introduction

L'entreprise de produits pharmaceutiques HumanForYou basée en Inde emploie environ 4000 personnes. Cependant, chaque année elle subit un turn-over d'environ 15% de ses employés nécessitant de retrouver des profils similaires sur le marché de l'emploi.

La direction trouve que ce niveau de turn-over n'est pas bon pour l'entreprise car :

- Les projets sur lesquels étaient les employés quittant la société prennent du retard ce qui nuit à la réputation de l'entreprise auprès de ses clients et partenaires.

- Un service de ressources humaines de taille conséquente doit être conservé car il faut avoir les moyens de trouver les nouvelles recrues.

- Du temps est perdu à l'arrivée des nouveaux employés car ils doivent très souvent être formés et ont besoin de temps pour devenir pleinement opérationnels dans leur nouvel environnement.

***La direction fait donc appel à vous, spécialistes de l'analyse de données, pour déterminer les facteurs ayant le plus d'influence sur ce taux de turn-over et lui proposer des modèles afin d'avoir des pistes d'amélioration pour donner à ses employés l'envie de rester.***




## Gestion de l'exécution et des images

In [1]:
# imports
import numpy as np
import os

from numpy.random import default_rng
# keep results reproducible from one notebook run to the next
# NOTE(review): the name `random` would shadow the stdlib module of the same
# name if that module were ever imported in this notebook.
random=default_rng(42) 

# render figures inline in the notebook
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# where figures are saved
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "workflowDS"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID) # save_fig writes into this folder

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, ``plt.tight_layout()`` is called before saving.
        fig_extension: File extension; also passed to ``savefig`` as the format.
        resolution: Passed to ``savefig`` as ``dpi``.
    """
    # Create the target folder on demand so that savefig does not fail with
    # FileNotFoundError when the images directory has not been made yet.
    os.makedirs(IMAGES_PATH, exist_ok=True)
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Import des CSV

In [2]:
import pandas as pd
import os

# Folder holding each CSV file; all five currently resolve to "data".
EMPLOYEE_SURVEY_DATA_PATH = os.path.join("data")
GENERAL_DATA_PATH = os.path.join("data")
MANAGER_SURVEY_DATA_PATH = os.path.join("data")
IN_TIME_PATH = os.path.join("data")
OUT_TIME_PATH = os.path.join("data")

def employee_survey_data(employee_path=EMPLOYEE_SURVEY_DATA_PATH):
    """Read ``employee_survey_data.csv`` from *employee_path* into a DataFrame."""
    return pd.read_csv(os.path.join(employee_path, "employee_survey_data.csv"))

def general_data(data_path=GENERAL_DATA_PATH):
    """Read ``general_data.csv`` from *data_path* into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, "general_data.csv"))

def manager_survey_data(manager_survey_path=MANAGER_SURVEY_DATA_PATH):
    """Read ``manager_survey_data.csv`` from *manager_survey_path* into a DataFrame."""
    return pd.read_csv(os.path.join(manager_survey_path, "manager_survey_data.csv"))

def out_time_data(out_time_path=OUT_TIME_PATH):
    """Read ``out_time.csv`` from *out_time_path* into a DataFrame."""
    return pd.read_csv(os.path.join(out_time_path, "out_time.csv"))

def in_time_data(in_time_path=IN_TIME_PATH):
    """Read ``in_time.csv`` from *in_time_path* into a DataFrame."""
    return pd.read_csv(os.path.join(in_time_path, "in_time.csv"))

In [3]:
# Load the five CSV files into DataFrames using the default data folders.
employee = employee_survey_data()
general = general_data()
manager = manager_survey_data()
outTime = out_time_data()
inTime = in_time_data()

## Préparation des heures d'arrivée et de sortie

In [4]:
# fillna() returns a new frame, so the raw inTime / outTime tables are left
# untouched; missing punches become 0 in the working copies.
inTimeCpy = inTime.fillna(0)
outTimeCpy = outTime.fillna(0)

# Parse every column from the third one onwards as a timestamp; the first
# two columns are deliberately left as they are.
for frame in (inTimeCpy, outTimeCpy):
    for day in frame.columns[2:]:
        frame[day] = pd.to_datetime(frame[day], format='%Y-%m-%d %H:%M:%S')

# Element-wise departure minus arrival: time spent at work per employee/day.
deltaTime = outTimeCpy - inTimeCpy

deltaTime.head()

Unnamed: 0.1,Unnamed: 0,2015-01-01,2015-01-02,2015-01-05,2015-01-06,2015-01-07,2015-01-08,2015-01-09,2015-01-12,2015-01-13,...,2015-12-18,2015-12-21,2015-12-22,2015-12-23,2015-12-24,2015-12-25,2015-12-28,2015-12-29,2015-12-30,2015-12-31
0,0,0.0,0 days 07:12:30,0 days 07:11:23,0 days 07:24:39,0 days 07:00:24,0 days 07:17:23,0 days 07:29:04,0 days 07:15:46,0 days 07:49:52,...,0 days 00:00:00,0 days 07:20:21,0 days 07:23:45,0 days 06:30:17,0 days 07:35:47,0 days,0 days 07:46:26,0 days 07:18:54,0 days 07:46:44,0 days 07:04:49
1,0,0.0,0 days 08:06:33,0 days 07:27:17,0 days 00:00:00,0 days 07:23:49,0 days 07:25:00,0 days 07:09:03,0 days 07:36:41,0 days 07:16:44,...,0 days 07:54:11,0 days 07:45:14,0 days 07:42:44,0 days 07:26:08,0 days 00:00:00,0 days,0 days 07:36:53,0 days 07:58:57,0 days 07:59:10,0 days 08:13:38
2,0,0.0,0 days 06:41:33,0 days 07:15:56,0 days 06:24:19,0 days 06:45:54,0 days 07:20:42,0 days 06:51:41,0 days 07:25:07,0 days 06:59:59,...,0 days 06:47:09,0 days 07:09:49,0 days 06:48:06,0 days 06:43:49,0 days 06:50:59,0 days,0 days 07:01:26,0 days 07:26:20,0 days 07:32:20,0 days 06:47:11
3,0,0.0,0 days 07:20:18,0 days 07:17:31,0 days 06:56:35,0 days 06:55:10,0 days 06:51:03,0 days 07:11:35,0 days 06:59:55,0 days 07:18:23,...,0 days 07:37:45,0 days 06:50:48,0 days 07:19:35,0 days 07:24:49,0 days 07:05:06,0 days,0 days 07:26:50,0 days 07:25:00,0 days 07:21:59,0 days 07:07:59
4,0,0.0,0 days 08:03:20,0 days 07:59:17,0 days 07:40:57,0 days 07:48:22,0 days 07:39:44,0 days 07:43:18,0 days 08:21:54,0 days 08:15:26,...,0 days 07:54:13,0 days 07:39:54,0 days 07:57:27,0 days 07:47:13,0 days 08:14:58,0 days,0 days 07:39:44,0 days 08:16:07,0 days 07:57:12,0 days 08:01:05


Calcul de la moyenne des temps de travail par employé

In [None]:
# Mean daily working time per employee, in seconds.
#
# Only the columns that really hold durations are used: the ID column and any
# column left as plain numbers by the subtraction are ignored.
durations = deltaTime.select_dtypes(include="timedelta")
seconds = durations.apply(lambda day: day.dt.total_seconds())

# A zero duration marks a day without recorded presence (missing punches were
# filled with 0 before the subtraction), so it is excluded from the average.
# The mean is a true arithmetic mean over the remaining days; an employee
# with no worked day at all gets NaN instead of raising.
mean_seconds = seconds.where(seconds != 0).mean(axis=1)

# Built in one go: DataFrame.append no longer exists in pandas >= 2.0, and
# row-by-row appends are quadratic anyway. Values are aligned by position
# with the first column of inTime, which carries the employee identifier.
meanWorkingTime = pd.DataFrame({
    'EmployeeID': inTime.iloc[:, 0].astype(int).to_numpy(),
    'MeanWorkingTime': mean_seconds.to_numpy(),
})

meanWorkingTime.head()

# Merge de Employee/General/Manager

In [None]:
# Join the employee survey, general data, manager survey and mean working
# time tables one after another on the shared EmployeeID key.
merge = pd.merge(employee, general, on='EmployeeID')
merge2 = pd.merge(merge, manager, on='EmployeeID')
merge3 = pd.merge(merge2, meanWorkingTime, on='EmployeeID')

# workingMerge is the table used by the rest of the notebook.
workingMerge = merge3
merge3.info()


## Visualisation des champs

In [None]:
# Preview the first rows of the merged table.
workingMerge.head()

## Table visualisation Info et histogramme 

In [None]:
# Histogram of each column handled by DataFrame.hist (50 bins per plot)
workingMerge.hist(bins=50, figsize=(20,15))
plt.show()

In [None]:
# Count how often each distinct value occurs in one column
workingMerge["MeanWorkingTime"].value_counts()

# Classification Ascendante Hiérarchique et dendrogramme


In [None]:
from sklearn.preprocessing import OrdinalEncoder
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

# Work on a copy so that workingMerge itself is never modified.
X = workingMerge.copy()

# Drop rows containing NaN
X.dropna(inplace=True)

# Recode the two-valued text columns as 0/1.
# The result is assigned back to X[column] instead of calling
# replace(inplace=True) on the attribute (X.Over18...): that chained in-place
# form is deprecated and leaves X unchanged under pandas Copy-on-Write.
binary_encodings = {
    "Over18": {"Y": 1, "N": 0},
    "Attrition": {"Yes": 1, "No": 0},
    "Gender": {"Male": 1, "Female": 0},
}
for column, mapping in binary_encodings.items():
    # infer_objects() keeps the recoded column numeric on pandas versions
    # where replace() no longer downcasts object columns by itself.
    X[column] = X[column].replace(mapping).infer_objects()

# Show every column when a frame is displayed
pd.set_option('display.max_columns', None)
X.head()

In [None]:
# Display the whole prepared table.
X

## Dendrogramme

In [None]:
# linkage() needs a purely numeric observation matrix, but X still carries
# text columns (the ones later listed in cat_attribs), so only the numeric
# columns are clustered.
# NOTE(review): the features are not scaled here, so columns with large
# values will dominate the Ward distances - confirm this is acceptable.
X_numeric = X.select_dtypes(include="number")
plt.figure(figsize=(10, 7))
dendrogram(linkage(X_numeric, method='ward'), orientation='top', distance_sort='descending', show_leaf_counts=True)
plt.show()

In [None]:
# Two-cluster agglomerative clustering with Ward linkage.
# The `affinity` keyword is omitted: it was deprecated and then removed in
# scikit-learn, and Ward linkage only works with the euclidean metric, which
# is the default. Only numeric columns are passed because X still contains
# text columns that the estimator cannot convert to floats.
numeric_features = X.select_dtypes(include="number")
cluster = AgglomerativeClustering(n_clusters=2, linkage='ward')
cluster.fit_predict(numeric_features)

# Colour each employee by cluster label in the working-time / income plane.
plt.figure(figsize=(10, 7))
plt.scatter(X['MeanWorkingTime'],X['MonthlyIncome'], c=cluster.labels_, cmap='rainbow')
plt.show()

# Création d'un jeu de test
## Jeu de test aléatoire


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state pins the shuffle.
X_full_set, X_test_set = train_test_split(X, test_size=0.2, random_state=42) #SOLUTION
X_test_set.info()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
 
# Numeric branch: fill missing values with the column median, then standardise.
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Every numeric column of X is routed through the numeric branch.
# NOTE(review): this presumably includes EmployeeID and the 0/1-encoded
# Attrition column if they are numeric at this point - confirm that is wanted.
X_num = X.select_dtypes(include=[np.number])
num_attribs = X_num.columns.tolist()

# Text columns are turned into integer codes by OrdinalEncoder.
cat_attribs = ["MaritalStatus", "JobRole", "BusinessTravel", "Department", "EducationField"]

full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_attribs),
    ("cat", OrdinalEncoder(), cat_attribs),
])

# NOTE(review): the pipeline is fitted on the whole of X rather than on
# X_full_set from the split above - confirm this is intended.
X_prepared = full_pipeline.fit_transform(X)
X_prepared