In [2]:
import numpy as np
import pandas as pd
import os
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import scipy.stats as stats
from scipy.stats import chi2_contingency
from scipy.stats import ttest_ind
from scipy.stats import shapiro, kstest, poisson, levene, bartlett, mannwhitneyu
from scipy.stats import ttest_ind, spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from IPython.display import display, Markdown, Image
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
pd.set_option('display.max_columns', None)
from colores_team_1 import paleta_team_1, get_gradient_cmap, apply_team_palette
from matplotlib.colors import LinearSegmentedColormap

In [3]:
df_talento = pd.read_csv('df_talento_limpio.csv' , sep=',')

In [4]:
df_talento.head(5)


Unnamed: 0,age,attrition,business_travel,department,distance_from_home,education,education_field,employee_number,environment_satisfaction,gender,job_involvement,job_level,job_role,job_satisfaction,marital_status,monthly_income,num_companies_worked,overtime,percent_salary_hike,performance_rating,relationship_satisfaction,stock_option_level,total_working_years,training_times_last_year,work_life_balance,years_at_company,years_since_last_promotion,years_with_curr_manager,date_birth,remote_work
0,51,no,unknown,research_&_development,6,3,unknown,162,1,male,3,5,research director,3,unknown,19537.0,7,No,13,3,3,0,28,5,3,20,15,15,1972,yes
1,52,no,unknown,research_&_development,1,4,life_sciences,259,3,male,2,5,manager,3,unknown,19999.0,0,unknown,14,3,1,1,34,5,3,33,11,9,1971,yes
2,42,no,travel_rarely,research_&_development,4,2,technical_degree,319,3,male,3,5,manager,4,married,19232.0,1,No,11,3,4,0,22,3,3,22,11,15,1981,yes
3,48,no,unknown,research_&_development,22,3,medical,1900,4,female,3,4,manager,4,unknown,17174.0,3,No,11,3,2,1,22,3,3,22,4,7,1975,yes
4,59,no,unknown,sales,25,3,life_sciences,81,1,female,3,3,sales executive,1,unknown,10827.461567,7,unknown,11,3,4,0,28,3,2,21,7,9,1964,yes


In [None]:
## Estrategia 1 : Crear planes de carrera para retener el talento joven con más formaciones y programas de mentoring

## Strategy 1 : Create career plans to retain young talent with more training and mentorship programmes

In [None]:
#Para comprobar si la  distribución de la variable age es nomal aplicamos el test Shapiro-Wilk por ser muy preciso con los muestra pequeñas o medianas como la nuestra.

#To check whether the distribution of the variable age is nominal, we apply the Shapiro-Wilk test because it is very accurate with small or medium-sized samples like ours.

stat_no, p_no = shapiro(df_talento[df_talento['attrition'] == 'no']['age'])
stat_yes, p_yes = shapiro(df_talento[df_talento['attrition'] == 'yes']['age'])

print(f"Shapiro - NO Attrition: p-value = {p_no}")
print(f"Shapiro - YES Attrition: p-value = {p_yes}")


Shapiro - NO Attrition: p-value = 6.036408762559986e-13
Shapiro - YES Attrition: p-value = 3.62741239914311e-08


In [None]:
#Como el valor de p < 0.05 → los datos no son normales. Con este resultado por lo que usamos una prueba no parametrica comparando los dos grupos con el test de Mann-Whitney U.
# H0: La distribución de la edad es igual o mayor en el grupo "YES Attrition" comparado con el grupo "NO Attrition".
# H1: La edad en el grupo "YES Attrition" es menor que en el grupo "NO Attrition".

#As the p-value < 0.05 → the data are not normal. With this result we therefore use a non-parametric test comparing the two groups with the Mann-Whitney U test.
# H0: The age distribution is equal or greater in the ‘YES Attrition’ group compared to the ‘NO Attrition’ group.
# H1: The age in the ‘YES Attrition’ group is lower than in the ‘NO Attrition’ group.

u_stat, p_mw = mannwhitneyu(
    df_talento[df_talento['attrition'] == 'yes']['age'],
    df_talento[df_talento['attrition'] == 'no']['age'],
    alternative='less')

print(f"Mann-Whitney U test p-value: {p_mw}")


Mann-Whitney U test p-value: 1.3635874165787773e-11


p ≤ 0.05 → Rechazas la Hipótesis Nula (H0). Hay una evidencia estadistica muy fuerte para poder afirmar que la edad de las personas que se fueron (yes) es significativamente menor que la edad de quienes se quedaron (no)

p ≤ 0.05 → You reject the Null Hypothesis (H0). There is very strong statistical evidence that the age of those who left (yes) is significantly lower than the age of those who stayed (no).

In [None]:
#Ahora revisamos si la distribucion de la variable training_times_last_year es normal o no en el grupo de empleados que se fueron y en el grupo de empleados que se quedaron.

#Now we check whether the distribution of the variable training_times_last_year is normal or not in the group of employees who left and in the group of employees who stayed.

stat_no_train, p_no_train = shapiro(df_talento[df_talento['attrition'] == 'no']['training_times_last_year'])
stat_yes_train, p_yes_train = shapiro(df_talento[df_talento['attrition'] == 'yes']['training_times_last_year'])

print(f"Shapiro - NO Attrition Training: p-value = {p_no_train}")
print(f"Shapiro - YES Attrition Training: p-value = {p_yes_train}")


Shapiro - NO Attrition Training: p-value = 5.97734353202577e-29
Shapiro - YES Attrition Training: p-value = 2.1188000107043584e-11


In [None]:
#Como  en la prueba anterior el valor de p < 0.05 → los datos no son normales, por lo que pasamos a realizar el test de Mann-Whitney U.
#H0: La cantidad de formaciones recibidas por el grupo "YES Attrition" es igual o mayor que la del grupo "NO Attrition"
#H1: La cantidad de formaciones en el grupo "YES Attrition" es menor que en el grupo "NO Attrition" 

#As in the previous test the p-value < 0.05 → the data are not normal, so we proceed to perform the Mann-Whitney U test.
#H0: The amount of training received by the ‘YES Attrition’ group is equal to or greater than that of the ‘NO Attrition’ group.
#H1: The amount of training in the ‘YES Attrition’ group is lower than in the ‘NO Attrition’ group.

u_stat_train, p_mw_train = mannwhitneyu(
    df_talento[df_talento['attrition'] == 'yes']['training_times_last_year'],
    df_talento[df_talento['attrition'] == 'no']['training_times_last_year'],
    alternative='less')

print(f"Mann-Whitney U test (Training) p-value: {p_mw_train}")


Mann-Whitney U test (Training) p-value: 0.038303295034666146


El valor de p ≤ 0.05 → Rechazas la Hipótesis Nula (H0). Tenemos evidencia estadistica para defender que las personas que se fueron de la empresa (yes)recibieron menos formacion que los empleados que se quedaron (no)

The p-value ≤ 0.05 → You reject the Null Hypothesis (H0). We have statistical evidence to defend that people who left the company (yes) have received less training than employees who stayed (no).

In [None]:
## Estrategia 2 : Los que viven más lejor son más propensos a la rotación: ofrecer bonos transporte, incentivar teletrabajo.

## Strategy 2: Those who live further away are more inclined to rotate: offer transport bonus, encourage home-working.

In [None]:
#De nuevo ejecutamos el test de Shapiro-Wilk para detectar si los datos siguen una distribución normal, pues más preciso que otros test como el de Kolmogorov.

#Again, we run the Shapiro-Wilk test to detect whether the data follow a normal distribution, as it is more accurate than other tests such as the Kolmogorov test.

stat_no, p_no = shapiro(df_talento[df_talento['attrition'] == 'no']['distance_from_home'])
stat_yes, p_yes = shapiro(df_talento[df_talento['attrition'] == 'yes']['distance_from_home'])

print(f"Shapiro - NO Attrition: p-value = {p_no}")
print(f"Shapiro - YES Attrition: p-value = {p_yes}")


Shapiro - NO Attrition: p-value = 2.1811995997111563e-33
Shapiro - YES Attrition: p-value = 1.1323437492395478e-11


In [None]:
#no es una distribucion normal por lo que probamos con Mann-Whitney, las hipotesis son:
#H0: La distancia al trabajo en el grupo "YES Attrition" es igual o menor que en el grupo "NO Attrition".
#H1: La distancia al trabajo en el grupo "YES Attrition" es mayor que en el grupo "NO Attrition".

#is not a normal distribution so we test with Mann-Whitney, the hypotheses are:
#H0: The distance to work in the ‘YES Attrition’ group is equal to or less than in the ‘NO Attrition’ group.
#H1: The distance to work in the YES Attrition group is greater than in the NO Attrition group.

u_stat, p_mw = mannwhitneyu(
    df_talento[df_talento['attrition'] == 'yes']['distance_from_home'],
    df_talento[df_talento['attrition'] == 'no']['distance_from_home'],
    alternative='greater')

print(f"Mann-Whitney U test (Distance) p-value: {p_mw}")


Mann-Whitney U test (Distance) p-value: 0.02841495045159129


El valor de p ≤ 0.05 → Rechazas la Hipótesis Nula (H0). Demostramos estadisticamente que la distancia desde casa al trabajo de los empleados que se fueron es mayor que la que se quedaron.

The p-value ≤ 0.05 → Rejects the Null Hypothesis (H0). We show statistically that the distance from home to work of employees who left is greater than that of those who stayed.

In [None]:
## Estrategia 3: Revisar incentivos en los puestos criticos de Sales representative 
#### Hipótesis a validar:
#H0: No hay diferencia en la tasa de rotación entre Sales Representatives y otros puestos.
#H1: Sales Representatives tienen una mayor tasa de rotación.

## Strategy 3: Review incentives in key Sales representative positions 
#### Hypothesis to be validated:
#H0: There is no difference in turnover rate between Sales Representatives and other positions.
#H1: Sales Representatives have a higher turnover rate.

In [None]:
#Debemos crear una variable binaria que necesitamos para poder separar claramente los dos grupos a comparar.

#We need to create a binary variable that we need to be able to clearly separate the two groups to be compared.

df_talento['is_sales'] = df_talento['job_role'].apply(lambda x: 1 if x.lower() == 'sales representative' else 0)

In [None]:
#aqui no podemos usar Mann-Whitney U ni T de Student, puesto que queremos comparar proporciones o asociaciones entre dos variables categoricas.
#creamos una tabla de contingencia para ver cuantos empleados hay en cada combinacion

#Here we cannot use Mann-Whitney U or Student's T, since we want to compare proportions or combinations between two categorical variables.
#we create a contingency table to see how many employees are in each combination.

contingencia = pd.crosstab(df_talento['is_sales'], df_talento['attrition'])

#Y ejecutamos el test para comparar si la distribucion de la rotacion es independiente o no por el hecho de ser Sales Representative.

#And we run the test to compare whether or not the distribution of the rotation is independent of the fact that it is Sales Representative.

chi2, p, dof, expected = chi2_contingency(contingencia)

print(f"Chi2 Test p-value: {p}")
print(contingencia)


Chi2 Test p-value: 4.8157553447255975e-09
attrition    no  yes
is_sales            
0          1218  208
1            51   33


El p-value: 4.8157553447255975e-09 esta muy por debajo de 0.05, 
por lo que rechazamos la hipotesis nula y nos demuestra que si existe una relacion signifivativa entre trabajar como Sales REpresentative en ABC Corporation y la rotacion laboral.

The p-value: 4.8157553447255975e-09 is far below 0.05, 
so we reject the null hypothesis and show that there is a significant relationship between working as a Sales Representative at ABC Corporation and employment turnover.

In [None]:
## Estrategia 4: Ampliar el programa de politicas de participacion (stock_option)
# Hipotesis a comparar:
#H0: No hay diferencia en el nivel de stock option entre quienes se quedan y quienes se van.
#H1: Los empleados que se van tienen menos stock option.

## Strategy 4: Scaling up the participation in stock options programme 
#-Hypotheses to compare:
#H0: There is no difference in the level of stock option between those who stay and those who leave.
#H1: Employees who leave have less stock option.

In [None]:
#Ejecutamos el test de normalidad sobre stock_option_level

#Run the normality test on stock_option_level

stat_no, p_no = shapiro(df_talento[df_talento['attrition'] == 'no']['stock_option_level'])
stat_yes, p_yes = shapiro(df_talento[df_talento['attrition'] == 'yes']['stock_option_level'])

print(f"Shapiro - NO Attrition: p-value = {p_no}")
print(f"Shapiro - YES Attrition: p-value = {p_yes}")


Shapiro - NO Attrition: p-value = 1.4309034137646362e-36
Shapiro - YES Attrition: p-value = 6.261739387733629e-22


In [None]:
#Ninguno de los dos grupos sigue una distribucion normal asi que nos vamos a por Mann-Whitney 

#Neither of the two groups follows a normal distribution so we go for Mann-Whitney U.

u_stat, p_mw = mannwhitneyu(
    df_talento[df_talento['attrition'] == 'yes']['stock_option_level'],
    df_talento[df_talento['attrition'] == 'no']['stock_option_level'],
    alternative='less')

print(f"Mann-Whitney U test (Stock Options) p-value: {p_mw}")


Mann-Whitney U test (Stock Options) p-value: 1.9184993950372736e-11


El valor de p ≤ 0.05 → Rechazas la Hipótesis Nula (H0). Es un resultado muy significativo y confirmamos que los empleados que se van tiene menos stock option que los que se quedan

The p-value ≤ 0.05 → You reject the Null Hypothesis (H0). It is a highly significant result and we confirm that employees who leave have less stock option than those who stay.

In [None]:
## Estrategia 5: limitar las horas extras o diseñar un sistema de carga laboral
## Hipótesis a validar:
# H0: No hay diferencia significativa en el nivel de stock options entre quienes se van y quienes se quedan.
# H1: Los empleados que se van tienen menos stock options.

## Strategy 5: Restrict overtime or devise a working load system
## Hypotheses to be validated:
# H0: There is no significant difference in the level of stock options between leavers and stayers.
# H1: Employees who leave have fewer stock options.

In [None]:
#As they are categorical variables and we would like to measure their relationship, we use chi-square directly as an inferential test.
contingencia = pd.crosstab(df_talento['overtime'], df_talento['attrition'])
chi2, p, dof, expected = chi2_contingency(contingencia)

print(f"Chi2 Test p-value: {p}")
print(contingencia)


Chi2 Test p-value: 7.530084867181445e-12
attrition   no  yes
overtime           
No         569   69
Yes        168   74
unknown    532   98


El valor de p ≤ 0.05 → Se rechaza la Hipótesis Nula (H0). Obtenemos evidencia estadística muy fuerte de que hacer horas extra está relacionado con la rotación

The p-value ≤ 0.05 → The Null Hypothesis (H0) is rejected. We obtain very strong statistical evidence that working overtime is related to turnover.