In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

# AB testing
# -----------------------------------------------------------------------
from scipy.stats import ttest_ind
from scipy.stats import levene

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data/recursos-humanos_v1.csv', index_col = 0)

### GESTIÓN DE NULOS

In [6]:
#Copia del df
df_copy = df.copy()

In [7]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 311 entries, 0 to 310
Data columns (total 29 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Employee_Name               164 non-null    object 
 1   EmpID                       311 non-null    float64
 2   GenderID                    311 non-null    float64
 3   EmpStatusID                 311 non-null    float64
 4   DeptID                      311 non-null    float64
 5   Salary                      311 non-null    object 
 6   Termd                       311 non-null    float64
 7   Position                    311 non-null    object 
 8   State                       311 non-null    object 
 9   Zip                         311 non-null    float64
 10  DOB                         311 non-null    object 
 11  MaritalDesc                 311 non-null    object 
 12  CitizenDesc                 263 non-null    object 
 13  HispanicLatino              311 non

In [8]:
def unificar_hispaniclatino(df):
    """Normalise the casing of the ``HispanicLatino`` column in place.

    Every value is lower-cased first; 'yes' is then written back as 'Yes'
    and 'no' as 'No'. Any other lower-cased value is left as it is.

    Args:
        df: DataFrame containing a string ``HispanicLatino`` column.
    """
    etiquetas_canonicas = {'yes': 'Yes', 'no': 'No'}
    en_minusculas = df['HispanicLatino'].str.lower()
    df['HispanicLatino'] = en_minusculas.replace(etiquetas_canonicas)

def imputar_race_desc(df):
    """Fill missing ``RaceDesc`` values with 'Hispanic' for Hispanic/Latino rows.

    A row is updated only when its ``HispanicLatino`` value is exactly 'Yes'
    and its ``RaceDesc`` is null; every other row keeps its current value.
    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame with ``HispanicLatino`` and ``RaceDesc`` columns.
    """
    # Boolean mask instead of a row-wise ``apply``: same outcome without a
    # Python-level loop, and it also behaves on an empty frame.
    hispano_sin_raza = df['HispanicLatino'].eq('Yes') & df['RaceDesc'].isna()
    df['RaceDesc'] = df['RaceDesc'].mask(hispano_sin_raza, 'Hispanic')

def imputar_race_desc_unknown(df):
    """Label every remaining null in ``RaceDesc`` as 'Unknown' (in place).

    Args:
        df: DataFrame containing a ``RaceDesc`` column.
    """
    # Assign the result back instead of ``df[col].fillna(..., inplace=True)``:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df['RaceDesc'] = df['RaceDesc'].fillna('Unknown')

def nulos_999(df, columns, value):
    """Replace the nulls of several columns with a fixed value (in place).

    Args:
        df: DataFrame to modify.
        columns: Iterable of column names; a single column name given as a
            string is also accepted.
        value: Value written wherever a listed column is null.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    for column in columns:
        # Assign back rather than ``fillna(inplace=True)`` on a selected
        # column: the chained in-place form is deprecated and does not update
        # ``df`` when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(value)

def mediana_nulos(df, column):
    """Fill the nulls of a numeric column with that column's median (in place).

    Args:
        df: DataFrame to modify.
        column: Name of the numeric column to impute.
    """
    mediana_value = df[column].median()
    # Assign back rather than ``fillna(inplace=True)`` on a selected column:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df[column] = df[column].fillna(mediana_value)

def moda_nulos(df, column):
    """Fill the nulls of a column with its most frequent value (in place).

    When several values tie, the first one returned by ``Series.mode`` is
    used. A column with no non-null values has no mode and is left untouched.

    Args:
        df: DataFrame to modify.
        column: Name of the column to impute.
    """
    modas = df[column].mode()
    if modas.empty:
        # All values are null: there is nothing to impute with.
        return
    # Assign back rather than ``fillna(inplace=True)`` on a selected column:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df[column] = df[column].fillna(modas.iloc[0])

# RaceDesc: normalise HispanicLatino first, because the imputation that
# follows compares that column against the exact label 'Yes'.
unificar_hispaniclatino(df_copy)
imputar_race_desc(df_copy)
imputar_race_desc_unknown(df_copy)

# Sentinel fill (999) for the columns listed below
columns_to_fill_999 = ['DaysLateLast30', 'Zip']
nulos_999(df_copy, columns_to_fill_999, 999)

# Median fill
mediana_nulos(df_copy, 'SpecialProjectsCount')

# Mode fill for the categorical columns
for columna_categorica in ('CitizenDesc', 'RecruitmentSource'):
    moda_nulos(df_copy, columna_categorica)


In [9]:
df.isnull().sum()

Employee_Name                 147
EmpID                           0
GenderID                        0
EmpStatusID                     0
DeptID                          0
Salary                          0
Termd                           0
Position                        0
State                           0
Zip                             0
DOB                             0
MaritalDesc                     0
CitizenDesc                    48
HispanicLatino                  0
RaceDesc                       84
DateofHire                     83
DateofTermination             209
TermReason                     60
EmploymentStatus                0
ManagerName                    91
ManagerID                       8
RecruitmentSource              45
PerformanceScore                0
EngagementSurvey                0
EmpSatisfaction                 0
SpecialProjectsCount          113
LastPerformanceReview_Date     68
DaysLateLast30                120
Absences                        0
dtype: int64

In [10]:
df.isnull().sum()/df.shape[0]*100

Employee_Name                 47.266881
EmpID                          0.000000
GenderID                       0.000000
EmpStatusID                    0.000000
DeptID                         0.000000
Salary                         0.000000
Termd                          0.000000
Position                       0.000000
State                          0.000000
Zip                            0.000000
DOB                            0.000000
MaritalDesc                    0.000000
CitizenDesc                   15.434084
HispanicLatino                 0.000000
RaceDesc                      27.009646
DateofHire                    26.688103
DateofTermination             67.202572
TermReason                    19.292605
EmploymentStatus               0.000000
ManagerName                   29.260450
ManagerID                      2.572347
RecruitmentSource             14.469453
PerformanceScore               0.000000
EngagementSurvey               0.000000
EmpSatisfaction                0.000000


### COLUMNA RACEDESC
Es una columna de tipo object, por lo que no se pueden aplicar directamente
KNNImputer ni IterativeImputer. Se hizo la prueba de codificarla como numérica,
pero los valores imputados no tienen sentido al volver a convertirlos en
categorías. En su lugar, se usa la columna HispanicLatino para rellenar algunos
nulos y al resto se le asigna la etiqueta Unknown.

In [11]:
df['RaceDesc'].describe()

count       227
unique        6
top       White
freq        135
Name: RaceDesc, dtype: object

In [12]:
def unificar_hispaniclatino(df):
    """Normalise the casing of the ``HispanicLatino`` column in place.

    Every value is lower-cased first; 'yes' is then written back as 'Yes'
    and 'no' as 'No'. Any other lower-cased value is left as it is.

    Args:
        df: DataFrame containing a string ``HispanicLatino`` column.
    """
    etiquetas_canonicas = {'yes': 'Yes', 'no': 'No'}
    en_minusculas = df['HispanicLatino'].str.lower()
    df['HispanicLatino'] = en_minusculas.replace(etiquetas_canonicas)

# Llamada
unificar_hispaniclatino(df_copy)

In [13]:
def imputar_race_desc(df):
    """Fill missing ``RaceDesc`` values with 'Hispanic' for Hispanic/Latino rows.

    A row is updated only when its ``HispanicLatino`` value is exactly 'Yes'
    and its ``RaceDesc`` is null; every other row keeps its current value.
    The frame is modified in place and nothing is returned.

    Args:
        df: DataFrame with ``HispanicLatino`` and ``RaceDesc`` columns.
    """
    # Boolean mask instead of a row-wise ``apply``: same outcome without a
    # Python-level loop, and it also behaves on an empty frame.
    hispano_sin_raza = df['HispanicLatino'].eq('Yes') & df['RaceDesc'].isna()
    df['RaceDesc'] = df['RaceDesc'].mask(hispano_sin_raza, 'Hispanic')

# Llamada
imputar_race_desc(df_copy)

In [14]:
df_copy.isnull().sum()

Employee_Name                 147
EmpID                           0
GenderID                        0
EmpStatusID                     0
DeptID                          0
Salary                          0
Termd                           0
Position                        0
State                           0
Zip                             0
DOB                             0
MaritalDesc                     0
CitizenDesc                     0
HispanicLatino                  0
RaceDesc                        0
DateofHire                     83
DateofTermination             209
TermReason                     60
EmploymentStatus                0
ManagerName                    91
ManagerID                       8
RecruitmentSource               0
PerformanceScore                0
EngagementSurvey                0
EmpSatisfaction                 0
SpecialProjectsCount            0
LastPerformanceReview_Date     68
DaysLateLast30                  0
Absences                        0
dtype: int64

In [15]:
df_copy.isnull().sum()/df_copy.shape[0]*100

Employee_Name                 47.266881
EmpID                          0.000000
GenderID                       0.000000
EmpStatusID                    0.000000
DeptID                         0.000000
Salary                         0.000000
Termd                          0.000000
Position                       0.000000
State                          0.000000
Zip                            0.000000
DOB                            0.000000
MaritalDesc                    0.000000
CitizenDesc                    0.000000
HispanicLatino                 0.000000
RaceDesc                       0.000000
DateofHire                    26.688103
DateofTermination             67.202572
TermReason                    19.292605
EmploymentStatus               0.000000
ManagerName                   29.260450
ManagerID                      2.572347
RecruitmentSource              0.000000
PerformanceScore               0.000000
EngagementSurvey               0.000000
EmpSatisfaction                0.000000


In [16]:
def imputar_race_desc_unknown(df):
    """Label every remaining null in ``RaceDesc`` as 'Unknown' (in place).

    Args:
        df: DataFrame containing a ``RaceDesc`` column.
    """
    # Assign the result back instead of ``df[col].fillna(..., inplace=True)``:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df['RaceDesc'] = df['RaceDesc'].fillna('Unknown')

# Llamada
imputar_race_desc_unknown(df_copy)

In [17]:
df_copy['RaceDesc'].isnull().sum()

0

### COLUMNAS DAYSLATELAST30 & ZIP
Fillna con 999

In [18]:
#le meto un 999 a los nulos
def nulos_999(df, columns, value):
    """Replace the nulls of several columns with a fixed value (in place).

    Args:
        df: DataFrame to modify.
        columns: Iterable of column names; a single column name given as a
            string is also accepted.
        value: Value written wherever a listed column is null.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(columns, str):
        columns = [columns]
    for column in columns:
        # Assign back rather than ``fillna(inplace=True)`` on a selected
        # column: the chained in-place form is deprecated and does not update
        # ``df`` when pandas Copy-on-Write is active.
        df[column] = df[column].fillna(value)

# Llamada
columns_to_fill = ['DaysLateLast30', 'Zip']
nulos_999(df_copy, columns_to_fill, 999)


In [19]:
df_copy[['DaysLateLast30', 'Zip']].isnull().sum()

DaysLateLast30    0
Zip               0
dtype: int64

### COLUMNA SPECIALPROJECTSCOUNT
Fillna mediana

In [20]:
def mediana_nulos(df, column):
    """Fill the nulls of a numeric column with that column's median (in place).

    Args:
        df: DataFrame to modify.
        column: Name of the numeric column to impute.
    """
    mediana_value = df[column].median()
    # Assign back rather than ``fillna(inplace=True)`` on a selected column:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df[column] = df[column].fillna(mediana_value)

# Llamada
mediana_nulos(df_copy, 'SpecialProjectsCount')


In [21]:
df_copy['SpecialProjectsCount'].isnull().sum()

0

### COLUMNAS CITIZENDESC & RECRUITMENTSOURCE
Fillna con la moda

In [22]:
def moda_nulos(df, column):
    """Fill the nulls of a column with its most frequent value (in place).

    When several values tie, the first one returned by ``Series.mode`` is
    used. A column with no non-null values has no mode and is left untouched.

    Args:
        df: DataFrame to modify.
        column: Name of the column to impute.
    """
    modas = df[column].mode()
    if modas.empty:
        # All values are null: there is nothing to impute with.
        return
    # Assign back rather than ``fillna(inplace=True)`` on a selected column:
    # the chained in-place form is deprecated and does not update ``df`` when
    # pandas Copy-on-Write is active.
    df[column] = df[column].fillna(modas.iloc[0])

# Llamada
moda_nulos(df_copy, 'CitizenDesc')

# Llamada
moda_nulos(df_copy, 'RecruitmentSource')


### COLUMNA RACEDESC
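Método KNN e Iterative pasando las columnas usadas a numéricas.
(Experimento descartado, no usar: los valores imputados son códigos decimales que no corresponden a ninguna categoría.)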

In [23]:
# Categorical columns to turn into integer codes
columnas_a_convertir = ['CitizenDesc', 'HispanicLatino', 'RaceDesc']

# One fitted encoder per source column. Refitting a single shared encoder
# would keep only the mapping of the last column, so the codes of the others
# could not be inspected or inverted afterwards.
codificadores = {}

# Add a '<column>_numerica' column with the integer codes of each source column
for columna in columnas_a_convertir:
    nueva_columna = f'{columna}_numerica'
    # ``label_encoder`` is rebound on every pass; after the loop it refers to
    # the encoder fitted on the last column, as it did before.
    label_encoder = LabelEncoder()
    df_copy[nueva_columna] = label_encoder.fit_transform(df_copy[columna])
    codificadores[columna] = label_encoder

In [24]:
df_copy[['CitizenDesc_numerica','CitizenDesc']].sample(8)

Unnamed: 0,CitizenDesc_numerica,CitizenDesc
251,2,US Citizen
304,2,US Citizen
237,2,US Citizen
293,2,US Citizen
131,2,US Citizen
245,2,US Citizen
310,2,US Citizen
57,2,US Citizen


In [13]:
# Turn the codes of rows without a real RaceDesc label back into NaN so the
# imputers treat them as missing.
# The mask is derived from the source column instead of the hard-coded code 6:
# that number only stands for "missing" when NaN happened to be encoded as the
# seventh class, and it points at a real category once nulls have been
# replaced by the 'Unknown' placeholder.
columnas_numericas = ['RaceDesc_numerica']
sin_raza_real = df_copy['RaceDesc'].isna() | df_copy['RaceDesc'].eq('Unknown')
for columna_numerica in columnas_numericas:
    df_copy[columna_numerica] = df_copy[columna_numerica].mask(sin_raza_real)

In [14]:
# Turn the codes of rows whose CitizenDesc is null back into NaN.
# The mask is derived from the source column instead of the hard-coded code 3,
# which only stands for "missing" while the column has exactly three real
# categories and NaN was encoded as the fourth class.
columnas_numericas = ['CitizenDesc_numerica']
sin_ciudadania = df_copy['CitizenDesc'].isna()
for columna_numerica in columnas_numericas:
    df_copy[columna_numerica] = df_copy[columna_numerica].mask(sin_ciudadania)

In [15]:
df_copy.columns

Index(['Employee_Name', 'EmpID', 'GenderID', 'EmpStatusID', 'DeptID', 'Salary',
       'Termd', 'Position', 'State', 'Zip', 'DOB', 'MaritalDesc',
       'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire',
       'DateofTermination', 'TermReason', 'EmploymentStatus', 'ManagerName',
       'ManagerID', 'RecruitmentSource', 'PerformanceScore',
       'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
       'LastPerformanceReview_Date', 'DaysLateLast30', 'Absences',
       'CitizenDesc_numerica', 'HispanicLatino_numerica', 'RaceDesc_numerica'],
      dtype='object')

In [16]:
df_copy[['CitizenDesc_numerica', 'HispanicLatino_numerica', 'RaceDesc_numerica']].describe()

Unnamed: 0,CitizenDesc_numerica,HispanicLatino_numerica,RaceDesc_numerica
count,263.0,311.0,227.0
mean,1.912548,0.090032,3.762115
std,0.395531,0.286689,1.595284
min,0.0,0.0,0.0
25%,2.0,0.0,2.0
50%,2.0,0.0,5.0
75%,2.0,0.0,5.0
max,2.0,1.0,5.0


In [19]:
# Predictor columns used by both imputers
columnas_imputacion = ['HispanicLatino_numerica', 'CitizenDesc_numerica']
# Predictors plus the target; the target goes last so it can be read back
# from the final column of each imputer's output array.
columnas_modelo = columnas_imputacion + ['RaceDesc_numerica']

# KNN imputation
imputer_knn = KNNImputer(n_neighbors=5)
df_knn = imputer_knn.fit_transform(df_copy[columnas_modelo])
df_copy['RaceDesc_numerica_knn'] = df_knn[:, -1]

# Iterative imputation (seeded so the result is reproducible).
# NOTE(review): both imputers return decimal values for the imputed rows, so
# the result is not a valid category code without further mapping.
imputer_iterative = IterativeImputer(max_iter=20, random_state=42)
df_iterative = imputer_iterative.fit_transform(df_copy[columnas_modelo])
df_copy['RaceDesc_numerica_iterative'] = df_iterative[:, -1]


In [20]:
df_copy.columns

Index(['Employee_Name', 'EmpID', 'GenderID', 'EmpStatusID', 'DeptID', 'Salary',
       'Termd', 'Position', 'State', 'Zip', 'DOB', 'MaritalDesc',
       'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire',
       'DateofTermination', 'TermReason', 'EmploymentStatus', 'ManagerName',
       'ManagerID', 'RecruitmentSource', 'PerformanceScore',
       'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
       'LastPerformanceReview_Date', 'DaysLateLast30', 'Absences',
       'CitizenDesc_numerica', 'HispanicLatino_numerica', 'RaceDesc_numerica',
       'RaceDesc_numerica_knn', 'RaceDesc_numerica_iterative'],
      dtype='object')

In [21]:
df_copy[['RaceDesc_numerica','RaceDesc_numerica_knn', 'RaceDesc_numerica_iterative']].describe()

Unnamed: 0,RaceDesc_numerica,RaceDesc_numerica_knn,RaceDesc_numerica_iterative
count,227.0,311.0,311.0
mean,3.762115,3.735691,3.777306
std,1.595284,1.370182,1.36508
min,0.0,0.0,0.0
25%,2.0,2.0,2.0
50%,5.0,3.6,3.839055
75%,5.0,5.0,5.0
max,5.0,5.0,5.0


In [22]:
# Helper columns to drop; 'RaceDesc_numerica_iterative' is not listed, so it is kept.
# NOTE(review): the original comment also mentioned renaming the kept column,
# but no rename is performed in this cell.
lista_copy_nulos = ['CitizenDesc_numerica', 'HispanicLatino_numerica', 'RaceDesc_numerica',
       'RaceDesc_numerica_knn']

def eliminar_columna(df, eliminar):
    """Drop one or more columns from ``df`` in place.

    Args:
        df: DataFrame to modify.
        eliminar: Column label or list of column labels to remove.

    Returns:
        None. The frame is modified in place.

    Raises:
        KeyError: If any requested label is not a column of ``df``.
    """
    df.drop(columns=eliminar, inplace=True)

eliminar_columna(df_copy, lista_copy_nulos)

In [23]:
df_copy.columns

Index(['Employee_Name', 'EmpID', 'GenderID', 'EmpStatusID', 'DeptID', 'Salary',
       'Termd', 'Position', 'State', 'Zip', 'DOB', 'MaritalDesc',
       'CitizenDesc', 'HispanicLatino', 'RaceDesc', 'DateofHire',
       'DateofTermination', 'TermReason', 'EmploymentStatus', 'ManagerName',
       'ManagerID', 'RecruitmentSource', 'PerformanceScore',
       'EngagementSurvey', 'EmpSatisfaction', 'SpecialProjectsCount',
       'LastPerformanceReview_Date', 'DaysLateLast30', 'Absences',
       'RaceDesc_numerica_iterative'],
      dtype='object')

In [46]:
df_copy[['RaceDesc', 'RaceDesc_numerica_iterative']].sample(6)

Unnamed: 0,RaceDesc,RaceDesc_numerica_iterative
288,Two or more races,4.0
250,Black or African American,2.0
221,Asian,1.0
91,White,5.0
32,Black or African American,2.0
220,Black or African American,2.0


In [32]:
df_copy[['RaceDesc', 'RaceDesc_numerica_iterative']].value_counts()

RaceDesc                          RaceDesc_numerica_iterative
White                             5.0                            135
Black or African American         2.0                             60
Asian                             1.0                             20
Two or more races                 4.0                              9
American Indian or Alaska Native  0.0                              2
Hispanic                          3.0                              1
dtype: int64

In [33]:
df_copy['RaceDesc_numerica_iterative'].value_counts()

5.000000    135
3.839055     68
2.000000     60
1.000000     20
4.000000      9
3.770342      8
3.914071      4
3.847687      3
0.000000      2
3.000000      1
2.324216      1
Name: RaceDesc_numerica_iterative, dtype: int64