<a href="https://colab.research.google.com/github/AnIsAsPe/Estadistica_y_Probabilidad_para-CD-/blob/main/PrincipiosDeProbabilidad/Semana4/Notebooks/ChiCuadrada_para_selecci%C3%B3n_de_caracter%C3%ADsticas_categoricas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Bibliotecas y funciones

In [90]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

In [91]:
def describe_datos(df):
  unicos =[]
  for col in df:
    unicos.append(df[col].unique())
  unicos = pd.Series(unicos, index=df.columns)
  descripcion = pd.concat([df.dtypes, len(df)-df.isna().sum(),df.nunique(),unicos], axis=1)
  descripcion.columns = ['dtypes', 'not-null','nunique','unique' ]
  return(descripcion)

# Lectura y exploración de datos

In [92]:
file = 'https://raw.githubusercontent.com/nelson-wu/employee-attrition-ml/refs/heads/master/WA_Fn-UseC_-HR-Employee-Attrition.csv'
HR_data = pd.read_csv(file)
HR_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

Revisamos por separado las variables numericas y categoricas

In [93]:
num_data = HR_data.select_dtypes(include=[np.number])
describe_datos(num_data)

Unnamed: 0,dtypes,not-null,nunique,unique
Age,int64,1470,43,"[41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2..."
DailyRate,int64,1470,886,"[1102, 279, 1373, 1392, 591, 1005, 1324, 1358,..."
DistanceFromHome,int64,1470,29,"[1, 8, 2, 3, 24, 23, 27, 16, 15, 26, 19, 21, 5..."
Education,int64,1470,5,"[2, 1, 4, 3, 5]"
EmployeeCount,int64,1470,1,[1]
EmployeeNumber,int64,1470,1470,"[1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 14, 15, 16,..."
EnvironmentSatisfaction,int64,1470,4,"[2, 3, 4, 1]"
HourlyRate,int64,1470,71,"[94, 61, 92, 56, 40, 79, 81, 67, 44, 84, 49, 3..."
JobInvolvement,int64,1470,4,"[3, 2, 4, 1]"
JobLevel,int64,1470,5,"[2, 1, 3, 4, 5]"


In [94]:
cat_data = HR_data.select_dtypes(include=[object])
describe_datos(cat_data)

Unnamed: 0,dtypes,not-null,nunique,unique
Attrition,object,1470,2,"[Yes, No]"
BusinessTravel,object,1470,3,"[Travel_Rarely, Travel_Frequently, Non-Travel]"
Department,object,1470,3,"[Sales, Research & Development, Human Resources]"
EducationField,object,1470,6,"[Life Sciences, Other, Medical, Marketing, Tec..."
Gender,object,1470,2,"[Female, Male]"
JobRole,object,1470,9,"[Sales Executive, Research Scientist, Laborato..."
MaritalStatus,object,1470,3,"[Single, Married, Divorced]"
Over18,object,1470,1,[Y]
OverTime,object,1470,2,"[Yes, No]"


# Selección de variables categoricas

In [95]:
cat_data = cat_data.drop('Over18',axis=1)
cat_data.head(3)

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes


## Preprocesamiento (one-hot-encoding)

In [96]:
# Para variables con 2 valores únicos, utilicemos una codificación binaria
cat_data['Attrition'] = np.where(cat_data['Attrition']=='Yes', 1, 0)
cat_data['Gender']    = np.where(cat_data['Gender']=='Female', 1 ,0)
cat_data.rename(columns={'Gender': 'Female'}, inplace=True)
cat_data['OverTime']  = np.where(cat_data['OverTime']=='Yes' , 1, 0)

In [97]:
cat_data.head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Female,JobRole,MaritalStatus,OverTime
0,1,Travel_Rarely,Sales,Life Sciences,1,Sales Executive,Single,1
1,0,Travel_Frequently,Research & Development,Life Sciences,0,Research Scientist,Married,0
2,1,Travel_Rarely,Research & Development,Other,0,Laboratory Technician,Single,1
3,0,Travel_Frequently,Research & Development,Life Sciences,1,Research Scientist,Married,1
4,0,Travel_Rarely,Research & Development,Medical,0,Laboratory Technician,Married,0


In [98]:
# Para las demás, utilicemos one-hot encoding utilizando el método get_dummies
cat_data_encoded = pd.get_dummies(cat_data)
cat_data_encoded.columns

Index(['Attrition', 'Female', 'OverTime', 'BusinessTravel_Non-Travel',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'Department_Human Resources', 'Department_Research & Development',
       'Department_Sales', 'EducationField_Human Resources',
       'EducationField_Life Sciences', 'EducationField_Marketing',
       'EducationField_Medical', 'EducationField_Other',
       'EducationField_Technical Degree', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single'],
      dtype='object')

In [99]:
cat_data_encoded.shape

(1470, 27)

In [100]:
X = cat_data_encoded.drop(columns = 'Attrition', axis = 1)
y = cat_data_encoded[['Attrition']]

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

## Test Chi-cuadrado para seleccionar variables relevantes a la etiqueta

Evaluar la dependencia o independencia entre cada una de las variables categoricas y la etiqueta.

Ho: las variables son independientes.

Mediante una tabla de contingencia entre cada variable categorica y la variable objetivo,  mide las diferencias entre la frecuencia esperada y la frecuencia observada

In [102]:
#El primer arreglo son los f-scores, el segundo, los p-values
F_score, p_values = chi2(X_train, y_train)
resultados = pd.DataFrame({'F_score':F_score, 'p_values':p_values},
                          index = X_train.columns)
resultados.sort_values('p_values')

Unnamed: 0,F_score,p_values
OverTime,65.886906,4.775538e-16
MaritalStatus_Single,34.354702,4.592894e-09
JobRole_Sales Representative,32.14278,1.43248e-08
BusinessTravel_Travel_Frequently,12.048427,0.0005183609
JobRole_Healthcare Representative,10.933569,0.0009443741
JobRole_Laboratory Technician,9.708868,0.00183381
MaritalStatus_Divorced,9.609589,0.001935639
JobRole_Research Director,9.377164,0.002197052
JobRole_Manufacturing Director,8.098787,0.00442949
JobRole_Manager,7.820683,0.005165166


In [103]:
resultados['Relevantes'] = np.where(resultados['p_values'] < 0.05, 1,
                                   0)
resultados.sort_values('p_values')

Unnamed: 0,F_score,p_values,Relevantes
OverTime,65.886906,4.775538e-16,1
MaritalStatus_Single,34.354702,4.592894e-09,1
JobRole_Sales Representative,32.14278,1.43248e-08,1
BusinessTravel_Travel_Frequently,12.048427,0.0005183609,1
JobRole_Healthcare Representative,10.933569,0.0009443741,1
JobRole_Laboratory Technician,9.708868,0.00183381,1
MaritalStatus_Divorced,9.609589,0.001935639,1
JobRole_Research Director,9.377164,0.002197052,1
JobRole_Manufacturing Director,8.098787,0.00442949,1
JobRole_Manager,7.820683,0.005165166,1


In [104]:
resultados['Relevantes'].value_counts()

Unnamed: 0_level_0,count
Relevantes,Unnamed: 1_level_1
1,14
0,12


In [105]:
col_cat_relevantes = resultados[resultados['Relevantes']==1].index.to_list()
col_cat_relevantes

['OverTime',
 'BusinessTravel_Non-Travel',
 'BusinessTravel_Travel_Frequently',
 'Department_Sales',
 'EducationField_Technical Degree',
 'JobRole_Healthcare Representative',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Sales Representative',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married',
 'MaritalStatus_Single']

In [106]:
col_cat_relevantes = resultados[resultados['Relevantes']==1].index.to_list()
cat_data2 = cat_data_encoded[col_cat_relevantes + ['Attrition']]
cat_data2

Unnamed: 0,OverTime,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,Department_Sales,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Attrition
0,1,False,False,True,False,False,False,False,False,False,False,False,False,True,1
1,0,False,True,False,False,False,False,False,False,False,False,False,True,False,0
2,1,False,False,False,False,False,True,False,False,False,False,False,False,True,1
3,1,False,True,False,False,False,False,False,False,False,False,False,True,False,0
4,0,False,False,False,False,False,True,False,False,False,False,False,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,False,True,False,False,False,True,False,False,False,False,False,True,False,0
1466,0,False,False,False,False,True,False,False,False,False,False,False,True,False,0
1467,1,False,False,False,False,False,False,False,True,False,False,False,True,False,0
1468,0,False,True,True,False,False,False,False,False,False,False,False,True,False,0


In [107]:
# Volvemos a unir nuestros dataframes categóricos y numéricos
HR_data2 = pd.concat([cat_data2, num_data], axis=1)
# Ordenamos nuestras columnas
HR_data2 = HR_data2[ [i for i in HR_data2.columns if i != 'Attrition'] + ['Attrition']]
# Estamos ahora listos para seguir con nuestra tarea de ML
HR_data2


Unnamed: 0,OverTime,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,Department_Sales,EducationField_Technical Degree,JobRole_Healthcare Representative,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,...,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,1,False,False,True,False,False,False,False,False,False,...,80,0,8,0,1,6,4,0,5,1
1,0,False,True,False,False,False,False,False,False,False,...,80,1,10,3,3,10,7,1,7,0
2,1,False,False,False,False,False,True,False,False,False,...,80,0,7,3,3,0,0,0,0,1
3,1,False,True,False,False,False,False,False,False,False,...,80,0,8,3,3,8,7,3,0,0
4,0,False,False,False,False,False,True,False,False,False,...,80,1,6,3,3,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,0,False,True,False,False,False,True,False,False,False,...,80,1,17,3,3,5,2,0,3,0
1466,0,False,False,False,False,True,False,False,False,False,...,80,1,9,5,3,7,7,1,7,0
1467,1,False,False,False,False,False,False,False,True,False,...,80,1,6,0,3,6,2,0,3,0
1468,0,False,True,True,False,False,False,False,False,False,...,80,0,17,3,2,9,6,0,8,0
