In [159]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

EmpleadosAttrition = pd.read_csv('assets/EmpleadosReto.csv')

##### Crea una columna llamada Year y obtén el año de contratación del empleado a partir de su fecha ‘HiringDate’. No se te olvide que debe ser un entero.
##### Crea una columna llamada YearsAtCompany que contenga los años que el empleado lleva en la compañía hasta el año 2018. Para su cálculo, usa la variable Year que acabas de crear.

In [228]:
EmpleadosAttrition['Year'] = EmpleadosAttrition['HiringDate'].str.split('/').str[2].astype(int)
EmpleadosAttrition['YearsAtCompany'] = 2018 - EmpleadosAttrition['Year']

- La DistanceFromHome está dada en kilómetros, pero tiene las letras “km” al final y así no puede ser entera.
- Renombra la variable DistanceFromHome a DistanceFromHome_km.
- Crea una nueva variable DistanceFromHome que sea entera, es decir, solo con números.

In [229]:
DistanceFromHome_km = EmpleadosAttrition['DistanceFromHome']
EmpleadosAttrition['DistanceFromHome'] = DistanceFromHome_km.str.split(' ').str[0].astype(int)

#### Borra las columnas Year, HiringDate y DistanceFromHome_km debido a que ya no son útiles.

In [230]:
EmpleadosAttrition.drop(['Year', 'DistanceFromHome', 'HiringDate'], axis=1, inplace=True);

Aprovechando los ajustes que se están haciendo, la empresa desea saber si todos los departamentos tienen un ingreso promedio similar. Genera una nuevo frame llamado SueldoPromedioDepto que contenga el MonthlyIncome promedio por departamento de los empleados y colócalo en una variable llamada SueldoPromedio. Esta tabla solo es informativa, no la vas a utilizar en el set de datos que estás construyendo.

In [231]:
SueldoPromedioDepto = EmpleadosAttrition.groupby(['Department'])[['MonthlyIncome']].mean()

SueldoPromedioDepto.head()

Unnamed: 0_level_0,MonthlyIncome
Department,Unnamed: 1_level_1
Human Resources,6239.888889
Research & Development,6804.149813
Sales,7188.25


##### La variable MonthlyIncome tiene un valor numérico muy grande comparada con las otras variables. Escala dicha variable para que tenga un valor entre 0 y 1. 

In [232]:
rangeMonthlyIncome =  EmpleadosAttrition['MonthlyIncome'].max() - EmpleadosAttrition['MonthlyIncome'].min()
EmpleadosAttrition['MonthlyIncome'] = (EmpleadosAttrition['MonthlyIncome'] - EmpleadosAttrition['MonthlyIncome'].min()) / rangeMonthlyIncome

##### Elimina las columnas que, con alta probabilidad no tienen relación alguna con la salida usando Fast Correlation-Based Filtering
##### Convierte todas las variables categóricas que quedan a numéricas

In [154]:
data = pd.get_dummies(EmpleadosAttrition, dtype=int)
corr = data.corr()
#corr[:] = np.tril(corr.values, k=-1)

y_1 = corr.loc['Attrition_No', :]
y_2 = corr.loc['Attrition_Yes', :]

lvs = 0.1
lev = 0.9

setValue = set(y_1.loc[y_1.abs() > lvs].index.to_list() + y_2.loc[y_2.abs() > lvs].index.to_list())
setValue.remove('Attrition_No')

removed = set([])
xlist = list(setValue)
for i in range(len(xlist)):
    value = xlist[i]
    for j in range(i, len(xlist)):
        comparator = xlist[j]
        if (comparator == value or j <= i):
            continue
        correlation = abs(corr.at[value, comparator])
        if (correlation >= lev):
            if (abs(corr.at['Attrition_No', comparator]) > abs(corr.at['Attrition_No', value])):
                removed.add(value)
            else:
                removed.add(comparator)

setValue.difference_update(removed)
EmpleadosAttritionFinal = data[list(setValue) + ['Attrition_No', 'Attrition_Yes']]
correlated_map = EmpleadosAttritionFinal.corr()

correlated_map[:] = np.tril(correlated_map.values, k=-1)
correlated_map

Unnamed: 0,MaritalStatus_Divorced,OverTime_No,JobRole_Sales Representative,YearsInCurrentRole,JobRole_Laboratory Technician,BusinessTravel_Non-Travel,TotalWorkingYears,JobSatisfaction,JobRole_Research Director,EducationField_Technical Degree,JobInvolvement,Age,JobLevel,JobRole_Healthcare Representative,EnvironmentSatisfaction,MaritalStatus_Single,Attrition_Yes,YearsAtCompany,Attrition_No,Attrition_Yes.1
MaritalStatus_Divorced,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OverTime_No,0.022216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Sales Representative,-0.072548,0.039372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YearsInCurrentRole,0.04421,0.016504,-0.160339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Laboratory Technician,0.061449,0.029076,-0.115816,-0.122037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BusinessTravel_Non-Travel,0.07183,0.065352,-0.015232,0.002912,0.078258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TotalWorkingYears,-0.005983,0.027569,-0.238649,0.429667,-0.225079,-0.042652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobSatisfaction,0.065125,0.030091,-0.072162,0.064629,-0.036269,0.058584,-0.006692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Research Director,-0.013091,-0.007302,-0.0812,0.221664,-0.141064,-0.015019,0.359642,-0.064142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EducationField_Technical Degree,-0.032866,-0.052395,0.118918,0.054855,-0.049482,-0.042327,-0.050681,0.060854,-0.037551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
data = pd.get_dummies(EmpleadosAttrition[EmpleadosAttrition.columns[~EmpleadosAttrition.columns.isin(['Attrition'])]], dtype=int)
data = pd.concat([data, EmpleadosAttrition['Attrition'].map({'No': 0,'Yes': 1}).to_frame('Attrition')], axis=1)
corr = data.corr()
#corr[:] = np.tril(corr.values, k=-1)

y_1 = corr.loc['Attrition', :]
#y_2 = corr.loc['Attrition_Yes', :]

lvs = 0.1
lev = 0.9

setValue = set(y_1.loc[y_1.abs() > lvs].index.to_list())

removed = set([])
xlist = list(setValue)
for i in range(len(xlist)):
    value = xlist[i]
    for j in range(i, len(xlist)):
        comparator = xlist[j]
        if (comparator == value or j <= i):
            continue
        correlation = abs(corr.at[value, comparator])
        if (correlation >= lev):
            removed.add(value) if (abs(corr.at['Attrition', comparator]) > abs(corr.at['Attrition', value])) else removed.add(comparator)

setValue.difference_update(removed)
EmpleadosAttritionFinal = data[list(setValue)]
correlated_map = EmpleadosAttritionFinal.corr()

correlated_map[:] = np.tril(correlated_map.values, k=-1)
correlated_map

Unnamed: 0,MaritalStatus_Divorced,OverTime_No,JobRole_Sales Representative,YearsInCurrentRole,JobRole_Laboratory Technician,BusinessTravel_Non-Travel,TotalWorkingYears,JobSatisfaction,JobRole_Research Director,EducationField_Technical Degree,JobInvolvement,Age,JobLevel,JobRole_Healthcare Representative,EnvironmentSatisfaction,Attrition,MaritalStatus_Single,YearsAtCompany
MaritalStatus_Divorced,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OverTime_No,0.022216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Sales Representative,-0.072548,0.039372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
YearsInCurrentRole,0.04421,0.016504,-0.160339,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Laboratory Technician,0.061449,0.029076,-0.115816,-0.122037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BusinessTravel_Non-Travel,0.07183,0.065352,-0.015232,0.002912,0.078258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TotalWorkingYears,-0.005983,0.027569,-0.238649,0.429667,-0.225079,-0.042652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobSatisfaction,0.065125,0.030091,-0.072162,0.064629,-0.036269,0.058584,-0.006692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
JobRole_Research Director,-0.013091,-0.007302,-0.0812,0.221664,-0.141064,-0.015019,0.359642,-0.064142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
EducationField_Technical Degree,-0.032866,-0.052395,0.118918,0.054855,-0.049482,-0.042327,-0.050681,0.060854,-0.037551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [242]:
x = EmpleadosAttritionFinal[EmpleadosAttritionFinal.columns[~EmpleadosAttritionFinal.columns.isin(['Attrition'])]]
pca = PCA(len(x.columns));

pca.fit(x)
EmpleadosAttritionPCA = pca.explained_variance_ratio_
""" print(f'{pca.explained_variance_ratio_}')
print(f'{pca.transform(x).shape}')
print((x.to_numpy() @ pca.components_[:, :2]).shape)
print(((x - x.mean()) / x.std()).to_numpy() @ pca.components_[:, :2]) """
### los PC principales con el 80% son los primeros 2 componentes con 87%
print(EmpleadosAttritionPCA)

[6.34547220e-01 2.41642359e-01 7.88832142e-02 2.10284923e-02
 6.51901600e-03 6.41727183e-03 2.87291318e-03 2.79044175e-03
 1.32533133e-03 1.01147487e-03 6.65785491e-04 6.38068711e-04
 4.65141321e-04 4.17124285e-04 2.82933160e-04 2.74489426e-04
 2.18723332e-04]


In [239]:
EmpleadosAttritionFinal = pd.concat([EmpleadosAttritionFinal, pd.DataFrame(pca.transform(x)[:, :2]).add_prefix('C')], axis=1)

In [244]:
EmpleadosAttritionFinal.to_csv('assets/EmpleadosAttritionFinal.csv', index=False)