In [104]:
# 1. Importar librerías
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.decomposition import PCA

In [105]:
# 2. Load the employee dataset from its CSV file
# NOTE(review): this is an absolute, environment-specific path; the file must
# already exist at this location for the notebook to run.
EmpleadosAttrition = pd.read_csv(
    filepath_or_buffer='/content/sample_data/empleadosRETO.csv',
)

In [106]:
# 3. Remove the columns this analysis does not use
cols_to_drop = [
    'EmployeeCount',
    'EmployeeNumber',
    'Over18',
    'StandardHours',
    'OverTime',
]
# The frame is modified in place, so rerunning this cell without reloading the
# CSV raises KeyError (the columns are already gone).
EmpleadosAttrition.drop(cols_to_drop, axis='columns', inplace=True)


In [107]:
# 4. Derive tenure from the hiring date
# Values that cannot be parsed become NaT because of errors='coerce'.
# NOTE(review): no explicit date format is given, so parsing relies on pandas'
# inference — confirm it matches the format used in the CSV.
hiring_dates = pd.to_datetime(EmpleadosAttrition['HiringDate'], errors='coerce')
EmpleadosAttrition['HiringDate'] = hiring_dates

# Rows whose hiring date could not be parsed are discarded (in place).
EmpleadosAttrition.dropna(subset=['HiringDate'], inplace=True)

hiring_year = EmpleadosAttrition['HiringDate'].dt.year.astype(int)
EmpleadosAttrition['Year'] = hiring_year

# Tenure is measured against the fixed reference year 2018.
EmpleadosAttrition['YearsAtCompany'] = 2018 - hiring_year


In [108]:
# 5. Clean DistanceFromHome
# Keep the raw text under a temporary name so the cleaned integer column can
# take over the original name.
EmpleadosAttrition.rename(
    columns={'DistanceFromHome': 'DistanceFromHome_km'},
    inplace=True,
)
# Strip the 'km' text and convert what remains to integers.
distance_text = EmpleadosAttrition['DistanceFromHome_km'].str.replace('km', '')
EmpleadosAttrition['DistanceFromHome'] = distance_text.astype(int)


In [109]:
# 6. Remove the helper columns that are no longer needed
# 'Year' and 'DistanceFromHome_km' were intermediates of the previous steps;
# 'HiringDate' has already been turned into YearsAtCompany.
EmpleadosAttrition.drop(
    ['Year', 'HiringDate', 'DistanceFromHome_km'],
    axis='columns',
    inplace=True,
)


In [110]:
# 7. Average salary per department
# Computed before MonthlyIncome is rescaled in the next step, so the averages
# are in the original salary units.
income_by_department = EmpleadosAttrition.groupby('Department')['MonthlyIncome']
# Both names refer to the same DataFrame (columns: Department, MonthlyIncome).
SueldoPromedio = SueldoPromedioDepto = income_by_department.mean().reset_index()


In [111]:
# 8. Rescale MonthlyIncome with a min-max scaler
# The scaler expects 2-D input, hence the one-column DataFrame selection.
scaler = MinMaxScaler()
income_frame = EmpleadosAttrition[['MonthlyIncome']]
# fit() returns the fitted scaler, which is then applied to the same data.
EmpleadosAttrition['MonthlyIncome'] = scaler.fit(income_frame).transform(income_frame)


In [112]:
# 9. Convert categorical variables to numeric codes
categorical_vars = [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Attrition',
]

# Each column is replaced by the integer code of its category; pandas uses
# -1 as the code for missing values.
for column_name in categorical_vars:
    as_category = EmpleadosAttrition[column_name].astype('category')
    EmpleadosAttrition[column_name] = as_category.cat.codes


In [113]:
# 10. Correlate every column with Attrition and keep those with |r| >= 0.1
attrition_series = EmpleadosAttrition['Attrition']
selected_vars = ['Attrition']  # the target always comes first

for column in EmpleadosAttrition.columns:
    correlation_abs = abs(attrition_series.corr(EmpleadosAttrition[column]))
    # Report the absolute correlation of every column, the target included.
    print("column name " + column, correlation_abs, sep="\n")
    # The target is already in the list; a NaN correlation fails the >= test
    # and is therefore never selected.
    if column != 'Attrition' and correlation_abs >= 0.1:
        selected_vars.append(column)


column name Age
0.21245855782292317
column name BusinessTravel
0.08364461599241864
column name Department
0.05598049333308557
column name Education
0.05447783977298953
column name EducationField
0.051107325410731375
column name EnvironmentSatisfaction
0.12509105447523616
column name Gender
0.03020048168303414
column name JobInvolvement
0.16804697098684127
column name JobLevel
0.2144860035840385
column name JobRole
0.07995672734753002
column name JobSatisfaction
0.16396449171116015
column name MaritalStatus
0.18042985984937782
column name MonthlyIncome
0.1950438736021606
column name NumCompaniesWorked
0.008589681908001608
column name PercentSalaryHike
0.061714068665294615
column name PerformanceRating
0.007009996372327795
column name RelationshipSatisfaction
0.032777019471087236
column name TotalWorkingYears
0.21404342469720986
column name TrainingTimesLastYear
0.07159198469767787
column name WorkLifeBalance
0.02306297795369894
column name YearsInCurrentRole
0.20445337076937528
column n

In [114]:
# 11. Build the final DataFrame from the selected variables
# The selection keeps the order of selected_vars, so Attrition is the first column.
EmpleadosAttritionFinal = EmpleadosAttrition.loc[:, selected_vars]
print("Selected variables: ", selected_vars)


Selected variables:  ['Attrition', 'Age', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'TotalWorkingYears', 'YearsInCurrentRole', 'YearsAtCompany']


In [115]:
# 12. Fit a 6-component PCA on the selected variables
# NOTE(review): the input still contains the target column 'Attrition'
# (selected_vars starts with it) and the features are not standardised, so
# large-valued columns dominate the components — confirm this is intended.
EmpleadosAttritionPCA = EmpleadosAttritionFinal  # alias, not a copy
pca = PCA(n_components=6).fit(EmpleadosAttritionPCA)

# Report the fitted model: component loadings, variances and variance ratios.
pca_report = (
    ("PCA Components:\n", pca.components_),
    ("Explained Variance:\n", pca.explained_variance_),
    ("Explained Variance Ratio:\n", pca.explained_variance_ratio_),
)
for heading, values in pca_report:
    print(heading, values)

# Project the same data onto the fitted components (one column per component).
EmpleadosPCA = pca.transform(EmpleadosAttritionPCA)


PCA Components:
 [[-8.51008986e-03  6.86440539e-01 -2.28297056e-03  2.41991400e-04
   7.10836331e-02  9.20106014e-04 -6.29255367e-03  1.58616791e-02
   6.38033432e-01  1.53791427e-01  3.04365425e-01]
 [-2.23060316e-03 -5.95585980e-01 -2.49428107e-03 -5.36375746e-03
   3.81663482e-02  5.59523207e-03 -6.32517180e-03  8.51475036e-03
   2.25221295e-01  3.82941805e-01  6.68032806e-01]
 [ 6.76948089e-03 -4.16475316e-01 -1.00997001e-02 -1.27232073e-02
   6.19232403e-02 -1.74844262e-02  2.63383118e-02  1.52951417e-02
   7.25962161e-01 -3.30307710e-01 -4.30170336e-01]
 [-1.69177960e-02 -1.81265302e-02 -3.61932535e-03 -1.52244253e-02
  -6.08916913e-03  3.58338576e-02  1.77758225e-03 -2.39469930e-03
   6.57865356e-02  8.47666078e-01 -5.24351647e-01]
 [-1.20296385e-02 -3.57696306e-04 -6.43593354e-01 -3.17206653e-02
   5.52785912e-02  7.61634771e-01 -2.04001238e-02  7.50783052e-03
  -5.46031913e-03 -3.07718082e-02  6.63741079e-03]
 [-7.33472265e-02 -6.56900862e-03  7.61577944e-01 -2.35007721e-03
  

In [116]:
# 13. Append the principal components needed to explain at least 80% of the variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance_ratio)

# Index of the first cumulative value >= 0.8, turned into a component count.
# The PCA keeps only a subset of components, so the cumulative ratio may never
# reach 0.8. np.argmax on an all-False mask would return 0 and silently select
# a single component; searchsorted returns len(...) in that case instead, and
# the result is clamped so that every fitted component is used.
num_components = min(
    int(np.searchsorted(cumulative_variance, 0.8)) + 1,
    len(cumulative_variance),
)

# Add the retained components as columns C0, C1, ... in a single assign call.
EmpleadosAttritionFinal = EmpleadosAttritionFinal.assign(
    **{f'C{i}': EmpleadosPCA[:, i] for i in range(num_components)}
)

print(EmpleadosAttritionFinal.head())


   Attrition  Age  EnvironmentSatisfaction  JobInvolvement  JobLevel  \
0          0   50                        4               3         4   
1          0   36                        2               3         2   
2          1   21                        2               3         1   
3          0   52                        2               3         3   
4          1   33                        2               3         3   

   JobSatisfaction  MaritalStatus  MonthlyIncome  TotalWorkingYears  \
0                4              0       0.864269                 32   
1                2              0       0.207340                  7   
2                2              2       0.088062                  1   
3                2              2       0.497574                 18   
4                3              1       0.664470                 15   

   YearsInCurrentRole  YearsAtCompany         C0        C1  
0                   4               5  20.236844 -4.209450  
1                 

In [117]:
# 14. Save to CSV
# Optional step: reorder the columns so that Attrition comes last.
cols = [
    *(name for name in EmpleadosAttritionFinal.columns if name != 'Attrition'),
    'Attrition',
]
EmpleadosAttritionFinal = EmpleadosAttritionFinal.loc[:, cols]
# The row index is not written to the file.
EmpleadosAttritionFinal.to_csv('EmpleadosAttritionFinal.csv', index=False)