In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Step 2: load the CSV into EmpleadosAttrition.
# The path is relative, so 'empleadosRETO.csv' must be in the notebook's working directory.
ruta_csv = "empleadosRETO.csv"
EmpleadosAttrition = pd.read_csv(ruta_csv)

# Quick sanity check: frame shape, column names, and the first rows.
print("Dimensiones:", EmpleadosAttrition.shape)
print("Columnas:", list(EmpleadosAttrition.columns))
EmpleadosAttrition.head(5)

Dimensiones: (400, 30)
Columnas: ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'HiringDate', 'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours', 'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'Attrition']


Unnamed: 0,Age,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsInCurrentRole,YearsSinceLastPromotion,Attrition
0,50,Travel_Rarely,Research & Development,1 km,2,Medical,1,997,4,Male,...,22,4,3,80,32,1,2,4,1,No
1,36,Travel_Rarely,Research & Development,6 km,2,Medical,1,178,2,Male,...,20,4,4,80,7,0,3,2,0,No
2,21,Travel_Rarely,Sales,7 km,1,Marketing,1,1780,2,Male,...,13,3,2,80,1,3,3,0,1,Yes
3,52,Travel_Rarely,Research & Development,7 km,4,Life Sciences,1,1118,2,Male,...,19,3,4,80,18,4,3,6,4,No
4,33,Travel_Rarely,Research & Development,15 km,1,Medical,1,582,2,Male,...,12,3,4,80,15,2,4,6,7,Yes


In [3]:
# Step 3: drop the columns the author considers very likely irrelevant.
# Only the names actually present in the frame are passed to drop(), so a missing
# column never raises.
cols_irrelevantes = ["EmployeeCount", "EmployeeNumber", "Over18", "StandardHours"]
cols_presentes = EmpleadosAttrition.columns.intersection(cols_irrelevantes)
EmpleadosAttrition.drop(columns=cols_presentes, errors="ignore", inplace=True)

In [4]:
# Step 4: parse HiringDate as datetime and derive the hiring Year as a nullable integer.
# Raises ValueError when the CSV has no 'HiringDate' column.
if "HiringDate" not in EmpleadosAttrition.columns:
    raise ValueError("La columna 'HiringDate' no existe en el CSV. Verifica el nombre exacto.")

# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0, where
# format inference is the default, so the argument only triggered a warning.
# dayfirst=True makes the parser prefer dd/mm/yyyy for ambiguous dates.
# NOTE(review): confirm the CSV really stores day-first dates; a wrong assumption here
# makes valid dates fail to parse and become NaT.
EmpleadosAttrition["HiringDate"] = pd.to_datetime(
    EmpleadosAttrition["HiringDate"], errors="coerce", dayfirst=True
)

# errors="coerce" silently turns unparseable values into NaT. Report the count so the
# loss is visible before it propagates into Year and YearsAtCompany.
n_fechas_nat = int(EmpleadosAttrition["HiringDate"].isna().sum())
if n_fechas_nat:
    print(
        f"Aviso: {n_fechas_nat} valores de 'HiringDate' quedaron como NaT "
        "(nulos o con formato no reconocido)."
    )

# Int64 (capital I) is the nullable integer dtype, so rows with NaT keep a missing Year.
EmpleadosAttrition["Year"] = EmpleadosAttrition["HiringDate"].dt.year.astype("Int64")

  EmpleadosAttrition["HiringDate"] = pd.to_datetime(


In [5]:
# Step 5: YearsAtCompany = years elapsed between the hiring Year and 2018.
# Negative results (hiring years after 2018) are floored at 0. Missing years are NOT
# imputed here; they stay missing in YearsAtCompany.
antiguedad = EmpleadosAttrition["Year"].rsub(2018).astype("Int64")
EmpleadosAttrition["YearsAtCompany"] = antiguedad.clip(lower=0)

In [6]:
# Step 6: keep the raw text under DistanceFromHome_km and rebuild DistanceFromHome as
# a nullable integer. Raises ValueError when the source column is absent.
if "DistanceFromHome" not in EmpleadosAttrition.columns:
    raise ValueError("La columna 'DistanceFromHome' no existe en el CSV. Verifica el nombre exacto.")

EmpleadosAttrition.rename(columns={"DistanceFromHome": "DistanceFromHome_km"}, inplace=True)

# Text clean-up: lowercase, remove the literal "km", trim whitespace, and turn
# empty strings into NaN.
distancia_txt = EmpleadosAttrition["DistanceFromHome_km"].astype(str).str.lower()
distancia_txt = distancia_txt.str.replace("km", "", regex=False).str.strip()
EmpleadosAttrition["DistanceFromHome"] = distancia_txt.replace("", np.nan)

# Values that are not numeric are coerced to NaN; the rest are rounded and stored
# as Int64, which can hold missing values.
distancia_num = pd.to_numeric(EmpleadosAttrition["DistanceFromHome"], errors="coerce")
EmpleadosAttrition["DistanceFromHome"] = distancia_num.round().astype("Int64")

In [7]:
# Step 7: mean MonthlyIncome per Department, stored in SueldoPromedioDepto.
# Raises ValueError when either required column is missing.
columnas_requeridas = {"Department", "MonthlyIncome"}
if not columnas_requeridas.issubset(EmpleadosAttrition.columns):
    raise ValueError("Se requieren columnas 'Department' y 'MonthlyIncome' para el sueldo promedio.")

# reset_index() turns the Department group keys back into a regular column.
SueldoPromedioDepto = (
    EmpleadosAttrition.groupby("Department")["MonthlyIncome"].mean().reset_index()
)
# Independent copy used for display.
SueldoPromedio = SueldoPromedioDepto.copy()

print("Sueldo promedio por departamento:")
SueldoPromedio

Sueldo promedio por departamento:


Unnamed: 0,Department,MonthlyIncome
0,Human Resources,6239.888889
1,Research & Development,6804.149813
2,Sales,7188.25


In [8]:
# Step 8: rescale MonthlyIncome to the [0, 1] range into a new MonthlyIncome_scaled column.
# Missing incomes, if any, are first replaced by the column median.
ingreso = EmpleadosAttrition["MonthlyIncome"]
if ingreso.isna().any():
    mediana_ingreso = ingreso.median()
    EmpleadosAttrition["MonthlyIncome"] = ingreso.fillna(mediana_ingreso)

# The scaler takes a 2-D input, hence the double brackets (one-column frame).
scaler_minmax = MinMaxScaler()
ingreso_escalado = scaler_minmax.fit_transform(EmpleadosAttrition[["MonthlyIncome"]])
EmpleadosAttrition["MonthlyIncome_scaled"] = ingreso_escalado

In [9]:
# Step 9: convert the categorical columns to integer codes (LabelEncoder) and
# turn Attrition into a 0/1 integer target.
categoricas = ["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus"]

# Validate each column and encode it. Raises ValueError on the first missing column.
for col in categoricas:
    if col not in EmpleadosAttrition.columns:
        raise ValueError(f"La columna categórica requerida '{col}' no está en el CSV.")
    # Fill nulls BEFORE casting to str: astype(str) converts NaN into the literal
    # string "nan", after which fillna("missing") has nothing left to fill.
    EmpleadosAttrition[col] = EmpleadosAttrition[col].fillna("missing").astype(str)
    le = LabelEncoder()
    EmpleadosAttrition[col] = le.fit_transform(EmpleadosAttrition[col])

# Attrition must exist; it is the target used by every later step.
if "Attrition" not in EmpleadosAttrition.columns:
    raise ValueError("La columna 'Attrition' no existe en el CSV.")

# Text-like Attrition: map the usual yes/no spellings to 1/0. The check is
# "not numeric" rather than "dtype == object" so that any string dtype is covered.
if not pd.api.types.is_numeric_dtype(EmpleadosAttrition["Attrition"]):
    attrition_txt = EmpleadosAttrition["Attrition"].astype(str).str.strip().str.lower()
    attrition_cod = attrition_txt.map({"yes": 1, "no": 0, "y": 1, "n": 0, "true": 1, "false": 0})
    # Values the map does not know (e.g. "1" / "0" stored as text) fall back to a
    # numeric parse instead of being lost as NaN.
    EmpleadosAttrition["Attrition"] = attrition_cod.fillna(
        pd.to_numeric(attrition_txt, errors="coerce")
    )

# Anything still unrecognised defaults to 0 (best effort); report how many rows are
# affected so the default does not go unnoticed.
attrition_num = pd.to_numeric(EmpleadosAttrition["Attrition"], errors="coerce")
n_attrition_desconocido = int(attrition_num.isna().sum())
if n_attrition_desconocido:
    print(f"Aviso: {n_attrition_desconocido} valores de 'Attrition' no reconocidos se codificaron como 0.")
EmpleadosAttrition["Attrition"] = attrition_num.fillna(0).astype(int)

In [10]:
# Step 10: remove the helper columns Year, HiringDate and DistanceFromHome_km.
# errors="ignore" keeps the call from raising when a column is already gone.
columnas_auxiliares = ["Year", "HiringDate", "DistanceFromHome_km"]
EmpleadosAttrition.drop(columns=columnas_auxiliares, errors="ignore", inplace=True)

In [11]:
# Step 11: correlation of every numeric column with Attrition, highest first.
matriz_corr = EmpleadosAttrition.corr(numeric_only=True)
corr_serie = matriz_corr["Attrition"].sort_values(ascending=False)

print("Correlaciones con Attrition:")
corr_serie

Correlaciones con Attrition:


Unnamed: 0,Attrition
Attrition,1.0
MaritalStatus,0.187283
JobRole,0.078684
BusinessTravel,0.060677
Department,0.054236
DistanceFromHome,0.052732
EducationField,0.051184
PerformanceRating,-0.006471
NumCompaniesWorked,-0.009082
WorkLifeBalance,-0.021723


In [12]:
# Step 12: keep the columns whose absolute correlation with Attrition reaches the threshold.
umbral = 0.1
supera_umbral = corr_serie.abs().ge(umbral).to_numpy()
cols_sel = list(corr_serie.index[supera_umbral])

# Attrition is the target, so it must be part of the final frame in any case.
if "Attrition" not in cols_sel:
    cols_sel.append("Attrition")

# .copy() detaches the result from EmpleadosAttrition.
EmpleadosAttritionFinal = EmpleadosAttrition.loc[:, cols_sel].copy()

print("Columnas seleccionadas por correlación (>= 0.1):", cols_sel)
EmpleadosAttritionFinal.head()

Columnas seleccionadas por correlación (>= 0.1): ['Attrition', 'MaritalStatus', 'EnvironmentSatisfaction', 'YearsAtCompany', 'JobSatisfaction', 'JobInvolvement', 'MonthlyIncome_scaled', 'MonthlyIncome', 'YearsInCurrentRole', 'Age', 'TotalWorkingYears', 'JobLevel']


Unnamed: 0,Attrition,MaritalStatus,EnvironmentSatisfaction,YearsAtCompany,JobSatisfaction,JobInvolvement,MonthlyIncome_scaled,MonthlyIncome,YearsInCurrentRole,Age,TotalWorkingYears,JobLevel
0,0,0,4,5.0,4,3,0.864269,17399,4,50,32,4
1,0,0,2,,2,3,0.20734,4941,2,36,7,2
2,1,2,2,,2,3,0.088062,2679,0,21,1,1
3,0,2,2,,2,3,0.497574,10445,6,52,18,3
4,1,1,2,7.0,3,3,0.66447,13610,6,33,15,3


In [13]:
# Step 13: build X (features) and y (Attrition), impute, standardize and fit PCA.
# NOTE(review): if both MonthlyIncome and MonthlyIncome_scaled were selected in the
# previous step, X carries the same information twice — confirm this is intended.
y = EmpleadosAttritionFinal["Attrition"]
X = EmpleadosAttritionFinal.drop(columns=["Attrition"]).copy()

# Median imputation, applied only to the columns that actually contain nulls.
columnas_con_nulos = X.columns[X.isna().any().to_numpy()]
for nombre in columnas_con_nulos:
    X[nombre] = X[nombre].fillna(X[nombre].median())

# Standardize the features before PCA.
scaler_std = StandardScaler()
X_std = scaler_std.fit_transform(X)

# PCA() with no arguments; the fitted model and the projected data are both kept.
pca = PCA()
EmpleadosAttritionPCA = pca.fit_transform(X_std)

# Cumulative explained variance; argmax on the boolean mask gives the position of
# the first component at which the running total reaches 80%, hence the +1.
var_cum = np.cumsum(pca.explained_variance_ratio_)
alcanza_80 = var_cum >= 0.80
n_comp = int(alcanza_80.argmax() + 1)

print("Varianza explicada por componente:", pca.explained_variance_ratio_)
print("Varianza explicada acumulada:", var_cum)
print("Número mínimo de componentes para >=80% varianza:", n_comp)

Varianza explicada por componente: [0.38919488 0.11177111 0.0957506  0.09267394 0.08919549 0.08433882
 0.0678336  0.04310908 0.02049393 0.00563855 0.        ]
Varianza explicada acumulada: [0.38919488 0.50096599 0.5967166  0.68939054 0.77858602 0.86292484
 0.93075844 0.97386752 0.99436145 1.         1.        ]
Número mínimo de componentes para >=80% varianza: 6


In [14]:
# Step 14: append the first n_comp principal components as columns C0, C1, ...
componentes = {f"C{i}": EmpleadosAttritionPCA[:, i] for i in range(n_comp)}
EmpleadosAttritionFinal = EmpleadosAttritionFinal.assign(**componentes)

EmpleadosAttritionFinal.head()

Unnamed: 0,Attrition,MaritalStatus,EnvironmentSatisfaction,YearsAtCompany,JobSatisfaction,JobInvolvement,MonthlyIncome_scaled,MonthlyIncome,YearsInCurrentRole,Age,TotalWorkingYears,JobLevel,C0,C1,C2,C3,C4,C5
0,0,0,4,5.0,4,3,0.864269,17399,4,50,32,4,4.08495,-0.48116,1.488925,-2.115291,0.400829,0.605831
1,0,0,2,,2,3,0.20734,4941,2,36,7,2,-0.968284,-0.071683,0.698178,-0.411396,-0.970805,-1.111856
2,1,2,2,,2,3,0.088062,2679,0,21,1,1,-2.798381,-0.534056,-0.695463,1.491838,-0.383561,0.340068
3,0,2,2,,2,3,0.497574,10445,6,52,18,3,1.812663,-1.234924,0.032221,0.863278,-0.447669,0.438606
4,1,1,2,7.0,3,3,0.66447,13610,6,33,15,3,1.754355,0.046972,-0.351714,0.159923,-0.630683,0.234267


In [17]:
# Step 15: write the final data set to CSV with Attrition as the last column.
# The reordering only affects the written file; EmpleadosAttritionFinal keeps its column order.
cols_orden = EmpleadosAttritionFinal.columns.drop("Attrition").tolist() + ["Attrition"]
EmpleadosAttritionFinal.loc[:, cols_orden].to_csv("EmpleadosAttritionFinal.csv", index=False)

print("Archivo 'EmpleadosAttritionFinal.csv' guardado con", len(EmpleadosAttritionFinal.columns), "columnas.")

Archivo 'EmpleadosAttritionFinal.csv' guardado con 18 columnas.
