### LIMPIEZA Y TRATAMIENTO DE DATOS

In [102]:
# importamos las librerías que necesitamos

# Tratamiento de datos
import pandas as pd
import numpy as np
from IPython.display import display


# Librerías de visualización
import seaborn as sns
import matplotlib.pyplot as plt

In [103]:
# ver todas las columnas
pd.set_option('display.max_columns', None)

In [105]:
# ver todas las filas
pd.set_option('display.max_rows', None)

In [106]:
df = pd.read_csv("ABC_data_sin_nulos.csv")

In [107]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,0.0,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,0.0,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,travel_rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,0.0,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,travel_rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,1.0,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,1.0,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


#### Columna gender

- Es de tipo float64.

- Debería ser categórica (M/F)

- Vamos a convertir a category y normalizar.


In [109]:

# ====================================================
# Normalización de columna 'gender' SIN generar NaN
# ====================================================
if 'gender' in df.columns:
    # Convertir todo a texto limpio
    g = df['gender'].astype(str).str.strip().str.lower()

    # Normalizar valores más comunes
    replacements = {
        'male': 'M', 'm': 'M', '1': 'M', '1.0': 'M', 'true': 'M',
        'hombre': 'M', 'man': 'M', 'masculino': 'M',
        'female': 'F', 'f': 'F', '0': 'F', '0.0': 'F', 'false': 'F',
        'mujer': 'F', 'woman': 'F', 'femenino': 'F'
    }

    # Reemplazar solo donde haya coincidencias
    df['gender'] = g.replace(replacements)

    # Si después del reemplazo quedan valores distintos de M/F,
    # intentamos mantener el valor original en mayúsculas (por si ya era “M” o “F”)
    df.loc[~df['gender'].isin(['M', 'F']), 'gender'] = (
        df.loc[~df['gender'].isin(['M', 'F']), 'gender'].str.upper()
    )

    # Asegurar que solo queden M/F, sin eliminar los válidos
    df.loc[~df['gender'].isin(['M', 'F']), 'gender'] = 'M'  # o 'F' si prefieres

    # Convertir a categoría
    df['gender'] = df['gender'].astype('category')

    print("'gender' normalizado correctamente sin crear NaN.")
    print(df['gender'].value_counts(dropna=False))

'gender' normalizado correctamente sin crear NaN.
gender
F    971
M    643
Name: count, dtype: int64


In [110]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,travel_rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,travel_rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


In [111]:
df.tail()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
1609,Yes,travel_rarely,1065.277778,3.0,1.0,Life Sciences,1610.0,3.0,F,2.0,3.0,saLEs ExeCUTiVe,4.0,Married,"8604,17$","22370,83$",1.0,Unknown,11.0,3.0,1.0,Full Time,1.0,10.0,6.0,3.0,16.0,3.0,7.0,1987.0,"103250,00$",0
1610,No,non-travel,458.81746,4.0,2.0,Unknown,1611.0,3.0,F,3.0,2.0,LAboRaTOry tECHNiCIAn,2.0,Unknown,"3705,83$","9635,17$",1.0,Unknown,12.0,3.0,2.0,Part Time,0.0,10.0,5.0,2.0,9.0,0.0,8.0,1978.0,"44470,00$",1
1611,No,travel_rarely,1032.487286,-13.0,5.0,Unknown,1612.0,13.0,F,4.0,3.0,sAlES ExECUTivE,3.0,Single,Unknown,"21682,23$",0.0,No,18.0,3.0,4.0,Part Time,0.0,9.0,3.0,3.0,8.0,0.0,7.0,1984.0,"100071,84$",Yes
1612,No,non-travel,556.256661,8.0,4.0,Technical Degree,1613.0,1.0,F,3.0,2.0,SaLes ExecUtIVe,4.0,Divorced,"4492,84$","11681,39$",4.0,No,13.0,3.0,4.0,Part Time,2.0,12.0,3.0,3.0,7.0,0.0,7.0,1987.0,"53914,11$",True
1613,No,Unknown,1118.928571,7.0,2.0,Medical,1614.0,4.0,F,3.0,3.0,mAnUfactURInG DiRECTOr,3.0,Unknown,"9037,50$","23497,50$",6.0,Unknown,13.0,3.0,2.0,Full Time,1.0,10.0,3.0,3.0,8.0,0.0,7.0,1977.0,"108450,00$",0


#### Columna businesstravel

- Es de tipo object.

- Tiene guiones bajos, minúsculas inconsistentes.

- Se van a reemplazar _ por espacio + Title Case

In [113]:
# Limpieza de la columna 'businesstravel'
# ====================================================
if 'businesstravel' in df.columns:
    df['businesstravel'] = (
        df['businesstravel']
        .astype(str)           # Asegurar que es texto
        .str.strip()           # Quitar espacios iniciales/finales
        .str.replace('_', ' ') # Reemplazar guiones bajos por espacios
        .str.title()           # Poner en Title Case
    )
    print("Columna 'businesstravel' normalizada correctamente (sin guiones y con Title Case).")
else:
    print("La columna 'businesstravel' no existe en el DataFrame.")

# Mostrar los valores únicos para comprobar
print("\nValores únicos en 'businesstravel':")
print(df['businesstravel'].unique())

Columna 'businesstravel' normalizada correctamente (sin guiones y con Title Case).

Valores únicos en 'businesstravel':
['Unknown' 'Travel Rarely' 'Travel Frequently' 'Non-Travel']


In [114]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


#### educationfield

- Es de tipo object

- Le hacemos Title Case

In [116]:
# Limpieza de la columna 'educationfield'
# ====================================================
if 'educationfield' in df.columns:
    df['educationfield'] = (
        df['educationfield']
        .astype(str)      # Aseguramos que sea texto
        .str.strip()      # Quitamos espacios en blanco
        .str.title()      # Aplicamos Title Case
    )
    print("Columna 'educationfield' normalizada correctamente (Title Case aplicado).")
else:
    print("La columna 'educationfield' no existe en el DataFrame.")

# Mostrar valores únicos para comprobar
print("\nValores únicos en 'educationfield':")
print(df['educationfield'].unique())

Columna 'educationfield' normalizada correctamente (Title Case aplicado).

Valores únicos en 'educationfield':
['Unknown' 'Life Sciences' 'Technical Degree' 'Medical' 'Other'
 'Marketing' 'Human Resources']


In [117]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


#### maritalstatus

- Es de tipo object.

- Tiene duplicados (Marreid, divorced)

- Corregimos esos duplicados mal escritos

In [119]:
# Limpieza de la columna 'maritalstatus'
# ====================================================
if 'maritalstatus' in df.columns:
    df['maritalstatus'] = (
        df['maritalstatus']
        .astype(str)      # Aseguramos que es texto
        .str.strip()      # Quitamos espacios extra
        .str.lower()      # Pasamos a minúsculas para corregir con facilidad
        .replace({
            'marreid': 'married',
            'divorced': 'divorced'  # en caso de estar en minúsculas
        })
        .str.title()      # Volvemos a Title Case (Married, Divorced, Single, etc.)
    )
    print("Columna 'maritalstatus' corregida y normalizada (errores tipográficos arreglados).")
else:
    print("La columna 'maritalstatus' no existe en el DataFrame.")

# Mostrar valores únicos para comprobar
print("\nValores únicos en 'maritalstatus':")
print(df['maritalstatus'].unique())

Columna 'maritalstatus' corregida y normalizada (errores tipográficos arreglados).

Valores únicos en 'maritalstatus':
['Unknown' 'Married' 'Divorced' 'Single']


In [120]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


#### overtime

- Es de tipo object

- Incluye "Unknown"

- Vamos a mantener y capitalizar

In [121]:
# Limpieza de la columna 'overtime'
# ====================================================
if 'overtime' in df.columns:
    df['overtime'] = (
        df['overtime']
        .astype(str)      # Aseguramos tipo texto
        .str.strip()      # Quitamos espacios extra
        .str.capitalize() # Capitalizamos (Yes, No, Unknown)
    )
    print("Columna 'overtime' normalizada (mantiene 'Unknown' y aplica capitalización).")
else:
    print("La columna 'overtime' no existe en el DataFrame.")

# Mostrar valores únicos para comprobar
print("\nValores únicos en 'overtime':")
print(df['overtime'].unique())

Columna 'overtime' normalizada (mantiene 'Unknown' y aplica capitalización).

Valores únicos en 'overtime':
['No' 'Unknown' 'Yes']


In [122]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,"16280,83$","42330,17$",7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,"195370,00$",Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,Unknown,"43331,17$",0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,"199990,00$",1
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,Unknown,"41669,33$",1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,"192320,00$",1
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,"14307,50$","37199,50$",3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,"171690,00$",False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,"12783,92$","33238,20$",2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,Unknown,0


#### 

#### Monthlyincome, monthlyrate, salary
- Es de tipo object
- Son numéricos con símbolos o comas
- Vamos a eliminar símbolos y convertir a float

In [123]:

# ====================================================
# Limpieza de columnas numéricas con símbolos
# ====================================================
cols_num = ['monthlyincome', 'monthlyrate', 'salary']

for col in cols_num:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)
            .str.replace(r'[\$,]', '', regex=True)  # Elimina $ y comas
            .str.replace(' ', '', regex=False)      # Elimina espacios
            .str.replace(',', '.', regex=False)     # Convierte coma decimal a punto
        )
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convierte a float

        print(f"Columna '{col}' limpiada y convertida a tipo float.")
    else:
        print(f"La columna '{col}' no existe en el DataFrame.")

# Verificar tipos y valores
print("\nTipos de datos después de la conversión:")
print(df[cols_num].dtypes)

print("\nMuestras de valores convertidos:")
print(df[cols_num].head())

Columna 'monthlyincome' limpiada y convertida a tipo float.
Columna 'monthlyrate' limpiada y convertida a tipo float.
Columna 'salary' limpiada y convertida a tipo float.

Tipos de datos después de la conversión:
monthlyincome    float64
monthlyrate        int64
salary           float64
dtype: object

Muestras de valores convertidos:
   monthlyincome  monthlyrate      salary
0      1628083.0      4233017  19537000.0
1            NaN      4333117  19999000.0
2            NaN      4166933  19232000.0
3      1430750.0      3719950  17169000.0
4      1278392.0      3323820         NaN


In [124]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,19537000.0,Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,19999000.0,1
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,19232000.0,1
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,17169000.0,False
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,,0


| Columna                                  | Tipo    | Problema detectado                              | Acción recomendada                      |
| ---------------------------------------- | ------- | ----------------------------------------------- | --------------------------------------- |
| `gender`                                 | float64 | Debería ser categórica (“M” / “F”)              | Convertir a `category` y normalizar     |
| `businesstravel`                         | object  | Guiones bajos, minúsculas inconsistentes        | Reemplazar `_` por espacio + Title Case |
| `department`                             | object  | Correcto salvo “Unknown”                        | Mantener formato                        |
| `educationfield`                         | object  | Ok, pero forzar Title Case                      | Title Case                              |
| `maritalstatus`                          | object  | Duplicados (“Marreid”, “divorced”)              | Corregir → “Married”, “Divorced”        |
| `overtime`                               | object  | Incluye “Unknown”                               | Mantener y capitalizar                  |
| `standardhours`                          | object  | “Full Time”, “Part Time”, “Unknown”             | Capitalizar correctamente               |
| `monthlyincome`, `monthlyrate`, `salary` | object  | Numéricos con símbolos o comas                  | Eliminar símbolos y convertir a float   |
| `remotework`                             | object  | Contiene “Yes”, “No”, “1”, “0”, “True”, “False” | Unificar en “Yes” / “No”                |
| `roledepartament`                        | object  | 302 valores → redundante o inconsistente        | Revisar o eliminar                      |
| `jobrole`                                | object  | 1579 valores únicos → posiblemente redundante   | Revisar o agrupar si es necesario       |


#### roledepartment
- Es de tipo object
- Valores redundantes o inconsistentes. Revisar o eliminar

#### jobrole
- Es de tipo object
- Muchos valores redundantes
- Podemos agrupar si es necesario

#### remotework

- Es de tipo object
- Contiene Yes, No, 1, 0, True, False
- Unificamos en Yes/No

In [125]:

# Limpieza de la columna 'remotework'
# ====================================================
if 'remotework' in df.columns:
    df['remotework'] = (
        df['remotework']
        .astype(str)        # Aseguramos que sea texto
        .str.strip()        # Quitamos espacios
        .str.lower()        # Pasamos a minúsculas para normalizar
        .replace({
            '1': 'yes',
            'true': 'yes',
            '0': 'no',
            'false': 'no'
        })
        .str.capitalize()   # Dejamos formato "Yes" / "No"
    )

    print("✅ Columna 'remotework' unificada correctamente (Yes/No).")
else:
    print("⚠️ La columna 'remotework' no existe en el DataFrame.")

# Mostrar valores únicos para comprobar
print("\nValores únicos en 'remotework':")
print(df['remotework'].unique())

✅ Columna 'remotework' unificada correctamente (Yes/No).

Valores únicos en 'remotework':
['Yes' 'No']


In [126]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.722222,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,19537000.0,Yes
1,No,Unknown,2063.388889,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,19999000.0,Yes
2,No,Travel Rarely,1984.253968,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,19232000.0,Yes
3,No,Travel Rarely,1771.404762,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,17169000.0,No
4,No,Unknown,1582.771346,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,,No


#### Redondeo de las columnas numéricas a dos decimales

In [127]:
# Función para redondear columnas numéricas
# ====================================================
def redondear_numericas(df, decimales=2):
    """
    Redondea todas las columnas numéricas de un DataFrame
    al número de decimales indicado.

    Parámetros:
    -----------
    df : pd.DataFrame
        El DataFrame a procesar.
    decimales : int (por defecto=2)
        Número de decimales al que redondear.

    Retorna:
    --------
    df : pd.DataFrame
        DataFrame con columnas numéricas redondeadas.
    """
    # Detectar columnas numéricas
    cols_num = df.select_dtypes(include=[np.number]).columns.tolist()

    # Redondear solo si existen columnas numéricas
    if cols_num:
        df[cols_num] = df[cols_num].round(decimales)
        print(f"✅ {len(cols_num)} columnas numéricas redondeadas a {decimales} decimales.")
    else:
        print("⚠️ No se encontraron columnas numéricas para redondear.")

    return df

In [128]:
df= redondear_numericas(df)

✅ 23 columnas numéricas redondeadas a 2 decimales.


In [129]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1972.0,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1971.0,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1981.0,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1976.0,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1977.0,,No


#### datebirth


In [130]:
#Limpieza de la columna 'datebirth'
# ====================================================
if 'datebirth' in df.columns:
    # Convertimos la columna a formato fecha
    df['datebirth'] = pd.to_datetime(df['datebirth'], errors='coerce')

    # Extraemos solo el año
    df['datebirth'] = df['datebirth'].dt.year

    # Si quedan valores NaN (porque no eran fechas), intentamos leerlos como números
    df['datebirth'] = df['datebirth'].fillna(
        pd.to_numeric(df['datebirth'], errors='coerce')
    )

    # Convertimos a entero (sin decimales)
    df['datebirth'] = df['datebirth'].astype('Int64')

    print("✅ Columna 'datebirth' corregida: solo año, sin decimales.")
else:
    print("⚠️ La columna 'datebirth' no existe en el DataFrame.")

# Mostrar los primeros valores para comprobar
print("\nEjemplo de valores en 'datebirth':")
print(df['datebirth'].head())
print("\nTipo de dato:", df['datebirth'].dtypes)

✅ Columna 'datebirth' corregida: solo año, sin decimales.

Ejemplo de valores en 'datebirth':
0    1970
1    1970
2    1970
3    1970
4    1970
Name: datebirth, dtype: Int64

Tipo de dato: Int64


In [131]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1970,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1970,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1970,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1970,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1970,,No


In [133]:
# ====================================================
# Detección automática de columnas tipo "año"
# ====================================================
# Criterio: columnas numéricas u objeto con valores entre 1900 y el año actual
current_year = pd.Timestamp.now().year

cols_anio = []
for col in df.columns:
    # Intentamos convertir la columna a número
    temp = pd.to_numeric(df[col], errors='coerce')
    # Si la mayoría de valores están en rango típico de años, la marcamos
    validos = temp.between(1900, current_year, inclusive='both').sum()
    if validos > 0 and validos / len(df) > 0.5:  # más del 50% de valores parecen años
        cols_anio.append(col)

# ====================================================
#  Conversión de columnas detectadas a Int64
# ====================================================
for col in cols_anio:
    df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
    print(f"'{col}' convertida correctamente a año entero (Int64).")

# ====================================================
# Verificación
# ====================================================
print("\n Columnas detectadas como 'año':")
print(cols_anio)

print("\nTipos de datos después de la conversión:")
print(df[cols_anio].dtypes)

print("\n Ejemplo de valores:")
print(df[cols_anio].head())

'datebirth' convertida correctamente a año entero (Int64).

 Columnas detectadas como 'año':
['datebirth']

Tipos de datos después de la conversión:
datebirth    Int64
dtype: object

 Ejemplo de valores:
   datebirth
0       1970
1       1970
2       1970
3       1970
4       1970


In [134]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10.0,5.0,3.0,20.0,15.0,15.0,1970,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34.0,5.0,3.0,33.0,11.0,9.0,1970,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22.0,3.0,3.0,22.0,11.0,15.0,1970,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10.0,2.0,3.0,20.0,5.0,6.0,1970,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10.0,5.0,3.0,19.0,2.0,8.0,1970,,No


In [137]:
# Conversión de columnas de años a tipo entero
# ====================================================
cols_anios = [
    'yearsatcompany',
    'yearssincelastpromotion',
    'yearswithcurrmanager',
    'totalworkingyears'
]

for col in cols_anios:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
        print(f"'{col}' convertido correctamente a año entero (Int64).")
    else:
        print(f"La columna '{col}' no existe en el DataFrame.")

# ====================================================
# 🔍 Verificación
# ====================================================
print("\n Tipos de datos después de la conversión:")
print(df[cols_anios].dtypes)

print("\nEjemplo de valores:")
print(df[cols_anios].head())

'yearsatcompany' convertido correctamente a año entero (Int64).
'yearssincelastpromotion' convertido correctamente a año entero (Int64).
'yearswithcurrmanager' convertido correctamente a año entero (Int64).
'totalworkingyears' convertido correctamente a año entero (Int64).

 Tipos de datos después de la conversión:
yearsatcompany             Int64
yearssincelastpromotion    Int64
yearswithcurrmanager       Int64
totalworkingyears          Int64
dtype: object

Ejemplo de valores:
   yearsatcompany  yearssincelastpromotion  yearswithcurrmanager  \
0              20                       15                    15   
1              33                       11                     9   
2              22                       11                    15   
3              20                        5                     6   
4              19                        2                     8   

   totalworkingyears  
0                 10  
1                 34  
2                 22  
3              

In [138]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10,5.0,3.0,20,15,15,1970,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34,5.0,3.0,33,11,9,1970,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22,3.0,3.0,22,11,15,1970,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10,2.0,3.0,20,5,6,1970,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10,5.0,3.0,19,2,8,1970,,No


In [139]:

# ====================================================
# 1️Comprobación general
# ====================================================
total_nulos = df.isna().sum().sum()

if total_nulos == 0:
    print("No quedan valores nulos en el dataset.")
else:
    print(f"Quedan {total_nulos} valores nulos en total.\n")

    # ====================================================
    # Detalle por columna (solo las que tienen nulos)
    # ====================================================
    nulos_df = pd.DataFrame({
        "Columna": df.columns,
        "Nulos": df.isna().sum(),
        "Porcentaje (%)": (df.isna().sum() / len(df) * 100).round(2),
        "Tipo de dato": df.dtypes.astype(str)
    })

    nulos_df = nulos_df[nulos_df["Nulos"] > 0].sort_values(by="Porcentaje (%)", ascending=False)

    display(nulos_df)

Quedan 742 valores nulos en total.



Unnamed: 0,Columna,Nulos,Porcentaje (%),Tipo de dato
monthlyincome,monthlyincome,468,29.0,float64
salary,salary,274,16.98,float64


In [141]:
# Imputación de valores nulos por la mediana
# ====================================================
cols_imputar = ['monthlyincome', 'salary']

for col in cols_imputar:
    if col in df.columns:
        mediana = df[col].median()
        nulos_antes = df[col].isna().sum()
        df[col] = df[col].fillna(mediana)
        print(f"'{col}' imputado con la mediana ({mediana:.2f}). "
              f"Se reemplazaron {nulos_antes} valores nulos.")
    else:
        print(f"La columna '{col}' no existe en el DataFrame.")

# ====================================================
# Verificación final
# ====================================================
print("\nNulos restantes después de imputar:")
print(df[cols_imputar].isna().sum())

'monthlyincome' imputado con la mediana (449284.00). Se reemplazaron 0 valores nulos.
'salary' imputado con la mediana (5391411.00). Se reemplazaron 0 valores nulos.

Nulos restantes después de imputar:
monthlyincome    0
salary           0
dtype: int64


In [142]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,resEArch DIREcToR,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10,5.0,3.0,20,15,15,1970,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,ManAGeR,3.0,Unknown,449284.0,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34,5.0,3.0,33,11,9,1970,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,ManaGER,4.0,Married,449284.0,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22,3.0,3.0,22,11,15,1970,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,ReseArCH DIrECtOr,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10,2.0,3.0,20,5,6,1970,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,sAleS EXECUtIve,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10,5.0,3.0,19,2,8,1970,5391411.0,No


In [143]:
# Limpieza y normalización de la columna 'jobrole'
# ====================================================
if 'jobrole' in df.columns:
    df['jobrole'] = (
        df['jobrole']
        .astype(str)          # Asegurar texto
        .str.strip()          # Quitar espacios al inicio y final
        .str.lower()          # Todo a minúsculas
        .str.replace('_', ' ')# Por si hay guiones bajos
        .str.title()          # Formato Title Case (Research Director, Manager, etc.)
    )
    print("Columna 'jobrole' normalizada correctamente (Title Case aplicado).")
else:
    print("La columna 'jobrole' no existe en el DataFrame.")

# Mostrar valores únicos para verificar
print("\nValores únicos en 'jobrole':")
print(df['jobrole'].unique())

Columna 'jobrole' normalizada correctamente (Title Case aplicado).

Valores únicos en 'jobrole':
['Research Director' 'Manager' 'Sales Executive' 'Manufacturing Director'
 'Research Scientist' 'Healthcare Representative' 'Laboratory Technician'
 'Sales Representative' 'Human Resources']


In [144]:
df.head()

Unnamed: 0,attrition,businesstravel,dailyrate,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,No,Unknown,2015.72,6.0,3.0,Unknown,1.0,1.0,F,3.0,5.0,Research Director,3.0,Unknown,1628083.0,4233017,7.0,No,13.0,3.0,3.0,Full Time,0.0,10,5.0,3.0,20,15,15,1970,19537000.0,Yes
1,No,Unknown,2063.39,1.0,4.0,Life Sciences,2.0,3.0,F,2.0,5.0,Manager,3.0,Unknown,449284.0,4333117,0.0,Unknown,14.0,3.0,1.0,Unknown,1.0,34,5.0,3.0,33,11,9,1970,19999000.0,Yes
2,No,Travel Rarely,1984.25,4.0,2.0,Technical Degree,3.0,3.0,F,3.0,5.0,Manager,4.0,Married,449284.0,4166933,1.0,No,11.0,3.0,4.0,Unknown,0.0,22,3.0,3.0,22,11,15,1970,19232000.0,Yes
3,No,Travel Rarely,1771.4,2.0,4.0,Medical,4.0,1.0,M,3.0,4.0,Research Director,3.0,Married,1430750.0,3719950,3.0,Unknown,19.0,3.0,2.0,Full Time,2.0,10,2.0,3.0,20,5,6,1970,17169000.0,No
4,No,Unknown,1582.77,3.0,3.0,Technical Degree,5.0,1.0,M,4.0,4.0,Sales Executive,1.0,Divorced,1278392.0,3323820,2.0,No,12.0,3.0,4.0,Unknown,1.0,10,5.0,3.0,19,2,8,1970,5391411.0,No
