In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning) # silencia los FutureWarning (avisos de cambios futuros en las librerías)

# importamos las librerías que necesitamos
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt
# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

In [200]:
class LimpiarDatos:
    """Cleaning pipeline for the raw HR dataset.

    The raw CSV is loaded into ``self.df`` on construction; every method then
    updates ``self.df``. The methods are meant to be called in this order:
    ``cambiar_nombres_columnas`` -> ``eliminar_columnas`` -> ``cambiar_a_numerico``
    (``eliminar_columnas`` drops columns by their *renamed* names).
    """

    def __init__(self, ruta_csv="HR RAW DATA.csv"):
        """Load the raw data into ``self.df``.

        Args:
            ruta_csv: Path (or file-like object) handed to ``pd.read_csv``.
                Defaults to the file name this notebook has always used, so
                ``LimpiarDatos()`` behaves exactly as before.
        """
        self.df = pd.read_csv(ruta_csv, index_col=0)

    @staticmethod
    def _reemplazar_en_textos(serie, buscar, reemplazo):
        """Return ``serie`` with a literal substring replaced in its str elements.

        Non-string elements (numbers, NaN, None) are passed through untouched,
        so calling this on a column that is already numeric is a no-op instead
        of an error.
        """
        return serie.map(
            lambda valor: valor.replace(buscar, reemplazo) if isinstance(valor, str) else valor
        )

    def cambiar_nombres_columnas(self):
        """Rename the inconsistently-cased raw columns to CamelCase names.

        Keys that are not present in the frame are ignored by ``rename``.
        """
        nombres_columnas = {
            "employeecount": "EmployeeCount",
            "employeenumber": "EmployeeNumber",
            "NUMCOMPANIESWORKED": "NumCompaniesWorked",
            "TOTALWORKINGYEARS": "TotalWorkingYears",
            "WORKLIFEBALANCE": "WorkLifeBalance",
            "YEARSWITHCURRMANAGER": "YearsWithCurrManager",
            "NUMBERCHILDREN": "NumberChildren",
        }
        # Frame-level inplace is kept on purpose: it preserves the identity of
        # self.df for any caller already holding a reference to it.
        self.df.rename(columns=nombres_columnas, inplace=True)

    def eliminar_columnas(self):
        """Drop the columns that are not used further on.

        Raises:
            KeyError: if any of the columns is missing (for example when the
                method is run twice, or before ``cambiar_nombres_columnas``).
        """
        columnas_eliminar = [
            "EmployeeCount",
            "Salary",
            "NumberChildren",
            "SameAsMonthlyIncome",
            "DateBirth",
            "YearsInCurrentRole",
        ]
        self.df.drop(columns=columnas_eliminar, inplace=True)

    def cambiar_a_numerico(self):
        """Normalise the text-encoded numeric columns and convert them to numbers.

        Steps: spelled-out ages become digit strings, the ``$`` suffix is
        stripped from ``DailyRate``, decimal commas become decimal points, and
        the listed columns are converted with ``pd.to_numeric``; values that
        cannot be parsed become NaN.

        Raises:
            KeyError: if ``Age`` or ``DailyRate`` is missing from the frame.
        """
        # Ages written out in words -> digit strings (converted to numbers below).
        edad = {'forty-seven': "47", 'fifty-eight': "58", 'thirty-six' : "36", 'fifty-five': "55",'fifty-two': "52",'thirty-one': "31",'thirty': "30", 'twenty-six': "26", 'thirty-seven': "37", 'thirty-two': "32", 'twenty-four':"24"}
        # Assign the result back: an inplace replace on the selected column is a
        # chained assignment and does not reach the frame under Copy-on-Write.
        self.df["Age"] = self.df["Age"].replace(edad)

        # Strip the "$" suffix. The replacement is literal, so "$" can never be
        # interpreted as the regex end-of-string anchor.
        self.df["DailyRate"] = self._reemplazar_en_textos(self.df["DailyRate"], "$", "")

        # Decimal comma -> decimal point.
        columnas_coma_decimal = ["DailyRate", "EmployeeNumber", "MonthlyIncome", "TotalWorkingYears"]
        for columna in columnas_coma_decimal:
            if columna not in self.df.columns:
                # Best-effort behaviour kept from the original: a missing
                # column is created empty so later steps can still select it.
                self.df[columna] = np.nan
                continue
            # Only str elements are edited; a column that is already numeric
            # keeps its values instead of being overwritten with NaN.
            self.df[columna] = self._reemplazar_en_textos(self.df[columna], ",", ".")

        # Convert to numeric dtype; unparseable values become NaN.
        # NOTE(review): "StandardHours" is converted here without any prior
        # normalisation, so every raw value that is not a plain number ends up
        # as NaN - confirm that this is intended.
        columnas_numericas = ["Age", "DailyRate", "EmployeeNumber", "HourlyRate", "MonthlyIncome", "StandardHours", "TotalWorkingYears"]
        for col in columnas_numericas:
            if col not in self.df.columns:
                self.df[col] = np.nan
                continue
            try:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
            except (TypeError, ValueError):
                # Column holds objects pandas cannot parse at all: keep the
                # original fallback of an all-NaN column.
                self.df[col] = np.nan
           

In [201]:
# Run the cleaning pipeline: constructing LimpiarDatos loads the raw CSV into
# limpieza.df, and each call below then updates limpieza.df.
limpieza = LimpiarDatos()
# Rename first: eliminar_columnas() drops "EmployeeCount" and "NumberChildren",
# names that only exist once the columns have been renamed.
limpieza.cambiar_nombres_columnas()
limpieza.eliminar_columnas()
limpieza.cambiar_a_numerico()

In [205]:
# Check dtypes and non-null counts for a selection of numeric columns after cleaning.
limpieza.df[["Age", "HourlyRate", "MonthlyIncome", "MonthlyRate", "StandardHours", "DailyRate", "EmployeeNumber", "TotalWorkingYears" ]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1614 non-null   int64  
 1   HourlyRate         1530 non-null   float64
 2   MonthlyIncome      771 non-null    float64
 3   MonthlyRate        1614 non-null   int64  
 4   StandardHours      0 non-null      float64
 5   DailyRate          1490 non-null   float64
 6   EmployeeNumber     1183 non-null   float64
 7   TotalWorkingYears  1088 non-null   float64
dtypes: float64(6), int64(2)
memory usage: 113.5 KB


In [154]:
# Reload the raw CSV into a separate frame (independent of limpieza.df) and
# list the distinct raw values of HourlyRate.
df = pd.read_csv("HR RAW DATA.csv", index_col=0)
df["HourlyRate"].unique()

array(['51', '65', '58', '82', '45', '99', '91', '64', '55', '68', '49',
       '61', '79', '31', '69', '48', '80', '74', '98', '59', '33', '56',
       '66', '57', '53', '87', '81', '84', '32', '41', '92', '47',
       'Not Available', '43', '86', '30', '42', '88', '96', '67', '62',
       '72', '78', '89', '52', '50', '90', '37', '94', '76', '60', '46',
       '83', '100', '40', '97', '54', '75', '39', '85', '63', '44', '93',
       '36', '35', '73', '71', '70', '38', '77', '95', '34'], dtype=object)

In [100]:
# Dtypes and non-null counts for every column of the cleaned frame.
# NOTE(review): this cell's execution count (In [100]) is lower than the class
# cell's (In [200]), so the recorded output comes from an earlier run - re-run it.
limpieza.df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1614 entries, 0 to 1613
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Age                       0 non-null      float64
 1   Attrition                 1614 non-null   object 
 2   BusinessTravel            842 non-null    object 
 3   DailyRate                 1490 non-null   float64
 4   Department                302 non-null    object 
 5   DistanceFromHome          1614 non-null   int64  
 6   Education                 1614 non-null   int64  
 7   EducationField            869 non-null    object 
 8   EmployeeNumber            1183 non-null   float64
 9   EnvironmentSatisfaction   1614 non-null   int64  
 10  Gender                    1614 non-null   int64  
 11  HourlyRate                0 non-null      float64
 12  JobInvolvement            1614 non-null   int32  
 13  JobLevel                  1614 non-null   int64  
 14  JobRole      

In [101]:
# Preview the first rows of the cleaned frame.
# NOTE(review): executed as In [101], before the current class definition
# (In [200]); the recorded output may be stale - re-run it.
limpieza.df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,RoleDepartament,RemoteWork
0,,No,,684.0,,6,3,,162.0,1,0,,3,5,resEArch DIREcToR,3,,,6462,7,Y,No,13,30,3,,0,,5,30.0,20,,15,15,,Yes
1,,No,,699.0,,1,4,Life Sciences,259.0,3,0,,2,5,ManAGeR,3,,,5678,0,,,14,30,1,,1,34.0,5,30.0,33,,11,9,,1
2,,No,travel_rarely,532.0,Research & Development,4,2,Technical Degree,319.0,3,0,,3,5,ManaGER,4,Married,,4933,1,,No,11,30,4,,0,22.0,3,,22,,11,15,ManaGER - Research & Development,1
3,,No,travel_rarely,359.0,,2,4,Medical,,1,1,,3,4,ReseArCH DIrECtOr,3,Married,,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,,False
4,,No,,1319.0,,3,3,Technical Degree,,1,1,,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,,0


In [None]:
# NOTE(review): scratch list from a cell that was never executed (In [None]);
# nothing visible in this notebook reads it. It also uses the raw name
# "employeenumber", which LimpiarDatos.cambiar_nombres_columnas renames to
# "EmployeeNumber". Candidate for removal - confirm.
cambiar_numerico = ["Age","DailyRate", "DistanceFromHome", "employeenumber", "HourlyRate", "MonthlyIncome", "MonthlyRate"]

In [18]:
# Load the raw CSV (first column used as the index) and preview the first rows.
# NOTE(review): this rebinds `df`, which another cell of the notebook also
# assigns from the same file; consider loading the raw data once.
df = pd.read_csv("HR RAW DATA.csv", index_col=0)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,employeecount,employeenumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NUMCOMPANIESWORKED,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TOTALWORKINGYEARS,TrainingTimesLastYear,WORKLIFEBALANCE,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YEARSWITHCURRMANAGER,SameAsMonthlyIncome,DateBirth,Salary,RoleDepartament,NUMBERCHILDREN,RemoteWork
0,51,No,,"684,0$",,6,3,,1,1620.0,1,0,51,3,5,resEArch DIREcToR,3,,195370.0,6462,7,Y,No,13,30,3,,0,,5,30.0,20,,15,15,195370.0,1972,1000000000$,,,Yes
1,52,No,,"699,0$",,1,4,Life Sciences,1,2590.0,3,0,65,2,5,ManAGeR,3,,199990.0,5678,0,,,14,30,1,,1,340.0,5,30.0,33,,11,9,199990.0,1971,1000000000$,,,1
2,42,No,travel_rarely,"532,0$",Research & Development,4,2,Technical Degree,1,3190.0,3,0,58,3,5,ManaGER,4,Married,192320.0,4933,1,,No,11,30,4,,0,220.0,3,,22,,11,15,192320.0,1981,1000000000$,ManaGER - Research & Development,,1
3,47,No,travel_rarely,"359,0$",,2,4,Medical,1,,1,1,82,3,4,ReseArCH DIrECtOr,3,Married,171690.0,26703,3,Y,,19,30,2,,2,,2,,20,,5,6,171690.0,1976,1000000000$,,,False
4,46,No,,"1319,0$",,3,3,Technical Degree,1,,1,1,45,4,4,sAleS EXECUtIve,1,Divorced,,7739,2,Y,No,12,30,4,,1,,5,30.0,19,,2,8,,1977,1000000000$,,,0
