In [28]:
import pandas as pd
import numpy as np
from datetime import datetime
# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed

In [29]:
# Load the HR dataset, using the first CSV column as the index.
# NOTE(review): later cells write df_clean back to this same file name, so a
# re-run reads data that has already been processed — confirm this is intended.
df_clean = pd.read_csv('hr_raw_data_clean.csv', index_col=0)

### FUNCIÓN PARA HOMOGENEIZAR DATOS DE "AGE"

In [3]:
def age(year):
    """Return the age that corresponds to a birth year.

    The age is the current calendar year, read from ``datetime.now()``,
    minus ``year``; the result therefore depends on when the code runs.
    """
    current_year = datetime.now().year
    return current_year - year

# Recompute "age" from the birth year stored in "datebirth".
df_clean["age"] = df_clean["datebirth"].map(age)

df_clean.sample(5)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
1216,47,No,travel_rarely,1973.984127,,2,4,,1,1217,3,0,246.748016,3,5,REseARcH DiRectOr,4,,"15943,72$","41453,67$",7,Y,,17,30,3,Full Time,1,240.0,3,30,2,,2,2,"15943,72$",1977,"191324,62$",,,1
1487,22,No,,213.571429,,-25,1,,1,1488,4,1,26.696429,2,1,REsEaRch scIenTIsT,2,Single,,"4485,00$",1,Y,Yes,11,30,3,Part Time,0,20.0,6,40,2,,2,2,,2002,"20700,00$",,,1
1060,45,No,,352.857143,,20,3,Life Sciences,1,1061,4,0,44.107143,3,1,rEsEarCH SCIeNtist,2,,"2850,00$","7410,00$",1,Y,,13,30,3,Full Time,0,60.0,3,20,5,,1,3,"2850,00$",1979,"34200,00$",,,0
1389,46,No,travel_frequently,1863.436508,,25,3,Medical,1,1390,2,1,,3,5,manAGEr,2,,"15050,83$","39132,17$",3,,,22,40,3,Full Time,0,220.0,4,30,0,,0,0,"15050,83$",1978,"180610,00$",,,0
1499,32,No,travel_rarely,564.15873,,29,4,Marketing,1,1500,1,1,,2,2,saLeS eXEcUTivE,4,,,"11847,33$",1,,,14,30,1,Full Time,2,,3,30,12,,5,7,,1992,"54680,00$",,,1


In [5]:
# Number of missing values left in "age".
df_clean["age"].isna().sum()

np.int64(0)

### FUNCIÓN PARA REEMPLAZAR , por . Y ELIMINAR $

In [None]:
def replace_dot(cadena):
    """Convert a money-like text value such as ``"1234,56$"`` to ``float``.

    Text values have every "," replaced by "." and every "$" removed before
    being parsed. Values that are already numeric are returned as ``float``
    instead of being discarded, so applying the function twice to the same
    column does not wipe it.

    Parameters
    ----------
    cadena : str, int, float or any other object
        Cell value to convert.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the value is missing, is not
        text/numeric, or the cleaned text is not a valid number.
    """
    if isinstance(cadena, str):
        try:
            return float(cadena.replace(",", ".").replace("$", ""))
        except ValueError:
            # The cleaned text is still not a number (e.g. "abc" or "1.234.56").
            return np.nan

    # bool is a subclass of int; it is not treated as a number here.
    if isinstance(cadena, (int, float, np.number)) and not isinstance(cadena, bool):
        return float(cadena)

    # None and any other unsupported type are reported as missing.
    return np.nan

# Columns converted to float with replace_dot.
lista_columnas = [
    "monthlyincome",
    "monthlyrate",
    "performancerating",
    "totalworkingyears",
    "worklifebalance",
    "sameasmonthlyincome",
    "salary",
]

for columna in lista_columnas:
    df_clean[columna] = df_clean[columna].map(replace_dot)

df_clean.head()

### redondear valores de tipo float con muchos decimales, a 2 decimales

In [None]:
# Keep two decimals in both rate columns.
df_clean["dailyrate"] = df_clean["dailyrate"].round(2)
df_clean["hourlyrate"] = df_clean["hourlyrate"].round(2)
df_clean.head(2)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,52,No,,2015.72,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,16280.83,42330.17,7,Y,No,13,3.0,3,Full Time,0,,5,3.0,20,,15,15,16280.83,1972,195370.0,,,Yes
1,53,No,,2063.39,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,43331.17,0,,,14,3.0,1,,1,34.0,5,3.0,33,,11,9,,1971,199990.0,,,1


In [6]:
# Round the annual salary to two decimals with the vectorised Series.round
# rather than a row-by-row DataFrame.apply over the whole frame.
df_clean['salary'] = df_clean['salary'].round(2)

In [17]:
def eliminar_columnas(df, nombre_columna):
    """Drop a column from ``df`` in place and return a one-row preview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the column is removed from this very object.
    nombre_columna : label or list of labels
        Column(s) to drop.

    Returns
    -------
    pandas.DataFrame
        ``df.head(1)`` after the drop.

    Raises
    ------
    KeyError
        If ``nombre_columna`` is not a column of ``df``.
    """
    # drop(..., inplace=True) returns None, so the preview is taken from df itself.
    df.drop(columns=nombre_columna, inplace=True)
    print(f'La columna {nombre_columna} se ha eliminado correctamente.')
    return df.head(1)

### FUNCIÓN MAP PARA HOMOGENEIZAR REMOTEWORK

In [8]:
# Normalise "remotework" to "Yes"/"No". Series.map turns every value that is
# not a key of the dictionary into NaN, so "No" needs its own entry just like
# "Yes"; otherwise existing "No" values would be lost.
diccionario_remote = {
    "1": "Yes",
    "0": "No",
    "Yes": "Yes",
    "No": "No",
    "True": "Yes",
    "False": "No",
}
df_clean["remotework"] = df_clean["remotework"].map(diccionario_remote)

df_clean.head(2)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,52,No,,2015.72,,6,3,,1,1,1,0,,3,5,resEArch DIREcToR,3,,16280.83,42330.17,7,Y,No,13,3.0,3,Full Time,0,,5,3.0,20,,15,15,16280.83,1972,195370.0,,,Yes
1,53,No,,2063.39,,1,4,Life Sciences,1,2,3,0,,2,5,ManAGeR,3,,,43331.17,0,,,14,3.0,1,,1,34.0,5,3.0,33,,11,9,,1971,199990.0,,,Yes


### FUNCIÓN PARA PONER EN MINÚSCULAS

In [9]:
def minusculas(cadena):
    """Return ``cadena`` in lower case, or ``"no data"`` if it cannot be lowered.

    Parameters
    ----------
    cadena : object
        Cell value; expected to be text.

    Returns
    -------
    str
        ``cadena.lower()`` when the value has a ``lower`` method, otherwise
        the literal ``"no data"``.
    """
    try:
        return cadena.lower()
    except AttributeError:
        # Non-text values (NaN floats, None, numbers) have no .lower().
        return "no data"

# Text columns that are lower-cased with minusculas.
lista_columnas = ["department", "educationfield", "jobrole", "roledepartament"]
for col in lista_columnas:
    df_clean[col] = df_clean[col].map(minusculas)

## FUNCIÓN PARA RATIOS DE "ENVIRONMENTSATISFACTION"

In [10]:
def clean_satisfaction(valoracion):
    """Bring a satisfaction rating that is 10 or greater back down by a factor of 10.

    Parameters
    ----------
    valoracion : number
        Rating as stored in the data.

    Returns
    -------
    int, number or float
        ``valoracion / 10`` truncated to ``int`` when ``valoracion >= 10``;
        ``valoracion`` unchanged when it is below 10 (NaN also falls here);
        ``np.nan`` when the value cannot be compared or converted.
    """
    try:
        if valoracion >= 10:
            # int() truncates the quotient directly; no string round-trip needed.
            return int(valoracion / 10)
        return valoracion
    except (TypeError, ValueError, OverflowError):
        # TypeError: non-numeric value; ValueError/OverflowError: NaN or infinite quotient.
        return np.nan


# Normalise every "environmentsatisfaction" rating with clean_satisfaction.
df_clean["environmentsatisfaction"] = df_clean["environmentsatisfaction"].map(clean_satisfaction)

### GENDER

In [11]:
# Recode the numeric gender codes as letters: 0 -> "M", 1 -> "F".
# Series.map yields NaN for any value that is not a key of the dictionary.
dict_gender = {0: "M", 1: "F"}

df_clean["gender"] = df_clean["gender"].map(dict_gender)

# 14 de noviembre de 2024

### FUNCIÓN PARA BORRAR COLUMNAS

In [19]:
def clean(dataframe, col_data):
    """Remove the column ``col_data`` from ``dataframe`` in place.

    The frame passed in is modified directly; the function returns ``None``.
    A ``KeyError`` is raised by pandas when the column does not exist.
    """
    dataframe.drop(columns=[col_data], inplace=True)
    return None


In [29]:
# Columns removed from df_clean, one at a time, with the clean() helper.
lista_drop= ["sameasmonthlyincome", "over18", "numberchildren", "employeecount"]

for col in lista_drop:
    clean(df_clean, col)


In [None]:
# Show 10 random rows of these three columns.
df_clean[["yearsincurrentrole", "yearssincelastpromotion", "yearsatcompany"]].sample(10)

In [None]:
# Show 10 random rows of the job level, role and department columns.
df_clean[["joblevel","jobrole", "roledepartament","department"]].sample(10)

### GUARDAR CSV

In [7]:
# Write the current state of df_clean (index included) to CSV.
# NOTE(review): this is the same file name the notebook loads at the top, so the
# input file is overwritten — confirm this is intended.
df_clean.to_csv("hr_raw_data_clean.csv")

### ELIMINAR '-' EN COLUMNA DISTANCEFROMHOME

In [9]:
# Collect the numeric columns to check whether any column other than
# "distancefromhome" contains negative values.
df_numericas = df_clean.select_dtypes(include="number")

In [None]:
# Count the negative values in each numeric column.
# The comparison is done on the numbers themselves: searching for "-" in the
# text form would also match exponent notation such as 1e-05.
for col in df_numericas.columns:
    num_menos_col = (df_numericas[col] < 0).sum()
    print(f' - está presente en {col} {num_menos_col} veces.')

In [None]:
def convert_negatives_in_absolute(df, columns):
    """Replace the selected column(s) of ``df`` with their absolute values.

    ``columns`` is used as-is to index ``df``. The frame is modified in place
    and the same object is returned.
    """
    valores_absolutos = df[columns].abs()
    df[columns] = valores_absolutos
    return df
df_clean = convert_negatives_in_absolute(df_clean, 'distancefromhome') # take absolute values of 'distancefromhome'
df_clean.sample(2)

In [14]:
# Number of negative values left in 'distancefromhome', compared numerically
# instead of searching for "-" in the text form of each value.
(df_clean['distancefromhome'] < 0).sum()

np.int64(0)

In [None]:
# Frequency of each distinct value in 'distancefromhome'.
df_clean['distancefromhome'].value_counts()

## CREAR COLUMNA NUEVA DE DEPARTAMENTO 
### A PARTIR DE LA INFO QUE TENEMOS EN EL DF[['jobrole', 'roledepartament', 'department']]

In [16]:
# Job role -> department lookup. Roles that are not listed here (managers, for
# example) keep the value of the 'roledepartament' column, so managers already
# assigned to a specific department (sales, human resources, ...) stay unchanged.
dic = {
    "healthcare representative": "research & development",
    "sales executive": "sales",
    "laboratory technician": "research & development",
    "manufacturing director": "research & development",
    "research scientist": "research & development",
    "research director": "research & development",
    "human resources": "human resources",
    "sales representative": "sales",
}

def assign_departament(puesto, departamento, otra_columna):
    """Look up the department for a job role, with a fallback value.

    Returns ``departamento[puesto]`` when ``puesto`` is a key of the
    ``departamento`` mapping; otherwise returns ``otra_columna``.
    """
    if puesto in departamento:
        return departamento[puesto]
    return otra_columna

# Strip surrounding whitespace from 'jobrole' and lower-case it: the dictionary
# keys were not being matched, possibly because some cells carried stray spaces.
df_clean['jobrole'] = df_clean['jobrole'].str.strip().str.lower()

# Fill 'correct_department' with the department that corresponds to each role,
# falling back to the row's 'roledepartament' when the role is not in `dic`.
df_clean['correct_department'] = [
    assign_departament(puesto, dic, rol_departamento)
    for puesto, rol_departamento in zip(df_clean['jobrole'], df_clean['roledepartament'])
]

### ELIMINAR COLUMNA 'ROLEDEPARTAMENT' Y 'DEPARTMENT'

In [21]:
# Reuse the clean() helper defined earlier to drop both columns.

lista_drop1= ["roledepartament", "department"]

for col in lista_drop1:
    clean(df_clean, col)

### CAMBIAMOS NOMBRE DE COLUMNA 'CORRECT_DEPARTMENT' A 'DEPARTMENT'

In [24]:
# Rename 'correct_department' to 'department' directly on df_clean.
df_clean.rename(columns={'correct_department': 'department'}, inplace=True)

## Introducir los valores que faltan en la columna 'salary' multiplicando el 'monthlyincome' * 12.

In [29]:
# Keep every salary that is greater than 0 (i.e. has data); otherwise compute
# the annual salary as 'monthlyincome' * 12.
df_clean['salary'] = df_clean['salary'].where(
    df_clean['salary'] > 0,
    df_clean['monthlyincome'] * 12,
)

## Introducir los valores que faltan en la columna 'monthlyincome' dividiendo el 'salary' / 12.

In [30]:
# Keep every monthly income that is greater than 0; otherwise derive it as
# 'salary' / 12.
df_clean['monthlyincome'] = df_clean['monthlyincome'].where(
    df_clean['monthlyincome'] > 0,
    df_clean['salary'] / 12,
)

In [31]:
# Difference between salary / 12 and the monthly income, to two decimals;
# the distinct values show which rows disagree.
comp = (df_clean['salary'] / 12 - df_clean['monthlyincome']).round(2)
comp.unique()

array([     0.  , -83242.17, -86790.33, -93142.17])

In [32]:
# Rows whose monthly income is larger than their annual salary.
df_clean.loc[
    df_clean['monthlyincome'] > df_clean['salary'],
    ['monthlyincome', 'salary', 'employeenumber'],
]

Unnamed: 0,monthlyincome,salary,employeenumber
1316,84083.0,10090.0,1317
1359,87667.0,10520.0,1360
1464,94083.0,11290.0,1465


In [33]:
# Show the monthly income of employees 1317, 1360 and 1465 divided by 100.
# NOTE(review): the result is only displayed, never assigned back to df_clean, so
# the stored values stay as they were — confirm whether a correction was intended.
df_clean.loc[df_clean['employeenumber'].isin([1317,1360,1465]), "monthlyincome"]/100

1316    840.83
1359    876.67
1464    940.83
Name: monthlyincome, dtype: float64


## CAMBIAR NULOS POR DATO (non-travel y full time)

In [34]:
def nulos(df, columna, dato):
    """Fill the missing values of ``df[columna]`` with ``dato``.

    The column is reassigned on ``df`` itself; nothing is returned.
    """
    columna_rellena = df[columna].fillna(dato)
    df[columna] = columna_rellena
    return None

In [35]:
# Missing "businesstravel" values are filled with "non-travel".
nulos(df_clean, "businesstravel", "non-travel")

In [38]:
# Missing "standardhours" values are filled with "Full Time".
nulos(df_clean, "standardhours", "Full Time")

In [39]:
# Distinct values left in "standardhours" after filling the nulls.
df_clean["standardhours"].unique()

array(['Full Time', 'Part Time'], dtype=object)

## FUNCIÓN PARA PASAR DE FLOAT A INTEGER

## falta por terminar!!!!

In [67]:
def numero_entero(valor):
    """Convert a float to ``int``; leave every other value untouched.

    Parameters
    ----------
    valor : object
        Value to convert.

    Returns
    -------
    int, float or object
        ``int(valor)`` (truncated toward zero) when ``valor`` is a float;
        ``np.nan`` when the float is NaN or infinite, because it has no
        integer form; ``valor`` itself when it is not a float.
    """
    if not isinstance(valor, (float, np.floating)):
        return valor
    try:
        return int(valor)
    except (ValueError, OverflowError):
        # int(nan) raises ValueError and int(inf) raises OverflowError.
        return np.nan

In [69]:
# Number of missing values in "worklifebalance".
df_clean["worklifebalance"].isna().sum()

np.int64(114)

In [63]:
# Small sample with floats, a NaN and an int for trying out numero_entero.
lista = [ 1.0, 2.0, 3.0, np.nan, 4 ]

In [58]:
# Check the Python type of the first sample value.
type(lista[0])

float

In [68]:
# NOTE(review): the value returned by numero_entero() is discarded on every
# iteration, so `lista` is printed unchanged; collect the results if the goal
# is to inspect the conversion.
for numero in lista:
    numero_entero(numero)
print(lista)

[1.0, 2.0, 3.0, nan, 4]


In [70]:
# Write the final state of df_clean (index included) to CSV.
# NOTE(review): this overwrites the file the notebook loads at the top — confirm
# this is intended.
df_clean.to_csv("hr_raw_data_clean.csv")