# SESION 4 DICIEMBRE

# Modificamos columnas que eran object a float 

In [20]:
import pandas as pd
import numpy as np
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Librerías de visualización
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

In [21]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [22]:
df_rawdata = pd.read_csv("raw_data_20231201.csv", index_col=0)

In [23]:
def cambiar_comas(cadena):
    try:
        return float(cadena.replace(",", "."))
    except:
        return np.nan
    
df_rawdata["Monthlyincome"] = df_rawdata["Monthlyincome"].apply(cambiar_comas)
df_rawdata["Performancerating"] = df_rawdata["Performancerating"].apply(cambiar_comas)
df_rawdata["Totalworkingyears"] = df_rawdata["Totalworkingyears"].apply(cambiar_comas)

In [24]:
df_rawdata["Employeenumber"]= df_rawdata["Employeenumber"].apply(cambiar_comas)

In [25]:
df_rawdata["Employeenumber"].isnull().sum()

431

# Gestión de nulos

Cambiamos los nulos de Employeenumber con un "auto-incremental"

In [26]:
# Encuentra el último ID existente
ultimo_id = df_rawdata['Employeenumber'].max()
# Define una variable para el siguiente ID después del último
siguiente_id = ultimo_id + 1
# Itera sobre los índices del DataFrame
for indice in df_rawdata.index:
    if pd.isnull(df_rawdata.at[indice, 'Employeenumber']):  # Verifica si el valor es nulo
        df_rawdata.at[indice, 'Employeenumber'] = siguiente_id  # Asigna el siguiente ID
        siguiente_id += 1  # Incrementa el siguiente ID
df_rawdata["Employeenumber"] = df_rawdata["Employeenumber"].astype(int)
# Muestra el DataFrame actualizado
print(df_rawdata["Employeenumber"].tail(10))

1604     927
1605     941
1606     956
1607    2497
1608     966
1609     967
1610     972
1611    2498
1612     990
1613    2499
Name: Employeenumber, dtype: int64


In [27]:
# Muestra el DataFrame actualizado
df_rawdata["Employeenumber"].dtype

dtype('int64')

In [28]:
nulls_cat = df_rawdata[df_rawdata.columns[df_rawdata.isnull().any()]].select_dtypes(include = "O").columns
nulls_cat

Index(['Businesstravel', 'Department', 'Educationfield', 'Maritalstatus',
       'Overtime'],
      dtype='object')

In [29]:
(df_rawdata[['Businesstravel', 'Department', 'Educationfield', 'Maritalstatus','Overtime']].isnull().sum() / df_rawdata.shape[0]) * 100

Businesstravel    47.880795
Department        81.258278
Educationfield    46.490066
Maritalstatus     40.264901
Overtime          41.721854
dtype: float64

In [30]:
# sacamos el 'value_counts()' de cada una de las columnas categóricas que tienen nulos para saber como es la distribución de sus categorías
for col in nulls_cat:
    print(f"La distribución de las categorías para la columna {col.upper()}")
    display(df_rawdata[col].value_counts() / df_rawdata.shape[0])
    print("........................")

La distribución de las categorías para la columna BUSINESSTRAVEL


travel_rarely        0.364901
travel_frequently    0.101325
non-travel           0.054967
Name: Businesstravel, dtype: float64

........................
La distribución de las categorías para la columna DEPARTMENT


 Research & Development     0.122517
 Sales                      0.055629
 Human Resources            0.009272
Name: Department, dtype: float64

........................
La distribución de las categorías para la columna EDUCATIONFIELD


Life Sciences       0.215894
Medical             0.169536
Marketing           0.064238
Technical Degree    0.042384
Other               0.035762
Human Resources     0.007285
Name: Educationfield, dtype: float64

........................
La distribución de las categorías para la columna MARITALSTATUS


Married     0.270861
Single      0.202649
Divorced    0.123841
Name: Maritalstatus, dtype: float64

........................
La distribución de las categorías para la columna OVERTIME


No     0.422517
Yes    0.160265
Name: Overtime, dtype: float64

........................


In [31]:
nulls_cat = ['Businesstravel', 'Educationfield', 'Maritalstatus', 'Overtime']

Todos desconocidos!

In [32]:
for columna in nulls_cat:
    # reemplazamos los nulos por el valor Unknown para cada una de las columnas de la lista
    df_rawdata[columna] = df_rawdata[columna].fillna("Unknown")

In [33]:
df_rawdata["Overtime"].unique()

array(['No', 'Unknown', 'Yes'], dtype=object)

# Columna Department: vinculada con Jobrole

In [34]:
# primero vemos cómo está la columna Jobrole
#df_rawdata["Jobrole"].head()

In [35]:
# # capitalize valores de Jobrole (primera letra de cada palabra)
# df_rawdata['Jobrole'] = df_rawdata['Jobrole'].apply(lambda x: ' '.join(word.capitalize() for word in x.lower().split()))
# df_rawdata["Jobrole"].head()

In [36]:
# hacemos función para asignar los valores nulos en Department según la columna Jobrole
def fill_department(df, jobrole, department):
    df.loc[df['Jobrole'] == jobrole, 'Department'] = df.loc[df['Jobrole'] == jobrole, 'Department'].fillna(department)

# Uso de la función fill_department para reemplazar los valores faltantes en 'Department'
fill_department(df_rawdata, 'Research Director', 'Research & Development')
fill_department(df_rawdata, 'Healthcare Representative', 'Research & Development')
fill_department(df_rawdata, 'Laboratory Technician', 'Research & Development')
fill_department(df_rawdata, 'Manufacturing Director', 'Research & Development')
fill_department(df_rawdata, 'Research Scientist', 'Research & Development')
fill_department(df_rawdata, 'Sales Executive', 'Sales')
fill_department(df_rawdata, 'Sales Representative', 'Sales')
fill_department(df_rawdata, 'Human Resources', 'Human Resources')
fill_department(df_rawdata, 'Manager', 'Unknown')

# Mostrar el DataFrame actualizado
df_rawdata[["Jobrole", "Department"]].head(5)

Unnamed: 0,Jobrole,Department
0,Research Director,
1,Manager,
2,Manager,Research & Development
3,Research Director,
4,Sales Executive,


In [37]:
# Mostrar estadísticas descriptivas y conteo de valores únicos
print("Estadísticas descriptivas para 'Jobrole':\n", df_rawdata['Jobrole'].describe())
print("\nEstadísticas descriptivas para 'Department':\n", df_rawdata['Department'].describe())

print("\nConteo de valores únicos para 'Jobrole':\n", df_rawdata['Jobrole'].value_counts())
print("\nConteo de valores únicos para 'Department':\n", df_rawdata['Department'].value_counts())

Estadísticas descriptivas para 'Jobrole':
 count                  1510
unique                    9
top        Sales Executive 
freq                    336
Name: Jobrole, dtype: object

Estadísticas descriptivas para 'Department':
 count                          283
unique                           3
top        Research & Development 
freq                           185
Name: Department, dtype: object

Conteo de valores únicos para 'Jobrole':
  Sales Executive               336
 Research Scientist            300
 Laboratory Technician         264
 Manufacturing Director        148
 Healthcare Representative     137
 Manager                       105
 Sales Representative           84
 Research Director              83
 Human Resources                53
Name: Jobrole, dtype: int64

Conteo de valores únicos para 'Department':
  Research & Development     185
 Sales                       84
 Human Resources             14
Name: Department, dtype: int64


# Todas las columnas categóricas listas 

In [38]:
# Como hay valores negativos en la columna Distancefromhome los cambiamos a nulos.
def negative_to_null(data):
    if data < 0:
        return np.nan
    else:
        return data
    
df_rawdata["Distancefromhome"] = df_rawdata["Distancefromhome"].apply(negative_to_null)

In [39]:
df_rawdata["Distancefromhome"].tail(5)

1609    3.0
1610    4.0
1611    NaN
1612    8.0
1613    7.0
Name: Distancefromhome, dtype: float64

## Gestión nulos columnas numéricas

In [40]:
nulls_num = df_rawdata[df_rawdata.columns[df_rawdata.isna().any()]].columns
nulls_num

Index(['Dailyrate', 'Department', 'Distancefromhome', 'Monthlyincome',
       'Performancerating', 'Totalworkingyears', 'Worklifebalance'],
      dtype='object')

In [41]:
df_rawdata_copy = df_rawdata.copy()

In [42]:
print(f"Tenemos nulos: \n{df_rawdata_copy[['Dailyrate', 'Distancefromhome', 'Monthlyincome', 'Performancerating','Totalworkingyears', 'Worklifebalance']].isnull().sum()} nulos")			

Tenemos nulos: 
Dailyrate            116
Distancefromhome     178
Monthlyincome        799
Performancerating    182
Totalworkingyears    494
Worklifebalance      100
dtype: int64 nulos


In [44]:
# instanciamos las clases										
imputer_iterative = IterativeImputer(max_iter = 20, random_state = 42)			
# ajustamos y tranformamos los datos										
imputer_iterative_imputado = imputer_iterative.fit_transform(df_rawdata_copy[['Dailyrate', 'Distancefromhome', 'Monthlyincome', 'Performancerating','Totalworkingyears', 'Worklifebalance']])
# comprobamos que es lo que nos devuelve, que en este caso es un array también					
imputer_iterative_imputado																						

array([[6.84000000e+02, 6.00000000e+00, 1.95370000e+04, 3.00000000e+00,
        3.26307279e+01, 3.00000000e+00],
       [6.99000000e+02, 1.00000000e+00, 1.99990000e+04, 3.00000000e+00,
        3.40000000e+01, 3.00000000e+00],
       [5.32000000e+02, 4.00000000e+00, 1.92320000e+04, 3.00000000e+00,
        2.20000000e+01, 2.91167214e+00],
       ...,
       [9.03000000e+02, 9.05275661e+00, 5.06575987e+03, 3.00000000e+00,
        9.00000000e+00, 3.00000000e+00],
       [1.22900000e+03, 8.00000000e+00, 6.31568435e+03, 3.15086516e+00,
        1.20000000e+01, 3.00000000e+00],
       [5.66000000e+02, 7.00000000e+00, 1.08450000e+04, 3.00000000e+00,
        1.84209077e+01, 3.00000000e+00]])

In [46]:
df_rawdata_copy[['Dailyrate_ITE', 'Distancefromhome_ITE', 'Monthlyincome_ITE', 'Performancerating_ITE','Totalworkingyears_ITE', 'Worklifebalance_ITE']] = imputer_iterative_imputado										
										
# comprobamos los nulos										
print(f"Después del 'Iterative' tenemos: \n{df_rawdata_copy[['Dailyrate_ITE', 'Distancefromhome_ITE', 'Monthlyincome_ITE', 'Performancerating_ITE', 'Totalworkingyears_ITE', 'Worklifebalance_ITE']].isnull().sum()} nulos")										
										

Después del 'Iterative' tenemos: 
Dailyrate_ITE            0
Distancefromhome_ITE     0
Monthlyincome_ITE        0
Performancerating_ITE    0
Totalworkingyears_ITE    0
Worklifebalance_ITE      0
dtype: int64 nulos


In [None]:
#Nos faltó guardar el csv
#df_rawdata.to_csv("raw_data_20231205.csv")