# Feature engineering

Procedemos a realizar la creación de nuevas variables que faciliten nuestro análisis

In [34]:
import numpy as np
import pandas as pd
import yaml
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [35]:
%store df

Stored 'df' (DataFrame)


In [40]:
# Columnas numéricas y categóricas
numerical_cols = ["Transportation expense", "Distance from Residence to Work", "Service time",
                  "Age", "Work load Average/day", "Hit target", "Son", "Pet",
                  "Weight", "Height", "Body mass index"]

nominal_vars = ["Disciplinary failure", "Social drinker", "Social smoker", "Reason for absence"]

In [37]:
# --- Log-transform y normalización ---
df[numerical_cols] = df[numerical_cols].apply(lambda x: np.log1p(x))

Debido a que los rangos de cada variable varía mucho, podemos normalizar las variables con un MixMaxScaler

In [41]:
df.columns

Index(['Reason for absence', 'Month of absence', 'Day of the week', 'Seasons',
       'Transportation expense', 'Distance from Residence to Work',
       'Service time', 'Age', 'Work load Average/day', 'Hit target',
       'Disciplinary failure', 'Education', 'Son', 'Social drinker',
       'Social smoker', 'Pet', 'Weight', 'Height', 'Body mass index',
       'Absenteeism time in hours'],
      dtype='object')

In [42]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[nominal_vars] = scaler.fit_transform(df[nominal_vars])

In [48]:
# Analisis de PCA
n_components = 5
pca = PCA(n_components=n_components)
df_pca = pca.fit_transform(df[numerical_cols])
df_pca = pd.DataFrame(df_pca, columns=[f"PCA_{i+1}" for i in range(n_components)])

In [45]:
df_cat = pd.get_dummies(df[nominal_vars], drop_first=True)

In [49]:
df_final = pd.concat([df_pca, df_cat], axis=1)

In [51]:
print(f"Tamaño final del dataset: {df_final.shape}")

Tamaño final del dataset: (640, 9)


In [52]:
df_final.head()

Unnamed: 0,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5,Disciplinary failure,Social drinker,Social smoker,Reason for absence
0,0.701905,0.238156,0.106666,0.198289,0.005003,0.0,1.0,0.0,0.6
1,-0.678658,0.69294,0.059647,0.227598,-0.029405,,,,
2,0.283119,-1.345145,-0.923848,0.093353,0.379992,0.0,1.0,0.0,0.6
3,0.701905,0.238156,0.106666,0.198289,0.005003,0.0,1.0,1.0,0.3
4,-0.678658,0.69294,0.059647,0.227598,-0.029405,0.0,1.0,0.0,0.6
