# Pipeline

## 0 Initial stuff

### Imports

In [1]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis
from IPython.display import display 
from sklearn.impute import KNNImputer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

import warnings
warnings.filterwarnings("ignore") 

### Data

In [2]:
df_train = pq.read_table('data/df_train.parquet').to_pandas()
df_test = pq.read_table('data/df_test.parquet').to_pandas()

df_train.shape

(23494, 46)

## 1 Transform initial data

### Data imputation strategies

- KNN imputation numeric values

In [3]:
def knn_impute(df, columns, n_neighbors=5):
    df_copy = df.copy()
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df_copy[columns] = imputer.fit_transform(df_copy[columns])
    
    return df_copy

- Categorical values

In [4]:
def impute_and_encode_categorical(df, columns):    
    df_copy = df.copy()    
    imputer = SimpleImputer(strategy='most_frequent')
    df_copy[columns] = imputer.fit_transform(df_copy[columns])
    encoder = OneHotEncoder(sparse=False, drop='first')
    encoded_columns = encoder.fit_transform(df_copy[columns])
    encoded_column_names = encoder.get_feature_names_out(columns)
    encoded_df = pd.DataFrame(encoded_columns, columns=encoded_column_names, index=df_copy.index)
    df_copy = df_copy.drop(columns, axis=1)
    df_copy = pd.concat([df_copy, encoded_df], axis=1)
    
    return df_copy


In [11]:
def transform_data(data):
    # Features and Target
    X = data.drop(columns=['Target'])
    y = data['Target']

    # Numeric and Categorical data
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns
    
    # Based on a prev analysis we decided to drop the follwing columns
    to_drop = ['psa_min_gr_flia','PERDIDA_DE_PESO', 'min_Tiempo_CP_Fliar']
    X.drop(to_drop, axis=1, inplace=True)
    
    # Data Imputation
    # Numneric columns
    X = knn_impute(X, ['psa_max_gr_flia'])
    # Catgorical columns
    X = impute_and_encode_categorical(X, ['IMC', 'AGRUPACION_SISTOLICA', 'AGRUPACION_DIASTOLICA', 'RIESGOS', 'CANCER_MAMA_FAMILIAR'])

    return X


In [12]:
result = transform_data(df_train)
result.isnull().sum()

MEDICAMENTOS                        0
MEDICINA ESPECIALIZADA              0
MEDICINA GENERAL                    0
Cant_gr_flia                        0
Cant_riesgos_flia_mean              0
Cant_Fliar_CP                       0
psa_max_gr_flia                     0
Cant_Fliar_riesgos                  0
cantidad_serv_flia                  0
Pendiente_flia                      0
Intercepto_flia                     0
Promedio_costo_flia                 0
ESTADO_CIVI                         0
PROGRAMA                            0
estrato                             0
parentesco                          0
EDAD                                0
CANTIDAD_SERVICIOS                  0
TIEMPO_ULTIMA_CITA                  0
conteo_dx_diferentes                0
TIEMPO_AFILIACION                   0
CANCER_OTRO_SITIO                   0
CANCER_OTRO_SITIO_FAMILIAR          0
HIPERTENSION                        0
HIPERTENSION_FAMILIAR               0
DIABETES                            0
DIABETES_FAM

In [13]:
result

Unnamed: 0_level_0,MEDICAMENTOS,MEDICINA ESPECIALIZADA,MEDICINA GENERAL,Cant_gr_flia,Cant_riesgos_flia_mean,Cant_Fliar_CP,psa_max_gr_flia,Cant_Fliar_riesgos,cantidad_serv_flia,Pendiente_flia,...,AGRUPACION_DIASTOLICA_None,RIESGOS_2.0,RIESGOS_3.0,RIESGOS_4.0,RIESGOS_5.0,RIESGOS_6.0,RIESGOS_7.0,RIESGOS_8.0,CANCER_MAMA_FAMILIAR_1,CANCER_MAMA_FAMILIAR_None
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16484,12.0,3.0,3.0,2,0.0,0.0,8.53897,0.0,34.0,15998.485714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11528,6.0,1.0,1.0,2,0.5,0.0,8.53897,1.0,25.0,655.200000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26630,1.0,0.0,2.0,2,0.0,0.0,8.53897,0.0,36.0,342236.971429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12586,6.0,0.0,2.0,2,0.0,0.0,8.53897,0.0,24.0,-4468.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11225,3.0,0.0,2.0,1,1.0,0.0,8.53897,1.0,7.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7141,0.0,0.0,1.0,3,0.0,0.0,8.53897,0.0,22.0,18670.800000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15417,6.0,0.0,3.0,1,1.0,0.0,8.53897,1.0,11.0,462.857143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
19389,8.0,2.0,2.0,1,0.0,0.0,8.53897,0.0,15.0,5247.171429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
22505,3.0,0.0,1.0,1,0.0,0.0,8.53897,0.0,4.0,40.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [14]:
# cat
# IMC, AGRUPACION_SISTOLICA, AGRUPACION_DIASTOLICA, RIESGOS
# num
# psa_max_gr_flia