In [19]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2

In [20]:
# Directory holding train.csv / test.csv. The original absolute path is kept as
# the default so existing runs behave the same; set CHURN_ARTIFACTS_DIR to point
# the notebook at another location without editing this cell.
_DEFAULT_ARTIFACTS_DIR = 'C:/Users/diego/OneDrive/Escritorio/mlops_projects/mlops/churn_inference_service/artifacts'
# `or` (rather than a .get default) also falls back when the variable is set but empty.
artifacts_dir = (os.environ.get("CHURN_ARTIFACTS_DIR") or _DEFAULT_ARTIFACTS_DIR).rstrip("/")

test_path = f"{artifacts_dir}/test.csv"
train_path = f"{artifacts_dir}/train.csv"

In [21]:
# Feature columns grouped by how they are preprocessed.
# The order of each list is preserved exactly as originally declared.
numerical_columns = [
    'monthlycharges',
    'totalcharges',
    'tenure',
]

categorical_columns = [
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paymentmethod',
    'gender',
    'paperlessbilling',
    'partner',
    'dependents',
    'phoneservice',
    'seniorcitizen',
]

In [22]:
# Numeric pipeline: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]
)

# Categorical pipeline: fill missing values with the most frequent category,
# one-hot encode, then scale the resulting indicator columns.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # handle_unknown="ignore": a category that was not present when the encoder
        # was fitted is encoded as all zeros instead of raising at transform time.
        # Output for already-seen categories is unchanged.
        ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
        # with_mean=False: StandardScaler cannot center sparse input, and the
        # encoder emits a sparse matrix by default.
        ("scaler", StandardScaler(with_mean=False))
    ]
)

# Combine both pipelines; each one is applied only to its own list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", num_pipeline, numerical_columns),
        ("cat_pipeline", cat_pipeline, categorical_columns)
    ]
)

In [23]:
# Alias the ColumnTransformer; both names refer to the same object (no copy is made).
preprocessing_obj = preprocessor

In [24]:
# Show the preprocessor's repr as the cell output.
preprocessing_obj

In [25]:
# Load the train and test splits (in that order) from their CSV files.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [26]:
# Preview the first five rows of the training data.
train_df.head(5)

Unnamed: 0,clienteidentifier,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,...,gender,paperlessbilling,partner,dependents,phoneservice,seniorcitizen,monthlycharges,totalcharges,tenure,churn
0,4223-BKEOR,No,DSL,Yes,No,Yes,No,No,Yes,One year,...,Female,No,No,Yes,Yes,N,64.85,1336.8,21.0,0
1,6035-RIIOM,Yes,Fiber optic,No,Yes,No,No,Yes,Yes,Two year,...,Female,Yes,No,No,Yes,N,97.2,5129.45,54.0,0
2,3797-VTIDR,No phone service,DSL,No,No,No,No,No,No,Month-to-month,...,Male,Yes,Yes,No,No,N,23.45,23.45,1.0,1
3,2568-BRGYX,No,Fiber optic,No,No,No,No,No,No,Month-to-month,...,Male,Yes,No,No,Yes,N,70.2,237.95,4.0,1
4,2775-SEFEE,Yes,DSL,Yes,Yes,No,Yes,No,No,Two year,...,Male,Yes,No,Yes,Yes,N,61.9,,0.0,0


In [27]:
# Print column names, non-null counts and dtypes of the training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 21 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   clienteidentifier  5634 non-null   object 
 1   multiplelines      5634 non-null   object 
 2   internetservice    5634 non-null   object 
 3   onlinesecurity     5634 non-null   object 
 4   onlinebackup       5634 non-null   object 
 5   deviceprotection   5634 non-null   object 
 6   techsupport        5634 non-null   object 
 7   streamingtv        5634 non-null   object 
 8   streamingmovies    5634 non-null   object 
 9   contract           5634 non-null   object 
 10  paymentmethod      5634 non-null   object 
 11  gender             5634 non-null   object 
 12  paperlessbilling   5634 non-null   object 
 13  partner            5634 non-null   object 
 14  dependents         5634 non-null   object 
 15  phoneservice       5634 non-null   object 
 16  seniorcitizen      5634 

### Instancia del objeto preprocesador

### Separar datasets

In [28]:
# Name of the label column.
target_columns_name = "churn"

# Training split: y is the label column; X is everything except the label
# and the customer identifier.
target_feature_train_df = train_df[target_columns_name]
input_feature_train_df = train_df.drop(columns=["clienteidentifier", target_columns_name])

In [29]:
# Test split: y is the label column; X is everything except the label
# and the customer identifier.
target_feature_test_df = test_df[target_columns_name]
input_feature_test_df = test_df.drop(columns=["clienteidentifier", target_columns_name])

In [30]:
# Print column names, non-null counts and dtypes of the test features.
input_feature_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1409 entries, 0 to 1408
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   multiplelines     1409 non-null   object 
 1   internetservice   1409 non-null   object 
 2   onlinesecurity    1409 non-null   object 
 3   onlinebackup      1409 non-null   object 
 4   deviceprotection  1409 non-null   object 
 5   techsupport       1409 non-null   object 
 6   streamingtv       1409 non-null   object 
 7   streamingmovies   1409 non-null   object 
 8   contract          1409 non-null   object 
 9   paymentmethod     1409 non-null   object 
 10  gender            1409 non-null   object 
 11  paperlessbilling  1409 non-null   object 
 12  partner           1409 non-null   object 
 13  dependents        1409 non-null   object 
 14  phoneservice      1409 non-null   object 
 15  seniorcitizen     1409 non-null   object 
 16  monthlycharges    1409 non-null   float64


In [11]:
# Print the non-null count and dtype of the test target series.
target_feature_test_df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1409 entries, 0 to 1408
Series name: churn
Non-Null Count  Dtype
--------------  -----
1409 non-null   int64
dtypes: int64(1)
memory usage: 11.1 KB


### Inicio de preprocesamiento

In [31]:
# Fit the preprocessor on the training features and transform them in one step.
input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)

In [32]:
# Transform only (no refit): the test features reuse what was fitted on the training features.
input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

In [33]:
# Display the transformed test features as the cell output.
input_feature_test_arr

array([[-1.33162874, -1.00156871, -1.28460467, ...,  0.        ,
         2.72216227,  0.        ],
       [-1.31667194, -0.57187176,  0.35323794, ...,  3.36379851,
         2.72216227,  0.        ],
       [-1.51277218, -0.556283  ,  0.80364466, ...,  3.36379851,
         2.72216227,  0.        ],
       ...,
       [-1.49449165, -0.86670936, -0.62946762, ...,  3.36379851,
         2.72216227,  0.        ],
       [-0.69513389,  0.29652162,  1.49972776, ...,  0.        ,
         2.72216227,  0.        ],
       [-1.11392424, -0.99867208, -1.28460467, ...,  0.        ,
         2.72216227,  0.        ]])

### Concatenación

In [34]:
# Append the training target as the last column of the transformed feature matrix.
train_arr = np.column_stack((input_feature_train_arr, target_feature_train_df.to_numpy()))

In [35]:
# Append the test target as the last column of the transformed feature matrix.
test_arr = np.column_stack((input_feature_test_arr, target_feature_test_df.to_numpy()))