Function to preprocess data

In [20]:
# Random seed constant intended for reproducible runs.
# NOTE(review): no cell visible in this part of the notebook reads SEED; the
# train/test split passes its own literal random_state -- confirm which value
# is the intended one.
SEED = 2022
#!/usr/bin/env python
import sys
import os
sys.path.append('src/')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import argparse
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer
from sklearn.decomposition import PCA
import logging
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import (
    TimeSeriesSplit,
    cross_val_score,
    cross_validate,
    train_test_split,
)
import time
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

In [21]:
def fit_processor(X_train, target_variable):
    """
    Build a preprocessing ColumnTransformer and fit it on the training set.

    Numeric columns (``select_dtypes(include='number')``):
        mean imputation followed by ``MinMaxScaler``.
    Object-dtype columns (``select_dtypes(include='object')``):
        most-frequent imputation followed by dense one-hot encoding with
        ``handle_unknown='ignore'``.

    Columns that match neither selector are not listed in the transformer
    and are therefore not part of its output.
    NOTE(review): pandas ``category`` and ``bool`` columns fall into that
    group -- confirm that dropping them is intended.

    The fitted preprocessor is only returned; nothing is written to disk.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. The numeric / categorical column lists are
        derived from the dtypes of this frame.
    target_variable : str
        Name of the target column. If ``X_train`` still contains a column
        with this name it is excluded from both feature lists, so the label
        is never imputed, scaled or encoded as an input feature.

    Returns
    -------
    preprocessor : sklearn.compose.ColumnTransformer
        Transformer fit on ``X_train``; call ``.transform`` on train and
        test frames to obtain the processed arrays.
    """

    numeric_candidates = X_train.select_dtypes(include='number').columns.to_list()
    categorical_candidates = X_train.select_dtypes(include='object').columns.to_list()

    # Guard against target leakage: keep the label out of the feature lists
    # even when the caller did not strip it from X_train beforehand.
    numeric_features = [col for col in numeric_candidates if col != target_variable]
    categorical_features = [col for col in categorical_candidates if col != target_variable]

    pipe_num = Pipeline([
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ])
    pipe_cat = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    preprocessor = ColumnTransformer([
        ('num', pipe_num, numeric_features),
        ('cat', pipe_cat, categorical_features)
    ])

    # fit() learns the statistics in place; the transformed data is produced
    # later by the caller via preprocessor.transform(...).
    preprocessor.fit(X_train)

    return preprocessor





In [22]:
# Column used as the prediction target -- change this to match your dataset.
target_variable = 'validation_loss'

# Load the dataset from a CSV file -- point this at your own file.
train_df = pd.read_csv('result.csv')

In [23]:
# Every column except the target is used as a model input.
features = train_df.columns.to_list()
features.remove(target_variable)

# y keeps its DataFrame shape (double brackets) rather than becoming a Series.
X, y = train_df[features], train_df[[target_variable]]

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

# Fit the preprocessor on the training split only, then apply it to both splits.
processor = fit_processor(X_train, target_variable)
norm_X_train, norm_X_test = (
    processor.transform(split) for split in (X_train, X_test)
)

In [24]:
# Display the array produced by processor.transform(X_train) in the previous cell.
norm_X_train

array([[0.69230769, 0.68464937, 0.69230769, 0.        , 0.        ,
        1.        , 0.        , 0.58077872],
       [0.30769231, 0.30191027, 0.30769231, 0.3479308 , 0.        ,
        0.45798394, 0.        , 0.58077872],
       [0.33333333, 0.32686205, 0.33333333, 0.37077836, 0.        ,
        0.70479483, 0.        , 0.58077872],
       [0.        , 0.        , 0.        , 0.61650303, 0.        ,
        0.47546833, 0.        , 0.58077872],
       [0.74358974, 0.74649991, 0.74358974, 0.62153488, 0.        ,
        0.22747547, 0.        , 0.58077872],
       [0.12820513, 0.13670587, 0.12820513, 0.6187933 , 0.        ,
        0.48817375, 0.        , 0.58077872],
       [0.43589744, 0.43114928, 0.43589744, 0.3202252 , 0.        ,
        0.75022302, 0.        , 0.58077872],
       [0.94871795, 0.94412295, 0.94871795, 0.17306747, 0.        ,
        0.89522868, 0.        , 0.58077872],
       [0.76923077, 0.76946052, 0.76923077, 0.12581545, 0.        ,
        0.8416154 , 0.      