In [1]:
import pandas as pd

import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format

In [2]:
# Read data
filepath_merged_train = '../data/interim/merged_train.csv.zip'
filepath_merged_test = '../data/interim/merged_test.csv.zip'
merged_train = pd.read_csv(filepath_merged_train, compression='zip')
merged_test = pd.read_csv(filepath_merged_test, compression='zip')

In [3]:
merged_train.nunique()

people_id             151295
activity_id          2197291
activity_date            411
activity_category          7
activity_type           6515
outcome                    2
char_1                     2
group_1                29899
char_2                     3
date                    1196
char_3                    43
char_4                    25
char_5                     9
char_6                     7
char_7                    25
char_8                     8
char_9                     9
char_10                    2
char_11                    2
char_12                    2
char_13                    2
char_14                    2
char_15                    2
char_16                    2
char_17                    2
char_18                    2
char_19                    2
char_20                    2
char_21                    2
char_22                    2
char_23                    2
char_24                    2
char_25                    2
char_26                    2
char_27       

In [4]:
merged_train.head()

Unnamed: 0,people_id,activity_id,activity_date,activity_category,activity_type,outcome,char_1,group_1,char_2,date,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38
0,ppl_100,act2_1734928,2023-08-26,type 4,type 76,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
1,ppl_100,act2_2434093,2022-09-27,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
2,ppl_100,act2_3404049,2022-09-27,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
3,ppl_100,act2_3651215,2023-08-04,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36
4,ppl_100,act2_4109017,2023-08-26,type 2,type 1,0,type 2,group 17304,type 2,2021-06-29,type 5,type 5,type 5,type 3,type 11,type 2,type 2,True,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,True,True,False,True,True,False,False,True,True,True,False,36


In [5]:
def impute_missing_data(filepath):
    # Read data
    df = pd.read_csv(filepath)

    # Imput missing values in the 'activity_type' with the most frequent value (mode)
    mode_activity_type = df['activity_type'].mode()[0]
    df['activity_type'].fillna(mode_activity_type, inplace=True)

    return df

le = LabelEncoder()
def preprocess_data(filepath):
    # Impute missing informations
    df = impute_missing_data(filepath)

    # Select columns for enconding
    col_label_encode = ['activity_type', 'group_1']
    col_onehot_encode = ['activity_category', 'char_2', 'char_3', 'char_4', 'char_5', 'char_6', 'char_7', 'char_8', 'char_9']
    
    # Label Encoding
    for col in col_label_encode:
        df[col + '_labeled'] = le.fit_transform(df[col])

    # OneHot Encoding
    df = pd.get_dummies(df, columns=col_onehot_encode, drop_first=True)

    # Convert all boolean values to 0 and 1
    df = df.applymap(lambda x: int(x) if isinstance(x, bool) else x)

    # Convert 'activity_date' and 'date' to datetime
    df['activity_date'] = pd.to_datetime(df['activity_date'])
    df['date'] = pd.to_datetime(df['date'])

    # Extract date-related features
    df['activity_day_of_week'] = df['activity_date'].dt.dayofweek
    df['activity_month'] = df['activity_date'].dt.month
    df['activity_year'] = df['activity_date'].dt.year

    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year

    df['char_1'] = df['char_1'].map({'type 1': 0, 'type 2': 1})

    return df

In [6]:
training_data = preprocess_data(filepath=filepath_merged_train)
testing_data = preprocess_data(filepath=filepath_merged_test)

In [7]:
training_data.sample(10)

Unnamed: 0,people_id,activity_id,activity_date,activity_type,outcome,char_1,group_1,date,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_type_labeled,group_1_labeled,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_2_type 2,char_2_type 3,char_3_type 10,char_3_type 11,char_3_type 12,char_3_type 13,char_3_type 14,char_3_type 15,char_3_type 16,char_3_type 17,char_3_type 18,char_3_type 19,char_3_type 2,char_3_type 20,char_3_type 21,char_3_type 22,char_3_type 23,char_3_type 24,char_3_type 25,char_3_type 26,char_3_type 27,char_3_type 28,char_3_type 29,char_3_type 3,char_3_type 30,char_3_type 31,char_3_type 32,char_3_type 33,char_3_type 34,char_3_type 35,char_3_type 36,char_3_type 37,char_3_type 38,char_3_type 39,char_3_type 4,char_3_type 40,char_3_type 41,char_3_type 42,char_3_type 44,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 10,char_4_type 11,char_4_type 12,char_4_type 13,char_4_type 14,char_4_type 15,char_4_type 16,char_4_type 17,char_4_type 18,char_4_type 19,char_4_type 2,char_4_type 20,char_4_type 21,char_4_type 22,char_4_type 23,char_4_type 24,char_4_type 25,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_4_type 7,char_4_type 8,char_4_type 9,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_5_type 7,char_5_type 8,char_5_type 9,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_6_type 6,char_6_type 7,char_7_type 10,char_7_type 11,char_7_type 12,char_7_type 13,char_7_type 14,char_7_type 15,char_7_type 16,char_7_type 17,char_7_type 18,char_7_type 19,char_7_type 2,char_7_type 20,char_7_type 21,char_7_type 22,char_7_type 23,char_7_type 24,char_7_type 25,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_7_type 9,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,activity_day_of_week,activity_month,activity_year,day_of_week,month,year
3004,ppl_100475,act2_4449811,2023-08-22,type 110,0,1,group 17304,2020-10-13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,109,4691,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,8,2023,1,10,2020
1508536,ppl_351646,act2_4709024,2022-11-03,type 649,1,1,group 29401,2022-04-11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,62,4504,13425,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,3,11,2022,0,4,2022
1367626,ppl_325415,act2_3370976,2023-02-24,type 1,0,1,group 17304,2022-11-17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,4,2,2023,3,11,2022
1257664,ppl_305432,act2_587107,2023-06-10,type 1,0,1,group 17304,2020-10-23,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,6,2023,4,10,2020
1501501,ppl_350276,act1_3592,2022-07-20,type 1,0,1,group 17304,2021-11-03,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4691,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,2,7,2022,2,11,2021
2059718,ppl_75684,act2_4021945,2022-09-11,type 4786,0,1,group 17304,2022-09-09,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,3545,4691,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,6,9,2022,4,9,2022
799254,ppl_235512,act2_3456299,2022-11-03,type 649,1,1,group 39537,2022-11-02,0,1,1,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,94,4504,19651,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,11,2022,2,11,2022
1843486,ppl_41680,act2_4676336,2023-04-04,type 1,0,1,group 18147,2021-08-21,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,79,0,5371,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,4,2023,5,8,2021
2180995,ppl_97427,act2_1868322,2022-11-19,type 1,1,1,group 19048,2021-06-14,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,97,0,6047,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5,11,2022,0,6,2021
989235,ppl_267343,act2_2436412,2023-06-30,type 464,1,1,group 5393,2023-06-25,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,95,3426,26552,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,6,2023,6,6,2023


In [8]:
# Drop columns
cols_to_be_removed = ['people_id', 'activity_id', 'activity_type', 'group_1', 'activity_date', 'date']
training_data.drop(columns=cols_to_be_removed, inplace=True)
testing_data.drop(columns=cols_to_be_removed, inplace=True)

In [9]:
training_data.sample(10)

Unnamed: 0,outcome,char_1,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_type_labeled,group_1_labeled,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_2_type 2,char_2_type 3,char_3_type 10,char_3_type 11,char_3_type 12,char_3_type 13,char_3_type 14,char_3_type 15,char_3_type 16,char_3_type 17,char_3_type 18,char_3_type 19,char_3_type 2,char_3_type 20,char_3_type 21,char_3_type 22,char_3_type 23,char_3_type 24,char_3_type 25,char_3_type 26,char_3_type 27,char_3_type 28,char_3_type 29,char_3_type 3,char_3_type 30,char_3_type 31,char_3_type 32,char_3_type 33,char_3_type 34,char_3_type 35,char_3_type 36,char_3_type 37,char_3_type 38,char_3_type 39,char_3_type 4,char_3_type 40,char_3_type 41,char_3_type 42,char_3_type 44,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 10,char_4_type 11,char_4_type 12,char_4_type 13,char_4_type 14,char_4_type 15,char_4_type 16,char_4_type 17,char_4_type 18,char_4_type 19,char_4_type 2,char_4_type 20,char_4_type 21,char_4_type 22,char_4_type 23,char_4_type 24,char_4_type 25,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_4_type 7,char_4_type 8,char_4_type 9,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_5_type 7,char_5_type 8,char_5_type 9,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_6_type 6,char_6_type 7,char_7_type 10,char_7_type 11,char_7_type 12,char_7_type 13,char_7_type 14,char_7_type 15,char_7_type 16,char_7_type 17,char_7_type 18,char_7_type 19,char_7_type 2,char_7_type 20,char_7_type 21,char_7_type 22,char_7_type 23,char_7_type 24,char_7_type 25,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_7_type 9,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,activity_day_of_week,activity_month,activity_year,day_of_week,month,year
1860189,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,0,1,1,0,1,1,94,3843,19367,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,3,2023,2,3,2023
1864247,1,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,72,0,5502,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3,10,2022,3,10,2022
364793,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,97,1035,27833,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5,7,2023,5,7,2023
510561,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,100,889,26364,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,6,6,2023,3,11,2022
1796712,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52,1099,9313,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,4,6,2023,4,6,2021
327949,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1035,4691,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,5,2,2023,2,2,2023
487639,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,58,676,3606,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,11,2022,2,7,2021
124903,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,91,0,5779,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,3,2023,2,4,2022
1234402,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,95,0,10567,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,4,2023,5,5,2022
2050650,0,1,0,0,0,1,0,0,0,1,0,1,0,1,1,1,1,1,0,0,1,0,0,0,1,0,0,0,1,1,88,0,5043,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,11,2022,4,2,2022


In [10]:
len(training_data.columns.to_list())

166

In [11]:
# Check memory usage
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197291 entries, 0 to 2197290
Columns: 166 entries, outcome to year
dtypes: int32(6), int64(160)
memory usage: 2.7 GB


### Memory optimization

In [12]:
# Function to downcast integer columns
def downcast_integers(df):
    # Iterate over each column in the DataFrame
    for col in df.columns:
        if pd.api.types.is_integer_dtype(df[col]):
            # Downcast the column to the smallest possible integer type
            df[col] = pd.to_numeric(df[col], downcast='integer')
    return df

# Apply the downcasting function to the training and testing DataFrame
training_data_optimized = downcast_integers(training_data)
testing_data_optimized = downcast_integers(testing_data)

In [13]:
training_data_optimized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2197291 entries, 0 to 2197290
Columns: 166 entries, outcome to year
dtypes: int16(4), int8(162)
memory usage: 356.2 MB


In [14]:
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 498687 entries, 0 to 498686
Columns: 163 entries, char_1 to year
dtypes: int16(4), int8(159)
memory usage: 79.4 MB


In [15]:
training_data_optimized.head()

Unnamed: 0,outcome,char_1,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_type_labeled,group_1_labeled,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_2_type 2,char_2_type 3,char_3_type 10,char_3_type 11,char_3_type 12,char_3_type 13,char_3_type 14,char_3_type 15,char_3_type 16,char_3_type 17,char_3_type 18,char_3_type 19,char_3_type 2,char_3_type 20,char_3_type 21,char_3_type 22,char_3_type 23,char_3_type 24,char_3_type 25,char_3_type 26,char_3_type 27,char_3_type 28,char_3_type 29,char_3_type 3,char_3_type 30,char_3_type 31,char_3_type 32,char_3_type 33,char_3_type 34,char_3_type 35,char_3_type 36,char_3_type 37,char_3_type 38,char_3_type 39,char_3_type 4,char_3_type 40,char_3_type 41,char_3_type 42,char_3_type 44,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 10,char_4_type 11,char_4_type 12,char_4_type 13,char_4_type 14,char_4_type 15,char_4_type 16,char_4_type 17,char_4_type 18,char_4_type 19,char_4_type 2,char_4_type 20,char_4_type 21,char_4_type 22,char_4_type 23,char_4_type 24,char_4_type 25,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_4_type 7,char_4_type 8,char_4_type 9,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_5_type 7,char_5_type 8,char_5_type 9,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_6_type 6,char_6_type 7,char_7_type 10,char_7_type 11,char_7_type 12,char_7_type 13,char_7_type 14,char_7_type 15,char_7_type 16,char_7_type 17,char_7_type 18,char_7_type 19,char_7_type 2,char_7_type 20,char_7_type 21,char_7_type 22,char_7_type 23,char_7_type 24,char_7_type 25,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_7_type 9,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,activity_day_of_week,activity_month,activity_year,day_of_week,month,year
0,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,5382,4691,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5,8,2023,1,6,2021
1,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,9,2022,1,6,2021
2,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,9,2022,1,6,2021
3,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,4,8,2023,1,6,2021
4,0,1,1,0,0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,36,0,4691,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5,8,2023,1,6,2021


In [16]:
# Save datasets
training_data_optimized.to_csv('../data/interim/training_data.csv.zip', index=False, compression='zip')
testing_data_optimized.to_csv('../data/interim/testing_data.csv.zip', index=False, compression='zip')

### Scaling Features

In [17]:
outcome = training_data_optimized[['outcome']]
training_data_optimized.drop(columns=['outcome'], inplace=True)

In [18]:
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(training_data_optimized), columns=training_data_optimized.columns)

In [19]:
scaler.feature_names_in_

array(['char_1', 'char_10', 'char_11', 'char_12', 'char_13', 'char_14',
       'char_15', 'char_16', 'char_17', 'char_18', 'char_19', 'char_20',
       'char_21', 'char_22', 'char_23', 'char_24', 'char_25', 'char_26',
       'char_27', 'char_28', 'char_29', 'char_30', 'char_31', 'char_32',
       'char_33', 'char_34', 'char_35', 'char_36', 'char_37', 'char_38',
       'activity_type_labeled', 'group_1_labeled',
       'activity_category_type 2', 'activity_category_type 3',
       'activity_category_type 4', 'activity_category_type 5',
       'activity_category_type 6', 'activity_category_type 7',
       'char_2_type 2', 'char_2_type 3', 'char_3_type 10',
       'char_3_type 11', 'char_3_type 12', 'char_3_type 13',
       'char_3_type 14', 'char_3_type 15', 'char_3_type 16',
       'char_3_type 17', 'char_3_type 18', 'char_3_type 19',
       'char_3_type 2', 'char_3_type 20', 'char_3_type 21',
       'char_3_type 22', 'char_3_type 23', 'char_3_type 24',
       'char_3_type 25', 'char_3_

In [20]:
# Save the scaler
with open('../models/standard_scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

In [21]:
processed_train_dataset = pd.concat([df_scaled, outcome], axis=1)

In [22]:
processed_train_dataset

Unnamed: 0,char_1,char_10,char_11,char_12,char_13,char_14,char_15,char_16,char_17,char_18,char_19,char_20,char_21,char_22,char_23,char_24,char_25,char_26,char_27,char_28,char_29,char_30,char_31,char_32,char_33,char_34,char_35,char_36,char_37,char_38,activity_type_labeled,group_1_labeled,activity_category_type 2,activity_category_type 3,activity_category_type 4,activity_category_type 5,activity_category_type 6,activity_category_type 7,char_2_type 2,char_2_type 3,char_3_type 10,char_3_type 11,char_3_type 12,char_3_type 13,char_3_type 14,char_3_type 15,char_3_type 16,char_3_type 17,char_3_type 18,char_3_type 19,char_3_type 2,char_3_type 20,char_3_type 21,char_3_type 22,char_3_type 23,char_3_type 24,char_3_type 25,char_3_type 26,char_3_type 27,char_3_type 28,char_3_type 29,char_3_type 3,char_3_type 30,char_3_type 31,char_3_type 32,char_3_type 33,char_3_type 34,char_3_type 35,char_3_type 36,char_3_type 37,char_3_type 38,char_3_type 39,char_3_type 4,char_3_type 40,char_3_type 41,char_3_type 42,char_3_type 44,char_3_type 5,char_3_type 6,char_3_type 7,char_3_type 8,char_3_type 9,char_4_type 10,char_4_type 11,char_4_type 12,char_4_type 13,char_4_type 14,char_4_type 15,char_4_type 16,char_4_type 17,char_4_type 18,char_4_type 19,char_4_type 2,char_4_type 20,char_4_type 21,char_4_type 22,char_4_type 23,char_4_type 24,char_4_type 25,char_4_type 3,char_4_type 4,char_4_type 5,char_4_type 6,char_4_type 7,char_4_type 8,char_4_type 9,char_5_type 2,char_5_type 3,char_5_type 4,char_5_type 5,char_5_type 6,char_5_type 7,char_5_type 8,char_5_type 9,char_6_type 2,char_6_type 3,char_6_type 4,char_6_type 5,char_6_type 6,char_6_type 7,char_7_type 10,char_7_type 11,char_7_type 12,char_7_type 13,char_7_type 14,char_7_type 15,char_7_type 16,char_7_type 17,char_7_type 18,char_7_type 19,char_7_type 2,char_7_type 20,char_7_type 21,char_7_type 22,char_7_type 23,char_7_type 24,char_7_type 25,char_7_type 3,char_7_type 4,char_7_type 5,char_7_type 6,char_7_type 7,char_7_type 8,char_7_type 9,char_8_type 2,char_8_type 3,char_8_type 4,char_8_type 5,char_8_type 6,char_8_type 7,char_8_type 8,char_9_type 2,char_9_type 3,char_9_type 4,char_9_type 5,char_9_type 6,char_9_type 7,char_9_type 8,char_9_type 9,activity_day_of_week,activity_month,activity_year,day_of_week,month,year,outcome
0,0.3215,1.7734,-0.5102,-0.5539,1.3644,1.7087,-0.5958,1.6516,-0.6232,-0.4645,-0.6100,-0.5137,1.6502,-0.6070,-0.6301,-0.4697,-0.6827,-0.4497,1.8122,1.6367,-0.4415,1.9765,1.6507,-0.6051,-0.5219,1.3754,1.9917,1.4305,-0.6094,-0.3874,2.4757,-0.6785,-0.8366,-0.4928,3.0970,-0.5362,-0.0440,-0.0379,1.3227,-1.0892,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,-0.3255,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,2.9915,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,3.2084,-0.5437,-0.3054,-0.1608,-0.1232,-0.4395,-0.2604,-0.3599,2.3589,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,1.5171,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,3.6013,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,-0.2171,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,1.1348,0.3074,1.0234,-1.4012,-0.2286,-0.9674,0
1,0.3215,1.7734,-0.5102,-0.5539,1.3644,1.7087,-0.5958,1.6516,-0.6232,-0.4645,-0.6100,-0.5137,1.6502,-0.6070,-0.6301,-0.4697,-0.6827,-0.4497,1.8122,1.6367,-0.4415,1.9765,1.6507,-0.6051,-0.5219,1.3754,1.9917,1.4305,-0.6094,-0.3874,-0.7310,-0.6785,1.1953,-0.4928,-0.3229,-0.5362,-0.0440,-0.0379,1.3227,-1.0892,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,-0.3255,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,2.9915,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,3.2084,-0.5437,-0.3054,-0.1608,-0.1232,-0.4395,-0.2604,-0.3599,2.3589,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,1.5171,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,3.6013,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,-0.2171,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,-1.4326,0.6044,-0.9771,-1.4012,-0.2286,-0.9674,0
2,0.3215,1.7734,-0.5102,-0.5539,1.3644,1.7087,-0.5958,1.6516,-0.6232,-0.4645,-0.6100,-0.5137,1.6502,-0.6070,-0.6301,-0.4697,-0.6827,-0.4497,1.8122,1.6367,-0.4415,1.9765,1.6507,-0.6051,-0.5219,1.3754,1.9917,1.4305,-0.6094,-0.3874,-0.7310,-0.6785,1.1953,-0.4928,-0.3229,-0.5362,-0.0440,-0.0379,1.3227,-1.0892,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,-0.3255,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,2.9915,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,3.2084,-0.5437,-0.3054,-0.1608,-0.1232,-0.4395,-0.2604,-0.3599,2.3589,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,1.5171,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,3.6013,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,-0.2171,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,-1.4326,0.6044,-0.9771,-1.4012,-0.2286,-0.9674,0
3,0.3215,1.7734,-0.5102,-0.5539,1.3644,1.7087,-0.5958,1.6516,-0.6232,-0.4645,-0.6100,-0.5137,1.6502,-0.6070,-0.6301,-0.4697,-0.6827,-0.4497,1.8122,1.6367,-0.4415,1.9765,1.6507,-0.6051,-0.5219,1.3754,1.9917,1.4305,-0.6094,-0.3874,-0.7310,-0.6785,1.1953,-0.4928,-0.3229,-0.5362,-0.0440,-0.0379,1.3227,-1.0892,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,-0.3255,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,2.9915,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,3.2084,-0.5437,-0.3054,-0.1608,-0.1232,-0.4395,-0.2604,-0.3599,2.3589,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,1.5171,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,3.6013,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,-0.2171,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,0.4930,0.3074,1.0234,-1.4012,-0.2286,-0.9674,0
4,0.3215,1.7734,-0.5102,-0.5539,1.3644,1.7087,-0.5958,1.6516,-0.6232,-0.4645,-0.6100,-0.5137,1.6502,-0.6070,-0.6301,-0.4697,-0.6827,-0.4497,1.8122,1.6367,-0.4415,1.9765,1.6507,-0.6051,-0.5219,1.3754,1.9917,1.4305,-0.6094,-0.3874,-0.7310,-0.6785,1.1953,-0.4928,-0.3229,-0.5362,-0.0440,-0.0379,1.3227,-1.0892,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,-0.3255,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,2.9915,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,3.2084,-0.5437,-0.3054,-0.1608,-0.1232,-0.4395,-0.2604,-0.3599,2.3589,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,1.5171,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,3.6013,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,-0.2171,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,1.1348,0.3074,1.0234,-1.4012,-0.2286,-0.9674,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2197286,0.3215,1.7734,1.9601,1.8053,1.3644,1.7087,1.6784,1.6516,1.6047,2.1530,1.6393,1.9468,1.6502,1.6475,1.5872,2.1289,1.4648,2.2235,1.8122,1.6367,2.2648,1.9765,1.6507,1.6527,-0.5219,1.3754,1.9917,1.4305,1.6408,1.2476,1.0904,-0.6357,-0.8366,-0.4928,3.0970,-0.5362,-0.0440,-0.0379,-0.7560,0.9181,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,3.0719,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,-0.3343,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,-0.3117,-0.5437,3.2740,-0.1608,-0.1232,2.2752,-0.2604,-0.3599,-0.4239,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,-0.6591,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,-0.2777,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,4.6068,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,0.4930,-0.2866,1.0234,0.5610,-1.6849,1.4645,1
2197287,0.3215,1.7734,1.9601,1.8053,1.3644,1.7087,1.6784,1.6516,1.6047,2.1530,1.6393,1.9468,1.6502,1.6475,1.5872,2.1289,1.4648,2.2235,1.8122,1.6367,2.2648,1.9765,1.6507,1.6527,-0.5219,1.3754,1.9917,1.4305,1.6408,1.2476,-0.2121,-0.6357,-0.8366,-0.4928,3.0970,-0.5362,-0.0440,-0.0379,-0.7560,0.9181,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,3.0719,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,-0.3343,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,-0.3117,-0.5437,3.2740,-0.1608,-0.1232,2.2752,-0.2604,-0.3599,-0.4239,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,-0.6591,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,-0.2777,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,4.6068,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,-0.1489,-1.1776,1.0234,0.5610,-1.6849,1.4645,1
2197288,0.3215,1.7734,1.9601,1.8053,1.3644,1.7087,1.6784,1.6516,1.6047,2.1530,1.6393,1.9468,1.6502,1.6475,1.5872,2.1289,1.4648,2.2235,1.8122,1.6367,2.2648,1.9765,1.6507,1.6527,-0.5219,1.3754,1.9917,1.4305,1.6408,1.2476,-0.7310,-0.6357,1.1953,-0.4928,-0.3229,-0.5362,-0.0440,-0.0379,-0.7560,0.9181,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,3.0719,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,-0.3343,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,-0.3117,-0.5437,3.2740,-0.1608,-0.1232,2.2752,-0.2604,-0.3599,-0.4239,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,-0.6591,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,-0.2777,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,4.6068,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,-0.1489,-1.7715,1.0234,0.5610,-1.6849,1.4645,1
2197289,0.3215,1.7734,1.9601,1.8053,1.3644,1.7087,1.6784,1.6516,1.6047,2.1530,1.6393,1.9468,1.6502,1.6475,1.5872,2.1289,1.4648,2.2235,1.8122,1.6367,2.2648,1.9765,1.6507,1.6527,-0.5219,1.3754,1.9917,1.4305,1.6408,1.2476,-0.1191,-0.6357,-0.8366,-0.4928,3.0970,-0.5362,-0.0440,-0.0379,-0.7560,0.9181,-0.1030,-0.2880,-0.0790,-0.0519,-0.0336,-0.1671,-0.0446,-0.0696,-0.0804,-0.1312,3.0719,-0.0787,-0.0699,-0.0163,-0.0225,-0.0825,-0.0164,-0.0671,-0.0240,-0.0502,-0.0513,-0.0769,-0.0400,-0.0357,-0.0228,-0.0151,-0.0106,-0.0117,-0.0135,-0.0083,-0.0080,-0.0113,-0.3623,-0.6818,-0.0042,-0.0019,-0.0018,-0.3343,-0.1873,-0.2764,-0.1109,-0.1903,-0.1242,-0.0387,-0.1012,-0.0520,-0.0505,-0.1690,-0.1012,-0.0271,-0.0641,-0.0495,-0.3394,-0.0327,-0.0535,-0.0450,-0.0304,-0.0263,-0.6818,-0.1330,-0.1023,-0.3117,-0.5437,3.2740,-0.1608,-0.1232,2.2752,-0.2604,-0.3599,-0.4239,-0.2419,-0.2715,-0.2085,-0.6818,-0.1963,-0.6591,-0.5600,-0.2913,-0.1097,-0.0463,-0.1187,-0.2777,-0.1969,-0.2278,-0.1705,-0.1397,-0.1893,-0.2443,-0.1731,-0.1898,4.6068,-0.1711,-0.1268,-0.1202,-0.1135,-0.1937,-0.0707,-0.1556,-0.2123,-0.1335,-0.1408,-0.1885,-0.3691,-0.3638,1.4871,-0.8532,-0.2579,-0.1784,-0.2320,-0.1672,-0.1761,1.7344,-0.8911,-0.2701,-0.1860,-0.2230,-0.1710,-0.2040,-0.2030,-1.4326,-0.5836,1.0234,0.5610,-1.6849,1.4645,1


In [24]:
processed_train_dataset.to_csv('../data/processed/processed_data.csv.zip', index=False, compression='zip')

### Conclusion

In this phase:

- Missing values were imputed using the most frequent value (mode).
- Label Encoding and One-Hot Encoding techniques were applied to encode categorical features.
- Numerical columns were scaled using the Standard Scaler.
- The scaler object has been saved as a .pkl file for future use during model deployment and individual predictions.
