# Import libraries

In [18]:
from typing import NamedTuple
import pandas as pd 
import os 
from sklearn.ensemble import RandomForestRegressor
from matplotlib import pyplot as plt
%matplotlib inline


import google.cloud.aiplatform as aip
from google.cloud import storage  # noqa: F401

from kfp import dsl
from kfp.v2 import compiler
from kfp.v2.dsl import (component,
                        Dataset,
                        Input,
                        Model,
                        Output,
                        Metrics,
                        ClassificationMetrics, 
                        OutputPath,
                        InputPath)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from scipy.stats import randint

import numpy as np
from math import sqrt

from sklearn.model_selection import (
        RandomizedSearchCV,
    )

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Preprocessing

In [26]:
# Load the raw training CSV from the bucket named by the BUCKET_URI environment
# variable and apply basic cleaning.
#
# Fail fast with a clear message when BUCKET_URI is unset: os.getenv would return
# None and the string join below would raise an opaque TypeError instead.
bucket_uri = os.getenv("BUCKET_URI")
if bucket_uri is None:
    raise EnvironmentError("The BUCKET_URI environment variable is not set")

# NOTE: plain concatenation is used, so BUCKET_URI is expected to end with "/".
df_train_uri = "".join([bucket_uri, "raw_data/", "train.csv"])

df_train = pd.read_csv(df_train_uri)

# Strip the literal "+" character so the column can be cast to int.
# regex=False makes the literal match explicit; "+" is a regex metacharacter and
# the default value of `regex` differs between pandas versions.
df_train["Stay_In_Current_City_Years"] = (
    df_train["Stay_In_Current_City_Years"]
    .str.replace("+", "", regex=False)
    .astype(int)
)

## Dropping User_ID, Product_ID and Product_Category_3
df_train = df_train.drop(columns=["User_ID", "Product_ID", "Product_Category_3"])

## Imputing missing values with mode
# Compute the mode once and reuse it for the fill.
product_category_2_mode = df_train["Product_Category_2"].mode()[0]
df_train["Product_Category_2"] = df_train["Product_Category_2"].fillna(
    product_category_2_mode
)


In [27]:
# Show how many rows fall under each Occupation code, most frequent first.
df_train["Occupation"].value_counts()

Occupation
4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
Name: count, dtype: int64

# Data Transformation

In [5]:
# Encode the categorical features as integers and log-transform the target.
#
# NOTE: this cell mutates df_train in place and is not idempotent: running it a
# second time maps the already-encoded Gender values (0/1) to NaN and applies
# log1p to Purchase again. Re-run the preprocessing cell first if it must be re-run.

# Handle categorical to integer transformation for 'Gender'
# (values that are not keys of the mapping become NaN).
gender_mapping = {"F": 0, "M": 1}
df_train["Gender"] = df_train["Gender"].map(gender_mapping)

# Columns to encode
cols = ["Age", "City_Category", "Stay_In_Current_City_Years"]

# Take an explicit copy: assigning into a plain df_train[cols] slice is what
# raised pandas' SettingWithCopyWarning.
combined_df = df_train[cols].copy()

# Initialize the LabelEncoder
le = LabelEncoder()

# Label-encode each column independently. The encoder is re-fitted on every
# iteration, so after the loop `le` only holds the classes of the last column.
for col in cols:
    combined_df[col] = le.fit_transform(combined_df[col])

# Write the encoded columns back into the training frame
df_train[cols] = combined_df

# Model log(1 + Purchase) instead of the raw target; every metric computed on
# this column downstream is therefore on the log scale.
df_train["Purchase"] = np.log1p(df_train["Purchase"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df[col] = le.fit_transform(combined_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df[col] = le.fit_transform(combined_df[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df[col] = le.fit_transform(combined_df[col])


In [6]:
# List the columns that remain in df_train at this point.
df_train.columns

Index(['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Purchase'],
      dtype='object')

# With and without demographic features

In [7]:
# Two views of the training data for the comparison below:
#   - with demographics: every column of df_train (same object, not a copy)
#   - without demographics: only the columns listed here
non_demog_columns = [
    "City_Category",
    "Stay_In_Current_City_Years",
    "Product_Category_1",
    "Product_Category_2",
    "Purchase",
]

df_train_without_demog = df_train[non_demog_columns]
df_train_with_demog = df_train


In [55]:
# Datasets to compare, in order: index 0 includes the demographic columns,
# index 1 excludes them.
dfs = [df_train_with_demog,df_train_without_demog]

In [8]:
# Row counts per encoded Gender value (gender_mapping: F -> 0, M -> 1).
df_train_with_demog["Gender"].value_counts()

Gender
1    414259
0    135809
Name: count, dtype: int64

In [9]:
# Row counts per label-encoded Age code.
df_train_with_demog["Age"].value_counts()

Age
2    219587
3    110013
1     99660
4     45701
5     38501
6     21504
0     15102
Name: count, dtype: int64

In [10]:
# Row counts per Marital_Status value.
df_train_with_demog["Marital_Status"].value_counts()

Marital_Status
0    324731
1    225337
Name: count, dtype: int64

In [11]:
# Row counts per Occupation code, read through the with-demographics frame.
df_train_with_demog["Occupation"].value_counts()

Occupation
4     72308
0     69638
7     59133
1     47426
17    40043
20    33562
12    31179
14    27309
2     26588
16    25371
6     20355
3     17650
10    12930
5     12177
15    12165
11    11586
19     8461
13     7728
18     6622
9      6291
8      1546
Name: count, dtype: int64

# Training

In [56]:
# Train and evaluate one random forest per dataset in `dfs` (index 0 includes the
# demographic columns, index 1 excludes them) and print hold-out metrics.
# "Purchase" is the target; if it was log1p-transformed upstream, the errors
# below are on that log scale rather than in original purchase units.
for idx, df in enumerate(dfs):

    if idx == 0:
        print("Analyzing the dataset, including demographic information")
    else:
        print("Analyzing the dataset, excluding demographic information")
    print(df.columns)

    # Use loop-local names for the split so the notebook-level `df_train` is not
    # overwritten with an 80% subset (which would silently change the result of
    # re-running earlier cells).
    train_split, test_split = train_test_split(
        df,
        test_size=.2,
        random_state=42
    )

    x_train = train_split.drop("Purchase", axis=1)
    y_train = np.array(train_split["Purchase"])

    x_test = test_split.drop("Purchase", axis=1)
    y_test = np.array(test_split["Purchase"])

    # oob_score is left at its default (False): the out-of-bag score was never
    # read, and with only 10 trees scikit-learn warns that some samples have no
    # OOB prediction. Re-enable it together with a larger n_estimators if an OOB
    # estimate is needed.
    regressor = RandomForestRegressor(
        max_depth=13,
        n_estimators=10,
        random_state=42
    )

    regressor.fit(x_train, y_train)

    y_pred = regressor.predict(x_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # RMSE is the square root of the MSE computed above; no need to recompute it.
    rmse = sqrt(mse)

    print(f"Mean Absolute Error: {mae}")
    print(f"R2 Score:  {r2}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Squared Error: {mse}")

    print("--------------------------------------------------------------")


Analyzing the dataset, including demographic information
Index(['Gender', 'Age', 'Occupation', 'City_Category',
       'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1',
       'Product_Category_2', 'Purchase'],
      dtype='object')


  warn(


Mean Absolute Error: 0.28085443420246403
R2 Score:  0.7418593983646372
Root Mean Squared Error: 0.37474516852213763
Mean Squared Error: 0.14043394133068532
--------------------------------------------------------------
Analyzing the dataset, excluding demographic information
Index(['City_Category', 'Stay_In_Current_City_Years', 'Product_Category_1',
       'Product_Category_2', 'Purchase'],
      dtype='object')
Mean Absolute Error: 0.285126732821897
R2 Score:  0.7357490703607114
Root Mean Squared Error: 0.3791544393316921
Mean Squared Error: 0.14375808886492975
--------------------------------------------------------------


  warn(
