In [77]:
from math import sqrt
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

from scipy.stats import randint

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import (
        RandomizedSearchCV,
        train_test_split,
    )
from sklearn.preprocessing import LabelEncoder



def feature_engineering(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Integer-encode the categorical features and log-transform the target.

    Steps, applied to copies of the inputs:

    * ``Gender`` is mapped with ``{"F": 0, "M": 1}``; any other value becomes
      NaN because ``Series.map`` leaves unmapped keys as missing.
    * ``Age``, ``City_Category`` and ``Stay_In_Current_City_Years`` are
      label-encoded on the train and test values together, so a given
      category receives the same code in both frames.
    * ``Purchase`` (train only) is replaced by ``log1p(Purchase)``.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training rows; must contain the four feature columns and ``Purchase``.
    df_test : pd.DataFrame
        Test rows; must contain the four feature columns.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)`` as new frames. The training frame has a fresh
        0..n-1 index; the test frame keeps its original index.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    # Copy first: the original mutated the caller's frames, so running the
    # cell a second time mapped the already-numeric Gender to NaN and applied
    # log1p twice.
    df_train = df_train.copy()
    df_test = df_test.copy()

    # Handle categorical to integer transformation for 'Gender'
    gender_mapping = {"F": 0, "M": 1}
    df_train["Gender"] = df_train["Gender"].map(gender_mapping)
    df_test["Gender"] = df_test["Gender"].map(gender_mapping)

    # Columns to encode
    cols = ["Age", "City_Category", "Stay_In_Current_City_Years"]
    n_train = len(df_train)

    for col in cols:
        # Fit on train + test together so both share one code table.
        stacked = pd.concat([df_train[col], df_test[col]], ignore_index=True)
        codes = LabelEncoder().fit_transform(stacked)

        # Write the codes back by position. Index-aligned assignment breaks
        # when the incoming index contains duplicates.
        df_train[col] = codes[:n_train]
        df_test[col] = codes[n_train:]

    df_train = df_train.reset_index(drop=True)
    df_train["Purchase"] = np.log1p(df_train["Purchase"])

    return df_train, df_test


def _stay_years_to_int(years: pd.Series) -> pd.Series:
    """Convert values such as ``"4+"`` or ``"2"`` to the integers 4 and 2."""
    # regex=False: "+" is a regex quantifier, and the default of ``regex``
    # differs between pandas versions, so request a literal replacement.
    # astype(str) lets an already-numeric column pass through unchanged.
    return years.astype(str).str.replace("+", "", regex=False).astype(int)


def basic_preprocessing(df_train: pd.DataFrame, df_test: pd.DataFrame):
    """Clean the raw train/test frames before feature engineering.

    For both frames:

    * drop ``User_ID``, ``Product_ID`` and ``Product_Category_3``;
    * strip the ``"+"`` from ``Stay_In_Current_City_Years`` and cast to int;
    * fill missing ``Product_Category_2`` with the most frequent value of the
      *training* column, so both frames are imputed with the same value.

    The inputs are not modified; new frames are returned.

    Parameters
    ----------
    df_train, df_test : pd.DataFrame
        Raw frames containing the columns named above.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)`` after cleaning.

    Raises
    ------
    KeyError
        If one of the expected columns is missing.
    ValueError
        If a ``Stay_In_Current_City_Years`` value cannot be parsed as an int.
    """
    dropped_cols = ["User_ID", "Product_ID", "Product_Category_3"]

    cleaned = []
    for frame in (df_train, df_test):
        # drop() returns a new frame, so the caller's data is left untouched
        # and the function can be re-run on the same raw inputs.
        frame = frame.drop(columns=dropped_cols)
        frame["Stay_In_Current_City_Years"] = _stay_years_to_int(
            frame["Stay_In_Current_City_Years"]
        )
        cleaned.append(frame)
    df_train, df_test = cleaned

    ## Imputing missing values with mode
    # The fill value comes from the training data only and is reused for the
    # test data, so both frames are imputed consistently.
    train_modes = df_train["Product_Category_2"].mode()
    if not train_modes.empty:
        fill_value = train_modes[0]
        df_train["Product_Category_2"] = df_train["Product_Category_2"].fillna(
            fill_value
        )
        df_test["Product_Category_2"] = df_test["Product_Category_2"].fillna(
            fill_value
        )
    # If the training column has no observed value there is nothing to impute
    # with; missing entries are left as they are.

    return df_train, df_test


In [14]:
# Input and output folders are named once here so they can be changed in a
# single place instead of inside each read/write call.
RAW_DATA_DIR = Path(r'C:\Users\CAMNG3\Downloads')
PROCESSED_DATA_DIR = Path("data")

df_train = pd.read_csv(RAW_DATA_DIR / "train.csv")
df_test = pd.read_csv(RAW_DATA_DIR / "test.csv")

df_train, df_test = basic_preprocessing(df_train, df_test)
df_train, df_test = feature_engineering(df_train, df_test)

# to_csv does not create missing folders, so make sure the target exists.
# Building the path with Path (rather than a ".\data\..." literal) also keeps
# the output location valid on non-Windows systems.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
df_train.to_csv(PROCESSED_DATA_DIR / "pre_processed_train.csv", index=False)
df_test.to_csv(PROCESSED_DATA_DIR / "pre_processed_test.csv", index=False)



In [15]:
# Target column on one side, every remaining column as features on the other.
y = df_train['Purchase']
X = df_train.drop(columns=['Purchase'])

# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


# XGB Regressor

In [81]:
# Fit an XGBoost regressor on the training split.
xgb_reg = xgb.XGBRegressor(learning_rate=0.5, max_depth=15, seed=0)

xgb_reg.fit(X_train, y_train)

# Scores on the rows the model was fitted on. The target was log1p-transformed
# during feature engineering, so every error below is in log units.
xgb_y_pred = xgb_reg.predict(X_train)
xgb_train_mse = mean_squared_error(y_train, xgb_y_pred)  # reused for the RMSE
print('Training set MAE',mean_absolute_error(y_train, xgb_y_pred))
print('Training set MSE',xgb_train_mse)
print('Training R2_Score',r2_score(y_train, xgb_y_pred))
print("Training RMSE of XGBoost Model is ",sqrt(xgb_train_mse))

print("----------------------------------")

# Scores on the held-out split; xgb_y_pred ends the cell holding these
# held-out predictions.
xgb_y_pred = xgb_reg.predict(X_test)
xgb_test_mse = mean_squared_error(y_test, xgb_y_pred)
print('Test set MAE',mean_absolute_error(y_test, xgb_y_pred))
print('Test set MSE',xgb_test_mse)
print('Test R2_Score',r2_score(y_test, xgb_y_pred))
print("Test RMSE of XGBoost Model is ",sqrt(xgb_test_mse))

XGB Test set MAE 0.2859335382173821
XGB Test set MSE 0.15213890327265409
XGB Test R2_Score 0.7206602250216348
XGB Test RMSE of XGBoost Model is  0.3900498728017407
----------------------------------
Random Forest Test set MAE 0.28264333427329164
Random Forest Test set MSE 0.1463590336157364
Random Forest Test R2_Score 0.7312725500393469
Random Forest Test RMSE of Regressor Model is  0.3825689919684244


# RandomForest Regressor

In [75]:
# Fitting Random Forest Regression to the dataset
# Baseline random forest: ten trees, fixed seed, fitted on the training split.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)
regressor.fit(X_train, y_train)

# --- scores on the training split ---
regr_y_pred = regressor.predict(X_train)
rf_train_mse = mean_squared_error(y_train, regr_y_pred)

print('Training set MAE', mean_absolute_error(y_train, regr_y_pred))
print('Training set MSE', rf_train_mse)
print('Training R2_Score', r2_score(y_train, regr_y_pred))
print("Training RMSE of Regressor Model is ", sqrt(rf_train_mse))

print("----------------------------------")

# --- scores on the held-out split (regr_y_pred keeps these predictions) ---
regr_y_pred = regressor.predict(X_test)
rf_test_mse = mean_squared_error(y_test, regr_y_pred)

print('Test set MAE', mean_absolute_error(y_test, regr_y_pred))
print('Test set MSE', rf_test_mse)
print('Test R2_Score', r2_score(y_test, regr_y_pred))
print("Test RMSE of Regressor Model is ", sqrt(rf_test_mse))



Training set MAE 0.23634514883449112
Training set MSE 0.10665119059453569
Training R2_Score 0.8051570567953704
Training RMSE of Regressor Model is  0.32657493871167714
----------------------------------
Test set MAE 0.28264333427329164
Test set MSE 0.1463590336157364
Test R2_Score 0.7312725500393469
Test RMSE of Regressor Model is  0.3825689919684244


# HyperParameter tuning

In [83]:
# Base estimator for the search; n_estimators here is overridden by whatever
# value the search samples from param_dist.
regressor = RandomForestRegressor(n_estimators=10, random_state=0)

# scipy's randint(low, high) draws from low..high-1 (upper bound excluded).
param_dist = {
    "max_depth": randint(5, 15),  # Maximum depth of the tree
    "n_estimators": randint(9, 13),
    "min_samples_leaf": randint(1, 3),
}

# Number of parameter combinations to try.
param_comb = 5

random_search = RandomizedSearchCV(
    regressor,
    param_distributions=param_dist,
    n_iter=param_comb,
    scoring="neg_root_mean_squared_error",
    cv=10,
    verbose=0,
    # Without a seed the sampled combinations change on every run, so the
    # selected model and the scores printed below would not be reproducible.
    random_state=0,
)

random_search.fit(X_train, y_train)
regressor_best = random_search.best_estimator_

# Evaluate the selected model on the held-out split.
regr_y_pred = regressor_best.predict(X_test)
tuned_test_mse = mean_squared_error(y_test, regr_y_pred)
print('Test set MAE',mean_absolute_error(y_test, regr_y_pred))
print('Test set MSE',tuned_test_mse)
print('Test R2_Score',r2_score(y_test, regr_y_pred))
print("Test RMSE of Regressor Model is ",sqrt(tuned_test_mse))

Test set MAE 0.2814727951442279
Test set MSE 0.14050631475177378
Test R2_Score 0.7420186323056361
Test RMSE of Regressor Model is  0.3748417195987845


# Call the endpoint

In [97]:
from google.cloud import aiplatform 

# Identifiers of the deployed endpoint, kept together so that pointing the
# notebook at another project, region or endpoint is a one-line change.
PROJECT_NUMBER='121050757542'
LOCATION='us-central1'
ENDPOINT_ID='9119259280820666368'

# Fully-qualified resource name of the endpoint.
endpoint_name = f"projects/{PROJECT_NUMBER}/locations/{LOCATION}/endpoints/{ENDPOINT_ID}"
endpoint = aiplatform.Endpoint(endpoint_name=endpoint_name)


In [130]:

# One row of eight feature values, sent as plain floats. The previous version
# built an xgboost DMatrix from this list only to convert it straight back to
# a nested list; the request payload is the same without that round trip.
# NOTE(review): the 500 "Not supported type for data.<class 'xgboost.core.DMatrix'>"
# recorded for this cell is reported by the server, so it presumably comes from
# how the deployed model is loaded/served rather than from this request —
# confirm against the deployed model artifact.
instances = [[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 87.0, 9.0]]
endpoint.predict(instances=instances)

InternalServerError: 500 {"detail":"The following exception has occurred: TypeError. Arguments: (\"Not supported type for data.<class 'xgboost.core.DMatrix'>\",)."}