In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


def feature_engineering(df_train: pd.DataFrame, df_test: pd.DataFrame):
    # Handle categorical to integer transformation for 'Gender'
    gender_mapping = {"F": 0, "M": 1}
    df_train["Gender"] = df_train["Gender"].map(gender_mapping)
    df_test["Gender"] = df_test["Gender"].map(gender_mapping)

    # Columns to encode
    cols = ["Age", "City_Category", "Stay_In_Current_City_Years"]

    # Combine train and test for consistent encoding
    combined_df = pd.concat([df_train[cols], df_test[cols]], axis=0)

    # Initialize the LabelEncoder
    le = LabelEncoder()

    # Apply LabelEncoder to each column and transform back to DataFrame
    for col in cols:
        combined_df[col] = le.fit_transform(combined_df[col])

    # Split the combined data back into train and test sets
    df_train[cols] = combined_df.iloc[: len(df_train), :]
    df_test[cols] = combined_df.iloc[len(df_train) :, :]

    df_train = df_train[df_train.Product_Category_1 > 2.5]
    df_train.reset_index(drop=True, inplace=True)
    df_train["Purchase"] = np.log1p(df_train["Purchase"])

    return df_train, df_test


def basic_preprocessing(df_train: pd.DataFrame, df_test: pd.DataFrame):
    df_train["Stay_In_Current_City_Years"] = df_train[
        "Stay_In_Current_City_Years"
    ].str.replace("+", "")
    df_train["Stay_In_Current_City_Years"] = df_train[
        "Stay_In_Current_City_Years"
    ].astype(int)

    df_test["Stay_In_Current_City_Years"] = df_test[
        "Stay_In_Current_City_Years"
    ].str.replace("+", "")
    df_test["Stay_In_Current_City_Years"] = df_test[
        "Stay_In_Current_City_Years"
    ].astype(int)

    ## Dropping User_id and Product_ID
    df_train = df_train.drop("User_ID", axis=1)
    df_test = df_test.drop("User_ID", axis=1)
    df_train = df_train.drop("Product_ID", axis=1)
    df_test = df_test.drop("Product_ID", axis=1)

    df_train = df_train.drop("Product_Category_3", axis=1)
    df_test = df_test.drop("Product_Category_3", axis=1)

    ## Imputing missing values with mode
    df_train["Product_Category_2"].mode()[0]
    df_train["Product_Category_2"] = df_train["Product_Category_2"].fillna(
        df_train["Product_Category_2"].mode()[0]
    )
    df_train.isnull().sum()

    df_test["Product_Category_2"].mode()[0]
    df_test["Product_Category_2"] = df_test["Product_Category_2"].fillna(
        df_test["Product_Category_2"].mode()[0]
    )
    df_test.isnull().sum()

    return df_train, df_test


In [13]:
df_train = pd.read_csv(r'C:\Users\CAMNG3\Downloads\Data\raw_data_train.csv')
df_test = pd.read_csv(r'C:\Users\CAMNG3\Downloads\Data\raw_data_test.csv')

df_train, df_test = basic_preprocessing(df_train, df_test)

df_train, df_test = feature_engineering(df_train, df_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["Purchase"] = np.log1p(df_train["Purchase"])


In [14]:
X=df_train.drop('Purchase',axis=1)
y=df_train['Purchase']

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

from xgboost.sklearn import XGBRegressor
xgb_reg = XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

xgb_reg.fit(X_train, y_train)

xgb_y_pred = xgb_reg.predict(X_test)
print('MAE',mean_absolute_error(y_test, xgb_y_pred))
print('MSE',mean_squared_error(y_test, xgb_y_pred))
print('R2_Score',r2_score(y_test, xgb_y_pred))
from math import sqrt
print("RMSE of XGBoost Model is ",sqrt(mean_squared_error(y_test, xgb_y_pred)))

MAE 0.26984723669053534
MSE 0.13205568902643186
R2_Score 0.7656962333878472
RMSE of XGBoost Model is  0.3633946739103806
