In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


def feature_engineering(df_train: pd.DataFrame, df_test: pd.DataFrame):
    # Handle categorical to integer transformation for 'Gender'
    gender_mapping = {"F": 0, "M": 1}
    df_train["Gender"] = df_train["Gender"].map(gender_mapping)
    df_test["Gender"] = df_test["Gender"].map(gender_mapping)

    # Columns to encode
    cols = ["Age", "City_Category", "Stay_In_Current_City_Years"]

    # Combine train and test for consistent encoding
    combined_df = pd.concat([df_train[cols], df_test[cols]], axis=0)

    # Initialize the LabelEncoder
    le = LabelEncoder()

    # Apply LabelEncoder to each column and transform back to DataFrame
    for col in cols:
        combined_df[col] = le.fit_transform(combined_df[col])

    # Split the combined data back into train and test sets
    df_train[cols] = combined_df.iloc[: len(df_train), :]
    df_test[cols] = combined_df.iloc[len(df_train) :, :]

    df_train = df_train[df_train.Product_Category_1 > 2.5]
    df_train.reset_index(drop=True, inplace=True)
    df_train["Purchase"] = np.log1p(df_train["Purchase"])

    return df_train, df_test


def basic_preprocessing(df_train: pd.DataFrame, df_test: pd.DataFrame):
    df_train["Stay_In_Current_City_Years"] = df_train[
        "Stay_In_Current_City_Years"
    ].str.replace("+", "")
    df_train["Stay_In_Current_City_Years"] = df_train[
        "Stay_In_Current_City_Years"
    ].astype(int)

    df_test["Stay_In_Current_City_Years"] = df_test[
        "Stay_In_Current_City_Years"
    ].str.replace("+", "")
    df_test["Stay_In_Current_City_Years"] = df_test[
        "Stay_In_Current_City_Years"
    ].astype(int)

    ## Dropping User_id and Product_ID
    df_train = df_train.drop("User_ID", axis=1)
    df_test = df_test.drop("User_ID", axis=1)
    df_train = df_train.drop("Product_ID", axis=1)
    df_test = df_test.drop("Product_ID", axis=1)

    df_train = df_train.drop("Product_Category_3", axis=1)
    df_test = df_test.drop("Product_Category_3", axis=1)

    ## Imputing missing values with mode
    df_train["Product_Category_2"].mode()[0]
    df_train["Product_Category_2"] = df_train["Product_Category_2"].fillna(
        df_train["Product_Category_2"].mode()[0]
    )
    df_train.isnull().sum()

    df_test["Product_Category_2"].mode()[0]
    df_test["Product_Category_2"] = df_test["Product_Category_2"].fillna(
        df_test["Product_Category_2"].mode()[0]
    )
    df_test.isnull().sum()

    return df_train, df_test


ImportError: cannot import name 'ComplexWarning' from 'numpy.core.numeric' (c:\Users\CAMNG3\Desktop\gcp-ml-specialisation-demo2\env\Lib\site-packages\numpy\core\numeric.py)

In [None]:
df_train = pd.read_csv(r'C:\Users\CAMNG3\Downloads\train.csv')
df_test = pd.read_csv(r'C:\Users\CAMNG3\Downloads\test.csv')

df_train, df_test = basic_preprocessing(df_train, df_test)

df_train, df_test = feature_engineering(df_train, df_test)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["Purchase"] = np.log1p(df_train["Purchase"])


In [3]:
# X=df_train.drop('Purchase',axis=1)
# y=df_train['Purchase']

# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# import xgboost as xgb
# xgb_reg = xgb.XGBRegressor(learning_rate=1.0, max_depth=6, min_child_weight=40, seed=0)

# xgb_reg.fit(X_train, y_train)

# xgb_y_pred = xgb_reg.predict(X_test)
# print('MAE',mean_absolute_error(y_test, xgb_y_pred))
# print('MSE',mean_squared_error(y_test, xgb_y_pred))
# print('R2_Score',r2_score(y_test, xgb_y_pred))
# from math import sqrt
# print("RMSE of XGBoost Model is ",sqrt(mean_squared_error(y_test, xgb_y_pred)))

In [4]:
# train using dmatrix
import xgboost as xgb
from math import sqrt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

X = df_train.drop('Purchase', axis=1)
y = df_train['Purchase']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fitting Random Forest Regression to the dataset
regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
 
# Fit the regressor with x and y data
regressor.fit(X, y)

# Make predictions
predictions = regressor.predict(X_test)

# Evaluate model
print('MAE', mean_absolute_error(np.array(y_test), predictions))
print('MSE', mean_squared_error(y_test, predictions))
print('R2_Score', r2_score(y_test, predictions))
print("RMSE of XGBoost Model is ", sqrt(mean_squared_error(y_test, predictions)))

ImportError: cannot import name 'ComplexWarning' from 'numpy.core.numeric' (c:\Users\CAMNG3\Desktop\gcp-ml-specialisation-demo2\env\Lib\site-packages\numpy\core\numeric.py)

In [143]:
pip install scikit-learn==1.3

^C
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.

[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting scikit-learn==1.3
  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/9.2 MB 682.7 kB/s eta 0:00:14
    --------------------------------------- 0.2/9.2 MB 3.0 MB/s eta 0:00:04
   -- ------------------------------------- 0.7/9.2 MB 6.0 MB/s eta 0:00:02
   ------ --------------------------------- 1.4/9.2 MB 8.9 MB/s eta 0:00:01
   --------- ------------------------------ 2.2/9.2 MB 10.9 MB/s eta 0:00:01
   ------------- -------------------------- 3.2/9.2 MB 12.7 MB/s eta 0:00:01
   ------------------- -------------------- 4.5/9.2 MB 15.2 MB/s eta 0:00:01
   ------------------------- -------------- 6.0/9.2 MB 17.3 MB/s eta 0:00:01
   ------------------------------- -------- 7.2/9.2 MB 18.4 MB/s eta 0:00:01
   ------------------------------------ --- 8.3/9.2 MB 19.0 MB/s eta 

# Call the endpoint

In [97]:
from google.cloud import aiplatform 

PROJECT_NUMBER='121050757542'
ENDPOINT_ID='9119259280820666368'

endpoint_name = f"projects/{PROJECT_NUMBER}/locations/us-central1/endpoints/{ENDPOINT_ID}"
endpoint = aiplatform.Endpoint(endpoint_name=endpoint_name)


In [130]:

endpoint.predict(instances=xgb.DMatrix([[1,2,3,4,5,6,87,9]]).get_data().A.tolist())

InternalServerError: 500 {"detail":"The following exception has occurred: TypeError. Arguments: (\"Not supported type for data.<class 'xgboost.core.DMatrix'>\",)."}