In [None]:
# Mount Google Drive so the dataset files stored there can be read by this notebook.
# Running this cell asks for Google Drive access permission.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install xgboost
!pip install lightgbm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import VotingRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline


In [None]:
# Load the train and test CSV files from the mounted Google Drive folder.
train_df = pd.read_csv("/content/drive/MyDrive/AmazonML/train.csv")
test_df = pd.read_csv("/content/drive/MyDrive/AmazonML/test.csv")



In [None]:
# Preview the first five rows of the training data.
train_df.head(n=5)

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
0,1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
1,2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2,2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
3,1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
4,283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [None]:
# Preview the first five rows of the test data.
test_df.head(n=5)

Unnamed: 0,PRODUCT_ID,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID
0,604373,Manuel d'Héliogravure Et de Photogravure En Re...,,,6142
1,1729783,DCGARING Microfiber Throw Blanket Warm Fuzzy P...,[QUALITY GUARANTEED: Luxury cozy plush polyest...,<b>DCGARING Throw Blanket</b><br><br> <b>Size ...,1622
2,1871949,I-Match Auto Parts Front License Plate Bracket...,"[Front License Plate Bracket Made Of Plastic,D...",Replacement for The Following Vehicles:2020 LE...,7540
3,1107571,PinMart Gold Plated Excellence in Service 1 Ye...,[Available as a single item or bulk packed. Se...,Our Excellence in Service Lapel Pins feature a...,12442
4,624253,"Visual Mathematics, Illustrated by the TI-92 a...",,,6318


In [None]:

# Feature engineering
def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model input frame from a raw product DataFrame.

    The ``TITLE``, ``DESCRIPTION`` and ``BULLET_POINTS`` columns are joined
    (in that order, separated by a single space, with missing values replaced
    by empty strings) into one ``text`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the columns ``TITLE``, ``DESCRIPTION``,
        ``BULLET_POINTS`` and ``PRODUCT_TYPE_ID``.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns ``text`` and ``PRODUCT_TYPE_ID`` (in that
        order) on the same index as ``df``. ``df`` itself is not modified.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    text = (
        df['TITLE'].fillna("")
        + " "
        + df['DESCRIPTION'].fillna("")
        + " "
        + df['BULLET_POINTS'].fillna("")
    )
    # Assign onto a fresh copy instead of writing a new column into the caller's
    # frame: re-running the cell stays idempotent and the raw data is untouched.
    features = df[['PRODUCT_TYPE_ID']].assign(text=text)
    return features[['text', 'PRODUCT_TYPE_ID']]

# Build the feature frames and the regression target.
X = preprocess_data(train_df)
y = train_df['PRODUCT_LENGTH']
X_test = preprocess_data(test_df)

# Hold out 20% of the training rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Vocabulary cap for the TF-IDF encoding of the `text` column.
# NOTE(review): 5000 is a starting value chosen to bound memory, not a tuned one.
MAX_TEXT_FEATURES = 5000

# The gradient-boosted models need numeric input, so the free-text column is
# TF-IDF encoded while PRODUCT_TYPE_ID is passed through unchanged.
# 'text' is given as a scalar column name because TfidfVectorizer expects a
# 1-D collection of documents; the passthrough column is selected as a list (2-D).
feature_encoder = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(max_features=MAX_TEXT_FEATURES), 'text'),
        ('type_id', 'passthrough', ['PRODUCT_TYPE_ID']),
    ]
)

# Define the base models
xgb_model = xgb.XGBRegressor(random_state=42)
lgb_model = lgb.LGBMRegressor(random_state=42)

# Define the ensemble model (averages the predictions of both regressors)
ensemble_model = VotingRegressor(estimators=[('xgb', xgb_model), ('lgb', lgb_model)])

# Keeping the encoder inside the pipeline means it is re-fitted on the training
# part of every CV fold, so no vocabulary statistics leak from validation folds.
model_pipeline = Pipeline(
    steps=[
        ('features', feature_encoder),
        ('ensemble', ensemble_model),
    ]
)

# Hyperparameter tuning: parameter names are prefixed with the pipeline step
# name ('ensemble') followed by the estimator name inside the VotingRegressor.
param_distributions = {
    'ensemble__xgb__n_estimators': [100, 500, 1000],
    'ensemble__xgb__max_depth': [3, 5, 7],
    'ensemble__lgb__n_estimators': [100, 500, 1000],
    'ensemble__lgb__max_depth': [3, 5, 7]
}

random_search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_distributions,
    scoring='neg_mean_absolute_percentage_error',
    n_iter=10,
    cv=3,
    random_state=42,
    n_jobs=-1
)

# Train the model with hyperparameter tuning
random_search.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = random_search.predict(X_val)

# Calculate the evaluation metric: 100 * (1 - MAPE), floored at 0.
# `Series.to_numpy()` is the pandas accessor for the underlying array
# (`to_array()` does not exist on a Series and raises AttributeError).
score = max(0, 100 * (1 - mean_absolute_percentage_error(y_val.to_numpy(), y_pred)))
print(f"Score: {score:.4f}")

# Make predictions on the test set
predictions = random_search.predict(X_test)

# Prepare the submission file: PRODUCT_ID becomes the index, so the CSV holds
# exactly the two columns PRODUCT_ID and PRODUCT_LENGTH.
submission_df = pd.DataFrame(
    {"PRODUCT_ID": test_df["PRODUCT_ID"].to_numpy(), "PRODUCT_LENGTH": predictions}
).set_index("PRODUCT_ID")
submission_df.to_csv("submission.csv")