In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from nltk.stem.snowball import SnowballStemmer # Reduces words to their base form
from sklearn.model_selection import train_test_split # Splits the data into train and test set
import time

stemmer = SnowballStemmer('english')

In [18]:
# Load the data into DFs
df_train = pd.read_csv(r'C:\Users\vladp\OneDrive\Desktop\Y3\DataScience\assignment3\Datascience2025_A3\Data\train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(r'C:\Users\vladp\OneDrive\Desktop\Y3\DataScience\assignment3\Datascience2025_A3\Data\test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(r'C:\Users\vladp\OneDrive\Desktop\Y3\DataScience\assignment3\Datascience2025_A3\Data\product_descriptions.csv')

num_train = df_train.shape[0]

In [19]:
# This function stems a string to normalize words
def str_stemmer(s):
	return " ".join([stemmer.stem(word) for word in s.lower().split()])

# This function counts how many words str1 and str2 have in common
def str_common_word(str1, str2):
	return sum(int(str2.find(word) >= 0) for word in str1.split())

In [20]:
# Combines train and test dataframes to a signle one
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_pro_desc, how='left', on='product_uid')

In [21]:
# Applies the stemmer to all text fields
df_all['search_term'] = df_all['search_term'].map(lambda x:str_stemmer(x))
df_all['product_title'] = df_all['product_title'].map(lambda x:str_stemmer(x))
df_all['product_description'] = df_all['product_description'].map(lambda x:str_stemmer(x))

In [22]:
# Determines the num of words in search terms and concats all texts into a single string
df_all['len_of_query'] = df_all['search_term'].map(lambda x:len(x.split())).astype(np.int64)
df_all['product_info'] = df_all['search_term'] + "\t" + df_all['product_title'] + "\t" + df_all['product_description']

# Count how often titles and descriptions contain search words
df_all['word_in_title'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0], x.split('\t')[1]))
df_all['word_in_description'] = df_all['product_info'].map(lambda x:str_common_word(x.split('\t')[0], x.split('\t')[2]))

In [23]:
# Splits combined DF back into train and test set, keeping only the numerical values
df_all = df_all.drop(['search_term','product_title','product_description','product_info'],axis=1)
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]

In [24]:
# Extracts input features and labels
id_test = df_test['id']
y = df_train['relevance'].values
X = df_train.drop(['id','relevance'], axis=1).values

# Split the training set for evaluation (80% train, 20% validation)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [25]:
# Defining the model
gbr = GradientBoostingRegressor(random_state=42)

# Defining the parameter grid for optimization
param_grid = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'min_samples_split': [2, 5, 10]  
}

In [26]:
# Apply RandomizedSearchCV to find best params
random_search = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=param_grid,
    n_iter=50,
    scoring='neg_root_mean_squared_error',
    cv=5,
    random_state=42,
    verbose=1,
    n_jobs=1
)

In [27]:
start_time = time.time()

random_search.fit(X_train_split, y_train_split)

end_time = time.time()
print(f"HPO Processing Time: {end_time - start_time:.2f} seconds")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
HPO Processing Time: 3215.09 seconds


In [28]:
from sklearn.metrics import mean_squared_error

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Make predictions on the validation split
y_pred_optimized = random_search.predict(X_val_split)

# Calculate RMSE
rmse_optimized = np.sqrt(mean_squared_error(y_val_split, y_pred_optimized))
print(f"Optimized GBR RMSE: {rmse_optimized:.4f}")


Best Parameters: {'subsample': 0.9, 'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 3, 'learning_rate': 0.05}
Optimized GBR RMSE: 0.4808
