In [1]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer # Reduces words to their base form
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor # Ensembles combine many decision trees to improve performance
from sklearn.model_selection import train_test_split # Splits the data into train and test set

# Shared English Snowball stemmer, reused by every stemming call below
stemmer = SnowballStemmer(language='english')

In [2]:
# Load the raw CSVs into DataFrames.
# DATA_DIR is the only machine-specific setting in this step: point it at the
# folder that holds train.csv, test.csv and product_descriptions.csv.
DATA_DIR = Path(r'C:\Users\vladp\OneDrive\Desktop\Y3\DataScience\assignment3\Datascience2025_A3\Data')

# train/test are read as ISO-8859-1; product_descriptions uses pandas' default encoding
df_train = pd.read_csv(DATA_DIR / 'train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR / 'test.csv', encoding="ISO-8859-1")
df_pro_desc = pd.read_csv(DATA_DIR / 'product_descriptions.csv')

# Number of training rows, kept so the combined frame can be split back later
num_train = df_train.shape[0]

In [3]:
def str_stemmer(s):
    """Return `s` lower-cased with every whitespace-separated token stemmed.

    The text is split on whitespace, each token is passed through the
    module-level `stemmer`, and the stems are re-joined with single spaces.
    """
    tokens = s.lower().split()
    stems = map(stemmer.stem, tokens)
    return " ".join(stems)

def str_common_word(str1, str2):
    """Count the whitespace-separated tokens of `str1` that occur in `str2`.

    Matching is by substring, not by whole word: a token counts as soon as it
    appears anywhere inside `str2`. Repeated tokens in `str1` are counted once
    per occurrence.
    """
    hits = 0
    for token in str1.split():
        if token in str2:
            hits += 1
    return hits

In [4]:
# Stack train and test into a single frame (fresh 0..n-1 index), then attach
# each product's description via a left join on product_uid
df_all = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_all = df_all.merge(df_pro_desc, how='left', on='product_uid')

In [5]:
# Normalise the three free-text columns by running each value through str_stemmer
for text_col in ('search_term', 'product_title', 'product_description'):
    df_all[text_col] = df_all[text_col].map(str_stemmer)

In [6]:
def _query_hits(info, field):
    """Count search-term tokens found in tab-separated field number `field` of `info`.

    Field 0 of `info` is the search term; the count comes from str_common_word.
    """
    parts = info.split('\t')
    return str_common_word(parts[0], parts[field])


# Number of whitespace-separated tokens in each search term
df_all['len_of_query'] = (
    df_all['search_term']
    .map(lambda query: len(query.split()))
    .astype(np.int64)
)

# Pack query, title and description into one tab-separated string per row
df_all['product_info'] = (
    df_all['search_term']
    + "\t" + df_all['product_title']
    + "\t" + df_all['product_description']
)

# How many query tokens show up in the title (field 1) and the description (field 2)
df_all['word_in_title'] = df_all['product_info'].map(lambda info: _query_hits(info, 1))
df_all['word_in_description'] = df_all['product_info'].map(lambda info: _query_hits(info, 2))

In [7]:
# Discard the text columns now that the features are derived from them, then
# cut the combined frame back apart: the first num_train rows are the training
# set, everything after them is the test set
text_columns = ['search_term', 'product_title', 'product_description', 'product_info']
df_all = df_all.drop(columns=text_columns)
df_train = df_all.iloc[:num_train]
df_test = df_all.iloc[num_train:]

In [8]:
# Keep the test-set ids aside; pull the target and the feature matrix out of
# the training frame (every column except id and relevance is a feature)
id_test = df_test['id']
y = df_train['relevance'].values
X = df_train.drop(columns=['id', 'relevance']).values

# Hold out 20% of the labelled rows for validation; the seed fixes the split
(X_train_split,
 X_val_split,
 y_train_split,
 y_val_split) = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Baseline model: a bagging ensemble whose base learner is a small random
# forest, fitted on the 80% training split.
# The timer started here is read again in the evaluation step, after prediction.
clf_start_time = time.time()
rf = RandomForestRegressor(
    n_estimators=15,
    max_depth=6,
    random_state=0,
)
clf = BaggingRegressor(
    rf,
    n_estimators=45,
    max_samples=0.1,
    random_state=25,
)
clf.fit(X_train_split, y_train_split)

In [10]:
from sklearn.metrics import mean_squared_error

# Predict on the 20% validation split and stop the baseline timer.
# NOTE: clf_start_time was taken before training, so the elapsed time covers
# fit + predict (plus any pause between running the two steps interactively).
y_val_pred = clf.predict(X_val_split)
clf_end_time = time.time()
elapsed_time = clf_end_time - clf_start_time

# Root-mean-squared error of the baseline on the validation split
rmse = np.sqrt(mean_squared_error(y_val_split, y_val_pred))
print("Baseline Validation RMSE:", rmse)
print(f"Baseline Processing Time: {elapsed_time:.2f} seconds")

Baseline Validation RMSE: 0.4817214529959809
Baseline Processing Time: 4.95 seconds


In [15]:
# Candidate model: plain linear regression on the same train/validation split
from sklearn.linear_model import LinearRegression

model = LinearRegression()

# Time fit + predict together, then score the predictions on the validation split
start_time = time.time()
y_val_pred = model.fit(X_train_split, y_train_split).predict(X_val_split)
end_time = time.time()
elapsed_time = end_time - start_time

rmse = np.sqrt(mean_squared_error(y_val_split, y_val_pred))
print("Linear Regression Validation RMSE:", rmse)
print(f"Linear Regression Processing Time: {elapsed_time:.2f} seconds")

Linear Regression Validation RMSE: 0.4924510678228164
Linear Regression Processing Time: 0.01 seconds


In [12]:
# Candidate model: ridge regression (alpha=1.0) on the same train/validation split
from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0)

# Time fit + predict together, then score the predictions on the validation split
start_time = time.time()
y_val_pred = model.fit(X_train_split, y_train_split).predict(X_val_split)
end_time = time.time()
elapsed_time = end_time - start_time

rmse = np.sqrt(mean_squared_error(y_val_split, y_val_pred))
print("Ridge Regression Validation RMSE:", rmse)
print(f"Ridge Regression Processing Time: {elapsed_time:.2f} seconds")

Ridge Regression Validation RMSE: 0.49245096473852085
Ridge Regression Processing Time: 0.02 seconds


In [13]:
# Candidate model: gradient boosting regressor (45 trees, depth 6, fixed seed)
# on the same train/validation split
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(
    n_estimators=45,
    max_depth=6,
    random_state=42,
)

# Time fit + predict together, then score the predictions on the validation split
start_time = time.time()
y_val_pred = model.fit(X_train_split, y_train_split).predict(X_val_split)
end_time = time.time()
elapsed_time = end_time - start_time

rmse = np.sqrt(mean_squared_error(y_val_split, y_val_pred))
print("GBR Regression Validation RMSE:", rmse)
print(f"GBR Regression Processing Time: {elapsed_time:.2f} seconds")

GBR Regression Validation RMSE: 0.4801684195221727
GBR Regression Processing Time: 3.45 seconds
