# FIT5212 Assignment 2

In [13]:
# Mount Google Drive at /content/drive; the CSV files read and written by the
# cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Installations
# The commands below are commented out; presumably they only need to be run
# once on a fresh runtime (TODO confirm why numpy is pinned to 1.23.5).
#!pip install numpy==1.23.5
#!pip install implicit
#!pip install scikit-surprise

Collecting scikit-surprise
  Using cached scikit_surprise-1.1.4.tar.gz (154 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2463298 sha256=a6e247efa3b5e522b968c0fb91f565b1267f87027949314722114421cae06c15
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4
Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-manylinux2014_x86_64.whl (8.9 MB)
[2K   [90m━━━━━━━━━━━━━

Importing Libraries

In [18]:
import pandas as pd
import scipy.sparse as sparse
from implicit.als import AlternatingLeastSquares
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import numpy as np

Import the train and test set

In [15]:
# Load the training and test sets from the mounted Drive folder.
# Later cells read user_id, product_id, rating, votes, helpful_votes and
# product_name from `train`, and ID, user_id, product_id from `test`.
train = pd.read_csv("/content/drive/MyDrive/Semi Structured/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Semi Structured/test.csv")

# ALS Method

## Test 1

In [8]:
# Map raw user and product IDs to contiguous integers (required for matrix indexing)
user_map = {user: idx for idx, user in enumerate(train['user_id'].unique())}
product_map = {product: idx for idx, product in enumerate(train['product_id'].unique())}
user_inv_map = {idx: user for user, idx in user_map.items()}
product_inv_map = {idx: product for product, idx in product_map.items()}

# Add integer IDs
train['user_idx'] = train['user_id'].map(user_map)
train['product_idx'] = train['product_id'].map(product_map)

# Build the sparse matrix in USER-ITEM format (rows = users, columns = products).
# implicit >= 0.5 (0.7.2 is installed per the log above) expects this orientation
# in fit(). Passing an item-user matrix swaps the meaning of user_factors and
# item_factors, so the lookups in predict_score would hit the wrong tables.
# The explicit shape keeps the factor tables aligned with user_map / product_map.
# Note: csr_matrix sums the values of duplicate (user, product) pairs.
rating_matrix = sparse.csr_matrix(
    (train['rating'], (train['user_idx'], train['product_idx'])),
    shape=(len(user_map), len(product_map)),
)

# crafting the model and the specifications
als_model = AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=20,
    use_gpu=False
)
als_model.fit(rating_matrix)

# Map test users and products to internal IDs
test['user_idx'] = test['user_id'].map(user_map)
test['product_idx'] = test['product_id'].map(product_map)

# Users/products never seen in training have no index: mark them with -1 so
# predict_score can fall back to a default rating
test['user_idx'] = test['user_idx'].fillna(-1).astype(int)
test['product_idx'] = test['product_idx'].fillna(-1).astype(int)

# Get user and item factors (one row per user / per product, in map order)
user_factors = als_model.user_factors
item_factors = als_model.item_factors

num_users = user_factors.shape[0]
num_items = item_factors.shape[0]
# Predict rating using dot product of latent vectors
def predict_score(row):
    """Score one test row with the ALS latent factors.

    Reads ``row['user_idx']`` and ``row['product_idx']``. When either index
    falls outside the factor tables (negative, or beyond the last row) the
    constant 3.0 is returned; otherwise the result is the dot product of the
    matching user and item factor vectors.
    """
    user_pos, item_pos = row['user_idx'], row['product_idx']
    user_missing = user_pos < 0 or user_pos >= num_users
    item_missing = item_pos < 0 or item_pos >= num_items
    if user_missing or item_missing:
        return 3.0  # fallback value
    return np.dot(user_factors[user_pos], item_factors[item_pos])

# Score every test row, then bound the scores to the 1-5 rating range (2 d.p.)
test['rating'] = (
    test.apply(predict_score, axis=1)
    .clip(1, 5)
    .round(2)
)

# Write the ID/rating pairs as the ALS submission file
submission = test.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/ALS.csv", index=False)

# SVD

In [5]:
# Keep just the (user, item, rating) triples that Surprise consumes
data = train.loc[:, ['user_id', 'product_id', 'rating']]

# Ratings are declared to live on a 1-5 scale
reader = Reader(rating_scale=(1, 5))

# Wrap the frame in a Surprise Dataset
surprise_data = Dataset.load_from_df(data, reader)

# Hold out 20% of the ratings for validation (Surprise's own splitter)
trainset, valset = train_test_split(surprise_data, test_size=0.2, random_state=42)

# Fit an SVD model with default hyperparameters
svd_model = SVD()
svd_model.fit(trainset)

# Score the held-out ratings; accuracy.rmse prints the RMSE as well as returning it
predictions = svd_model.test(valset)
rmse = accuracy.rmse(predictions)
print(f"Validation RMSE: {rmse:.4f}")

test2 = test.copy()

# Estimate a rating for every (user, product) pair in the test set
test2['rating'] = [
    svd_model.predict(uid, iid).est
    for uid, iid in zip(test2['user_id'], test2['product_id'])
]

# Bound the estimates to the rating scale and keep two decimals
test2['rating'] = test2['rating'].clip(1, 5).round(2)

submission = test2.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/SVD.csv", index=False)

RMSE: 0.9366
Validation RMSE: 0.9366


In [6]:
# Candidate hyperparameter values (2 x 2 x 2 combinations)
param_grid = dict(
    n_factors=[50, 100],
    lr_all=[0.002, 0.005],
    reg_all=[0.02, 0.1],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

Best RMSE: 0.9399456757870149
Best Params: {'n_factors': 100, 'lr_all': 0.005, 'reg_all': 0.1}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7970247a9410>

In [8]:
test3 = test.copy()

# Estimate a rating for each test (user, product) pair with the tuned model
test3['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test3['user_id'], test3['product_id'])
]

# Bound to the valid rating range, two decimals
test3['rating'] = test3['rating'].clip(1, 5).round(2)
print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

# Write the ID/rating pairs in the submission format
submission = test3.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized.csv", index=False)

## Further SVD Parameter Experiments

In [None]:
# Experiment: more factors, slightly higher learning rate and regularisation
param_grid = dict(
    n_factors=[100, 150],
    lr_all=[0.005, 0.007],
    reg_all=[0.1, 0.12],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test7 = test.copy()

# Estimate a rating for each test (user, product) pair
test7['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test7['user_id'], test7['product_id'])
]

# Bound to the valid rating range, two decimals
test7['rating'] = test7['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test7.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized4.csv", index=False)
print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

In [None]:
# Experiment: 150-175 factors with learning rates around 0.007
param_grid = dict(
    n_factors=[150, 160, 175],
    lr_all=[0.006, 0.007, 0.008],
    reg_all=[0.09, 0.1],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test7 = test.copy()

# Estimate a rating for each test (user, product) pair
test7['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test7['user_id'], test7['product_id'])
]

# Bound to the valid rating range, two decimals
test7['rating'] = test7['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format.
# NOTE(review): svd_optimized4.csv is also the output path of the preceding
# experiment cell (n_factors 100/150), so that submission gets overwritten —
# confirm the intended file name.
submission = test7.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized4.csv", index=False)
print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

In [None]:
# Experiment: 175-195 factors with learning rates up to 0.01
param_grid = dict(
    n_factors=[175, 185, 195],
    lr_all=[0.008, 0.009, 0.01],
    reg_all=[0.095, 0.1],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test7 = test.copy()

# Estimate a rating for each test (user, product) pair
test7['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test7['user_id'], test7['product_id'])
]

# Bound to the valid rating range, two decimals
test7['rating'] = test7['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test7.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized5.csv", index=False)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

Best RMSE: 0.9284535318463298
Best Params: {'n_factors': 185, 'lr_all': 0.01, 'reg_all': 0.095}


In [None]:
# Experiment: fine sweep over 190-194 factors, regularisation fixed at 0.095
param_grid = dict(
    n_factors=[190, 191, 192, 193, 194],
    lr_all=[0.015, 0.016, 0.017, 0.018],
    reg_all=[0.095],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test7 = test.copy()

# Estimate a rating for each test (user, product) pair
test7['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test7['user_id'], test7['product_id'])
]

# Bound to the valid rating range, two decimals
test7['rating'] = test7['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test7.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized7.csv", index=False)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

Best RMSE: 0.9192523465540559
Best Params: {'n_factors': 193, 'lr_all': 0.018, 'reg_all': 0.095}


In [None]:
# Experiment: 180-190 factors, learning rates 0.01-0.015, regularisation 0.095
param_grid = dict(
    n_factors=[180, 185, 190],
    lr_all=[0.01, 0.012, 0.015],
    reg_all=[0.095],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every available rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test7 = test.copy()

# Estimate a rating for each test (user, product) pair
test7['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test7['user_id'], test7['product_id'])
]

# Bound to the valid rating range, two decimals
test7['rating'] = test7['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test7.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized6.csv", index=False)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

Best RMSE: 0.9214975483593362
Best Params: {'n_factors': 190, 'lr_all': 0.015, 'reg_all': 0.095}


# SVD Filtered

In [None]:
# Keep only the reviews that received at least one vote
has_votes = train['votes'] > 0
filtered_df = train[has_votes]

In [7]:
# Keep just the (user, item, rating) triples of the vote-filtered reviews
data = filtered_df.loc[:, ['user_id', 'product_id', 'rating']]

# Ratings are declared to live on a 1-5 scale
reader = Reader(rating_scale=(1, 5))

# Wrap the frame in a Surprise Dataset
surprise_data = Dataset.load_from_df(data, reader)

# Hold out 20% of the ratings for validation (Surprise's own splitter)
trainset, valset = train_test_split(surprise_data, test_size=0.2, random_state=42)

# Baseline: SVD with default hyperparameters
svd_model = SVD()
svd_model.fit(trainset)

# Score the held-out ratings; accuracy.rmse prints the RMSE as well as returning it
predictions = svd_model.test(valset)
rmse = accuracy.rmse(predictions)
print(f"Validation RMSE: {rmse:.4f}")

# Grid around the best region found on the unfiltered data
param_grid = dict(
    n_factors=[170, 185, 193],
    lr_all=[0.012, 0.015, 0.018],
    reg_all=[0.095],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

# Refit the winning configuration on every filtered rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

test4 = test.copy()

# Estimate a rating for each test (user, product) pair
test4['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test4['user_id'], test4['product_id'])
]

# Bound to the valid rating range, two decimals
test4['rating'] = test4['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test4.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized2.csv", index=False)

RMSE: 0.9593
Validation RMSE: 0.9593
Best RMSE: 0.9517115791315832
Best Params: {'n_factors': 150, 'lr_all': 0.009, 'reg_all': 0.095}


# SVD Weighted

In [None]:
# Work on a copy so `train` itself is left untouched
weighted_df = train.copy()

# Share of helpful votes, with +1 in the denominator to avoid dividing by zero
helpful_share = train['helpful_votes'] / (train['votes'] + 1)

# confidence is 1 plus that share; the rating is scaled up by it
weighted_df['confidence'] = 1 + helpful_share
weighted_df['adjusted_rating'] = weighted_df['confidence'] * weighted_df['rating']

In [12]:
# Keep the (user, item, adjusted rating) triples from the weighted frame
data = weighted_df.loc[:, ['user_id', 'product_id', 'adjusted_rating']]

# NOTE(review): adjusted_rating is rating * confidence with confidence = 1 + a
# helpful-vote ratio, so it can exceed 5 whenever that ratio is positive, while
# the Reader below still declares a 1-5 scale. This presumably explains the much
# higher RMSE reported for this section — confirm the intended scale.
reader = Reader(rating_scale=(1, 5))

# Wrap the frame in a Surprise Dataset
surprise_data = Dataset.load_from_df(data, reader)

# Hold out 20% of the ratings for validation (Surprise's own splitter)
trainset, valset = train_test_split(surprise_data, test_size=0.2, random_state=42)

# Baseline: SVD with default hyperparameters
svd_model = SVD()
svd_model.fit(trainset)

# Score the held-out ratings; accuracy.rmse prints the RMSE as well as returning it
predictions = svd_model.test(valset)
rmse = accuracy.rmse(predictions)
print(f"Validation RMSE: {rmse:.4f}")

# Candidate hyperparameter values for the weighted variant
param_grid = dict(
    n_factors=[100, 150, 193],
    lr_all=[0.012, 0.015, 0.018],
    reg_all=[0.095],
)

# 3-fold cross-validated grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(surprise_data)

print("Best RMSE:", gs.best_score['rmse'])
print("Best Params:", gs.best_params['rmse'])

# Refit the winning configuration on every weighted rating
trainset = surprise_data.build_full_trainset()
best_model = gs.best_estimator['rmse']
best_model.fit(trainset)

test5 = test.copy()

# Estimate a rating for each test (user, product) pair
test5['rating'] = [
    best_model.predict(uid, iid).est
    for uid, iid in zip(test5['user_id'], test5['product_id'])
]

# Bound to the valid rating range, two decimals
test5['rating'] = test5['rating'].clip(1, 5).round(2)

# Write the ID/rating pairs in the submission format
submission = test5.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/svd_optimized3.csv", index=False)

RMSE: 2.6053
Validation RMSE: 2.6053
Best RMSE: 2.602923130852104
Best Params: {'n_factors': 100, 'lr_all': 0.018, 'reg_all': 0.095}


# TF-IDF

In [13]:
# One row per product. `train` holds one row per review, so fitting TF-IDF on it
# directly makes every product appear once per review: its nearest neighbours are
# then mostly its own duplicate rows, and a row -> product lookup built from
# product_id only covers one row of each product.
products = train.drop_duplicates('product_id').reset_index(drop=True)

# Build TF-IDF matrix over product names (a missing name becomes an empty document)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(products['product_name'].fillna(""))

# Fit nearest neighbors model (cosine distance, exhaustive search)
nn_model = NearestNeighbors(metric='cosine', algorithm='brute')
nn_model.fit(tfidf_matrix)

# Row position in tfidf_matrix <-> product ID; a bijection because products are unique
idx_product_map = dict(enumerate(products['product_id']))
product_idx_map = {pid: idx for idx, pid in idx_product_map.items()}

# Only the 20000 most-reviewed products get a neighbour list; any other product
# falls back to the global average in fast_content_predict
top_products = train['product_id'].value_counts().head(20000).index
top_indices = [product_idx_map[pid] for pid in top_products if pid in product_idx_map]

# Filter TF-IDF matrix to just top products
tfidf_subset = tfidf_matrix[top_indices]

# Now only compute neighbors for these
distances, indices = nn_model.kneighbors(tfidf_subset)

# Precompute similar product IDs. The query product is dropped by row position,
# not by assuming it is the first neighbour, since products with identical
# names tie at distance 0.
similar_items_cache = {}
for query_idx, neighbour_row in zip(top_indices, indices):
    pid = idx_product_map[query_idx]
    similar_items_cache[pid] = [
        idx_product_map[idx] for idx in neighbour_row if idx != query_idx
    ]

# Precompute average rating per product
product_ratings = train.groupby('product_id')['rating'].mean().to_dict()
global_avg = train['rating'].mean()

def fast_content_predict(product_id):
    """Estimate a rating for ``product_id`` from its cached similar products.

    Averages the mean training rating of every cached neighbour that has one;
    when there is no such neighbour (including products absent from the cache)
    the global average rating is used. The result is rounded to 2 decimals.
    """
    neighbour_means = [
        product_ratings[pid]
        for pid in similar_items_cache.get(product_id, [])
        if pid in product_ratings
    ]
    estimate = np.mean(neighbour_means) if neighbour_means else global_avg
    return round(estimate, 2)


In [14]:
test6 = test.copy()

# Content-based estimate per product, bounded to the 1-5 rating range
test6['rating'] = test6['product_id'].apply(fast_content_predict).clip(1, 5)

# Write the ID/rating pairs in the submission format
submission = test6.loc[:, ['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/tf_idf.csv", index=False)

# XGBoost

In [6]:
# The notebook-level `train_test_split` is imported from surprise and splits a
# Surprise Dataset, so calling it with (X, y, test_size=...) fails. Import the
# scikit-learn splitter under its own name so the surprise one used by the SVD
# cells is not rebound.
from sklearn.model_selection import train_test_split as sk_train_test_split

# Encode user_id and product_id as integer codes
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

train['user_enc'] = user_encoder.fit_transform(train['user_id'])
train['product_enc'] = product_encoder.fit_transform(train['product_id'])

# Compute helpful_ratio with smoothing (+1 avoids division by zero)
train['helpful_ratio'] = train['helpful_votes'] / (train['votes'] + 1)

# Basic features
features = ['user_enc', 'product_enc', 'votes', 'helpful_votes', 'helpful_ratio']
X = train[features]
y = train['rating']

# Train/validation split (X_val / y_val are held out but not used in this cell)
X_train, X_val, y_train, y_val = sk_train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    objective='reg:squarederror',
    verbosity=0
)
model.fit(X_train, y_train)

# Prepare the test set
test['product_name'] = test['product_name'].fillna("")

# Encode using the fitted encoders. A label's code is its position in classes_,
# so a dict lookup gives the same codes as transform() without scanning
# classes_ once per row. IDs unseen in training become -1.
user_codes = {label: code for code, label in enumerate(user_encoder.classes_)}
product_codes = {label: code for code, label in enumerate(product_encoder.classes_)}
test['user_enc'] = test['user_id'].map(user_codes).fillna(-1).astype(int)
test['product_enc'] = test['product_id'].map(product_codes).fillna(-1).astype(int)

# Vote columns default to 0 when the test set does not provide them
test['votes'] = test.get('votes', 0)
test['helpful_votes'] = test.get('helpful_votes', 0)
test['helpful_ratio'] = test['helpful_votes'] / (test['votes'] + 1)

# Replace unknown encodings with medians
median_user = int(train['user_enc'].median())
median_product = int(train['product_enc'].median())
test['user_enc'] = test['user_enc'].replace(-1, median_user)
test['product_enc'] = test['product_enc'].replace(-1, median_product)

# Predict ratings, bounded to the 1-5 range with two decimals
X_test = test[features]
test['rating'] = model.predict(X_test)
test['rating'] = test['rating'].clip(1, 5).round(2)

# Export CSV
submission = test[['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/xgboost.csv", index=False)

In [23]:
# Use scikit-learn's train_test_split for the array split below. This rebinds
# the name notebook-wide: the surprise version imported at the top is shadowed
# from here on, so the SVD cells need that import re-run before being re-executed.
from sklearn.model_selection import train_test_split

# Encode user_id and product_id as integer codes
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

train['user_enc'] = user_encoder.fit_transform(train['user_id'])
train['product_enc'] = product_encoder.fit_transform(train['product_id'])

# Compute helpful_ratio with smoothing (+1 avoids division by zero)
train['helpful_ratio'] = train['helpful_votes'] / (train['votes'] + 1)

# NOTE(review): the four aggregates below are computed over all of `train`,
# including each row's own rating and the rows later held out as X_val, so
# they carry target information — confirm this is acceptable.

# Calculate the average rating a user has given across all products
user_avg_rating = train.groupby('user_id')['rating'].mean().to_dict()

# Calculate how many ratings a user has made
user_rating_count = train['user_id'].value_counts().to_dict()

# Calculate the average rating a product has received from users
product_avg_rating = train.groupby('product_id')['rating'].mean().to_dict()

# Calculate how many times a product has been rated
product_rating_count = train['product_id'].value_counts().to_dict()

# Map the aggregate data to the training dataset
train['user_avg_rating'] = train['user_id'].map(user_avg_rating)
train['user_rating_count'] = train['user_id'].map(user_rating_count)
train['product_avg_rating'] = train['product_id'].map(product_avg_rating)
train['product_rating_count'] = train['product_id'].map(product_rating_count)

# Define the new features
features = [
    'user_enc', 'product_enc',
    'votes', 'helpful_votes', 'helpful_ratio',
    'user_avg_rating', 'user_rating_count',
    'product_avg_rating', 'product_rating_count'
]
X = train[features]
y = train['rating']

# Train/validation split (X_val / y_val are held out but not used in this cell)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# define the hyperparameter range (scipy's uniform(loc, scale) samples from [loc, loc + scale])
param_range = {
    "n_estimators": randint(100, 300),
    "max_depth": randint(6, 8),
    "learning_rate": uniform(0.01, 0.2),
    "subsample": uniform(0.6, 0.4),
    "colsample_bytree": uniform(0.6, 0.4),
    "reg_alpha": uniform(0, 0.5),
    "reg_lambda": uniform(0.5, 1.5)
}

# perform the search to see which config is the best (10 sampled configs, 3-fold CV)
search = RandomizedSearchCV(
    estimator=xgb.XGBRegressor(objective='reg:squarederror', verbosity=0),
    param_distributions=param_range,
    scoring='neg_root_mean_squared_error',
    n_iter=10,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)
search.fit(X_train, y_train)
model = search.best_estimator_

# Prepare the test set
test['product_name'] = test['product_name'].fillna("")

# Encode using the fitted encoders. A label's code is its position in classes_,
# so a dict lookup gives the same codes as transform() without scanning
# classes_ once per row. IDs unseen in training become -1.
user_codes = {label: code for code, label in enumerate(user_encoder.classes_)}
product_codes = {label: code for code, label in enumerate(product_encoder.classes_)}
test['user_enc'] = test['user_id'].map(user_codes).fillna(-1).astype(int)
test['product_enc'] = test['product_id'].map(product_codes).fillna(-1).astype(int)

# Vote columns default to 0 when the test set does not provide them
test['votes'] = test.get('votes', 0)
test['helpful_votes'] = test.get('helpful_votes', 0)
test['helpful_ratio'] = test['helpful_votes'] / (test['votes'] + 1)

# Aggregates come from the training data; users/products unseen there get NaN
test['user_avg_rating'] = test['user_id'].map(user_avg_rating)
test['user_rating_count'] = test['user_id'].map(user_rating_count)
test['product_avg_rating'] = test['product_id'].map(product_avg_rating)
test['product_rating_count'] = test['product_id'].map(product_rating_count)


# Replace unknown encodings with medians
median_user = int(train['user_enc'].median())
median_product = int(train['product_enc'].median())
test['user_enc'] = test['user_enc'].replace(-1, median_user)
test['product_enc'] = test['product_enc'].replace(-1, median_product)

# Predict ratings, bounded to the 1-5 range with two decimals
X_test = test[features]
test['rating'] = model.predict(X_test)
test['rating'] = test['rating'].clip(1, 5).round(2)

# Export CSV
submission = test[['ID', 'rating']]
submission.to_csv("/content/drive/MyDrive/Semi Structured/xgboost2.csv", index=False)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
