In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import random


In [2]:
movie_df = pd.read_csv('./df_movies_final.csv')
ratings_df = pd.read_csv('./ratings_without_timestamp.csv')

In [3]:
AllRatings = ratings_df

In [17]:
AllRatings_sub_df = AllRatings[AllRatings['userId'] <= 5000]
AllRatings_sub = AllRatings[AllRatings['userId'] <= 5000]
AllRatings_sub

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
733150,5000,1073,2.0
733151,5000,1210,3.0
733152,5000,1356,3.0
733153,5000,1393,4.0


In [18]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [19]:
# ratings_size = AllRatings
AllRatings_sub = AllRatings_sub.sample(frac=1, random_state=42).reset_index(drop=True)
AllRatings_sub = AllRatings_sub.to_numpy()
ratingsTrain = AllRatings_sub[:int(len(AllRatings_sub)*0.6)]
ratingsValid = AllRatings_sub[int(len(AllRatings_sub)*0.6):int(len(AllRatings_sub)*0.8)]
ratingsTest = AllRatings_sub[int(len(AllRatings_sub)*0.8):]

usersPerItem = defaultdict(set)
itemsPerUser = defaultdict(set)
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
ratingOnly = [] # to calculate global average
ratingDict = {}
userID, itemID = set(), set()

for user, item, rating in ratingsTrain:
    user, item, rating = str(int(user)), str(int(item)), rating
    userID.add(user)
    itemID.add(item)

    ratingsPerUser[user].append((item,rating))
    ratingsPerItem[item].append((user,rating))
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    ratingDict[(user,item)] = rating
    ratingOnly.append(rating)



In [None]:
# Grace version Latent Factor Models only bias

def preprocess_data(ratingsTrain):
    user_ratings = defaultdict(list)
    item_ratings = defaultdict(list)
    all_ratings = []
    for u, i, r in ratingsTrain:
        rating = int(r)
        user_ratings[u].append((i, rating))
        item_ratings[i].append((u, rating))
        all_ratings.append(rating)
    return user_ratings, item_ratings, all_ratings

def compute_biases(user_ratings, item_ratings, global_average, lamb):
    """Compute user and item biases using gradient descent."""
    user_bias = {u: 0 for u in user_ratings}
    item_bias = {i: 0 for i in item_ratings}
    
    # Gradient descent for biases
    for iteration in range(50):  # Fixed number of iterations
        # Update user biases
        for u in user_ratings:
            user_bias[u] = sum(r - (global_average + item_bias[i]) for i, r in user_ratings[u]) / (
                lamb + len(user_ratings[u])
            )
        # Update item biases
        for i in item_ratings:
            item_bias[i] = sum(r - (global_average + user_bias[u]) for u, r in item_ratings[i]) / (
                lamb + len(item_ratings[i])
            )
    return user_bias, item_bias

def rating_prediction(ratingsValid, lamb=5.0):
    """Predict ratings using user and item biases."""
    user_ratings, item_ratings, all_ratings = preprocess_data(ratingsTrain)
    global_average = np.mean(all_ratings)
    user_bias, item_bias = compute_biases(user_ratings, item_ratings, global_average, lamb)

    pred_rating = []
    true_rating = []
    for u, i, r in ratingsValid:
        p = global_average + user_bias.get(u, 0) + item_bias.get(i, 0)
        true_rating.append(r)
        pred_rating.append(p)
    mse = MSE(pred_rating, true_rating)
    print(mse)

rating_prediction(ratingsValid)



0.7738235360013701


In [None]:
# Kris's Version Latent Factor Models with bias+gamma

lambda_reg = 0.2
learning_rate = 0.01
n_epochs = 40
K = 5

best_mse = float('inf')
patience = 5
patience_counter = 0

# 
ratings_train = [rating for _, _, rating in ratingsTrain]
alpha = np.mean(ratings_train)

user_biases = defaultdict(float)
item_biases = defaultdict(float)

userGamma = defaultdict(lambda: [0.0] * K, {user: [random.random() * 0.1 - 0.05 for _ in range(K)] for user in ratingsPerUser})
itemGamma = defaultdict(lambda: [0.0] * K, {item: [random.random() * 0.1 - 0.05 for _ in range(K)] for item in ratingsPerItem})

for epoch in range(n_epochs):
    if epoch < 30:
        learning_rate = 0.01
    else:
        learning_rate = 0.001

    for user, item, rating in ratingsTrain:
        if user not in userGamma:
            userGamma[user] = [random.random() * 0.1 - 0.05 for _ in range(K)]
        if item not in itemGamma:
            itemGamma[item] = [random.random() * 0.1 - 0.05 for _ in range(K)]
        
        prediction = alpha + user_biases.get(user, 0) + item_biases.get(item, 0) + inner(userGamma[user], itemGamma[item])
        error = rating - prediction

        user_biases[user] += learning_rate * (error - lambda_reg * user_biases[user])
        item_biases[item] += learning_rate * (error - lambda_reg * item_biases[item])
        
        for k in range(K):
            userGamma[user][k] += learning_rate * (error * itemGamma[item][k] - lambda_reg * userGamma[user][k])
            itemGamma[item][k] += learning_rate * (error * userGamma[user][k] - lambda_reg * itemGamma[item][k])

    squared_errors = []
    for user, item, rating in ratingsValid:
        prediction = alpha + user_biases.get(user, 0) + item_biases.get(item, 0) + inner(userGamma.get(user, [0] * K), itemGamma.get(item, [0] * K))
        squared_errors.append((rating - prediction) ** 2)
    mse = np.mean(squared_errors)
    print(f"Epoch {epoch+1}/{n_epochs}, Validation MSE: {mse}")

    if mse < best_mse:
        best_mse = mse
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping...")
            break


Epoch 1/40, Validation MSE: 0.8095351285851702
Epoch 2/40, Validation MSE: 0.7833409579031452
Epoch 3/40, Validation MSE: 0.7724539950360055
Epoch 4/40, Validation MSE: 0.7665107008245938
Epoch 5/40, Validation MSE: 0.7628291333981808
Epoch 6/40, Validation MSE: 0.7603709818138574
Epoch 7/40, Validation MSE: 0.7586445599239138
Epoch 8/40, Validation MSE: 0.7573868074140805
Epoch 9/40, Validation MSE: 0.7564447957696249
Epoch 10/40, Validation MSE: 0.7557240183371043
Epoch 11/40, Validation MSE: 0.7551632406432471
Epoch 12/40, Validation MSE: 0.754721250169726
Epoch 13/40, Validation MSE: 0.7543694269527974
Epoch 14/40, Validation MSE: 0.754087363452561
Epoch 15/40, Validation MSE: 0.7538601730522678
Epoch 16/40, Validation MSE: 0.7536767778727188
Epoch 17/40, Validation MSE: 0.7535287870842289
Epoch 18/40, Validation MSE: 0.7534097432748597
Epoch 19/40, Validation MSE: 0.7533146048545979
Epoch 20/40, Validation MSE: 0.7532393836107224
Epoch 21/40, Validation MSE: 0.7531808864705776
Epo

In [20]:
data_merge = AllRatings_sub_df.merge(movie_df, on='movieId')
data_merge

Unnamed: 0,userId,movieId,rating,title,year,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,296,5.0,Pulp Fiction,1994,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,306,3.5,Three Colors: Red (Trois couleurs: Rouge),1994,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,307,5.0,Three Colors: Blue (Trois couleurs: Bleu),1993,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,665,5.0,Underground,1995,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,1,899,3.5,Singin' in the Rain,1952,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
733150,5000,1073,2.0,Willy Wonka & the Chocolate Factory,1971,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
733151,5000,1210,3.0,Star Wars: Episode VI - Return of the Jedi,1983,0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,0
733152,5000,1356,3.0,Star Trek: First Contact,1996,0,1,1,0,0,...,0,0,0,0,0,0,1,1,0,0
733153,5000,1393,4.0,Jerry Maguire,1996,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

data_merge['user_avg_rating'] = data_merge.groupby('userId')['rating'].transform('mean')
data_merge['movie_avg_rating'] = data_merge.groupby('movieId')['rating'].transform('mean')

X = data_merge.drop(['rating', 'title'], axis=1)

label_encoder = LabelEncoder()
X['year'] = label_encoder.fit_transform(X['year'])


y = data_merge['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from sklearn.neural_network import MLPRegressor
model = MLPRegressor(hidden_layer_sizes=(128, 64), activation='relu', max_iter=500)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = MSE(y_pred, y_test)
mse

1.0546463678174751

0.8247223657009919

In [22]:
import lightgbm as lgb

model = lgb.LGBMRegressor(learning_rate=0.1, n_estimators=500, max_depth=10)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = MSE(y_pred, y_test)
mse

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1166
[LightGBM] [Info] Number of data points in the train set: 586524, number of used features: 25
[LightGBM] [Info] Start training from score 3.542564


0.6846602443344569

5000 user
with year column mse = 0.6846602443344569
without year column mse = 0.6879536726867219