# Iterative SVD

We first install the necessary packages.

In [None]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357289 sha256=ef4c2369be5278197dede32066538c59c0f3002f90ebde73e7c283223146953d
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [None]:
import pandas as pd
import numpy as np

## Temporal Splitting on Newest Data

We split the data temporally. In other words, we first find the 180000 most recent ratings. We then reserve the newest 20000 of the 180000 ratings as the test set. Among the remaining 160000, we randomly split them into a training set of 80000 that is to be used for each of the individual models, and another training set of 80000 that is to be used for the ensemble model.

In [None]:
full = pd.read_csv('test.csv').drop(columns=['Unnamed: 0'])[-180000:] # despite its name, test.csv actually contains the 1000000 newest ratings sorted by rating date from the oldest to the newest
full # full contains the 180000 newest ratings

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
20000,17147,427460,4,2005-12-29,1996,She's the One
20001,4353,1605111,5,2005-12-29,2002,Curb Your Enthusiasm: Season 3
20002,3923,13949,3,2005-12-29,1997,Beverly Hills Ninja
20003,6874,200779,3,2005-12-29,2003,The Cooler
20004,12582,851475,4,2005-12-29,2003,Mystic River
...,...,...,...,...,...,...
199995,8993,2183787,4,2005-12-31,2005,Family Guy Presents: Stewie Griffin: The Untol...
199996,7430,258170,4,2005-12-31,2001,Six Feet Under: Season 1
199997,8467,1534359,5,2005-12-31,1996,Eraser
199998,10168,2543295,2,2005-12-31,2003,The League of Extraordinary Gentlemen


In [None]:
train_full = full[:160000] # the 160000 ratings used in training
train_full

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
20000,17147,427460,4,2005-12-29,1996,She's the One
20001,4353,1605111,5,2005-12-29,2002,Curb Your Enthusiasm: Season 3
20002,3923,13949,3,2005-12-29,1997,Beverly Hills Ninja
20003,6874,200779,3,2005-12-29,2003,The Cooler
20004,12582,851475,4,2005-12-29,2003,Mystic River
...,...,...,...,...,...,...
179995,13629,1318034,4,2005-12-31,1951,Alice in Wonderland
179996,17324,1719503,4,2005-12-31,2005,Hitch
179997,17324,22846,5,2005-12-31,2005,Hitch
179998,3860,1799620,2,2005-12-31,2003,Bruce Almighty


In [None]:
test = full[-20000:] # the 20000 ratings used for testing
test

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
180000,16445,1250138,2,2005-12-31,2003,"House of 1,000 Corpses"
180001,6850,714682,1,2005-12-31,2005,Lords of Dogtown
180002,3441,859907,2,2005-12-31,2005,Kicking & Screaming
180003,10748,2373473,3,2005-12-31,1987,Hamburger Hill
180004,5496,1678873,2,2005-12-31,2004,"I, Robot"
...,...,...,...,...,...,...
199995,8993,2183787,4,2005-12-31,2005,Family Guy Presents: Stewie Griffin: The Untol...
199996,7430,258170,4,2005-12-31,2001,Six Feet Under: Season 1
199997,8467,1534359,5,2005-12-31,1996,Eraser
199998,10168,2543295,2,2005-12-31,2003,The League of Extraordinary Gentlemen


In [None]:
from sklearn.model_selection import train_test_split

# Randomly split the 160000 training data into a set used for individual models and another set used for the ensemble model
train, ensemble_train = train_test_split(train_full, test_size=0.5, random_state=42)
print(len(train))
print(len(ensemble_train))

80000
80000


In [None]:
train # "train" is used for training each of the individual models

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
72231,15582,1386463,3,2005-12-29,2002,Sweet Home Alabama
75364,9087,1380250,3,2005-12-29,1992,Hero
53303,16793,2188505,3,2005-12-29,1973,The Exorcist
47730,5762,1109774,2,2005-12-29,2000,Almost Famous
136959,10072,1067658,3,2005-12-30,2004,Prime Suspect 6
...,...,...,...,...,...,...
139879,2457,71480,2,2005-12-30,2004,A Cinderella Story
123694,11089,2300374,4,2005-12-30,2001,"Monsters, Inc."
151932,6134,2610903,4,2005-12-30,2004,Collateral
166867,5762,1784540,4,2005-12-31,2000,Almost Famous


In [None]:
ensemble_train # "ensemble_train" is used for training the ensemble model

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
140476,17574,2475007,5,2005-12-30,1996,Eye for an Eye
52693,14312,583131,4,2005-12-29,1993,Jurassic Park
99958,10747,1795937,4,2005-12-30,1987,Can't Buy Me Love
96366,4306,617075,4,2005-12-30,1999,The Sixth Sense
102343,4302,1999688,5,2005-12-30,1982,An Officer and a Gentleman
...,...,...,...,...,...,...
141665,191,1965579,5,2005-12-30,2003,X2: X-Men United
142463,1104,1498526,3,2005-12-30,1983,Krull
68961,2153,1916203,3,2005-12-29,1993,Free Willy
151628,15788,1988882,3,2005-12-30,2003,Matchstick Men


In [None]:
len(train['movie_id'].unique()) # number of movies in training set for individual models

8070

In [None]:
len(train['user_id'].unique()) # number of users in training set for individual models

16062

#### Further Split of Train and Validation Sets

We use the validation set approach to select the hyperparameters for each of the individual models. In this case, we further divide the training set for the individual models into a non-validation set of size 64000 (used for training in the validation set approach) and a validation set of size 16000 (used for testing in the validation set approach).

In [None]:
non_validation = train[:64000]
non_validation # used for training when selecting hyperparameters

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
72231,15582,1386463,3,2005-12-29,2002,Sweet Home Alabama
75364,9087,1380250,3,2005-12-29,1992,Hero
53303,16793,2188505,3,2005-12-29,1973,The Exorcist
47730,5762,1109774,2,2005-12-29,2000,Almost Famous
136959,10072,1067658,3,2005-12-30,2004,Prime Suspect 6
...,...,...,...,...,...,...
30015,7879,40960,3,2005-12-29,2002,Super Troopers
80369,6274,240400,3,2005-12-30,1990,The Hunt for Red October
55706,16796,2026731,5,2005-12-29,2001,Brian's Song
45223,357,649154,4,2005-12-29,2003,House of Sand and Fog


In [None]:
validation = train[64000:]
validation # used for testing when selecting hyperparameters

Unnamed: 0,movie_id,user_id,rating,date,Year,Title
31584,6833,793778,5,2005-12-29,1995,Billy Madison
161832,16784,1663055,3,2005-12-31,2005,The Sisterhood of the Traveling Pants
63677,5496,2397099,4,2005-12-29,2004,"I, Robot"
70478,10906,1208781,2,2005-12-29,2004,Cellular
148826,17154,1489446,5,2005-12-30,1993,Philadelphia
...,...,...,...,...,...,...
139879,2457,71480,2,2005-12-30,2004,A Cinderella Story
123694,11089,2300374,4,2005-12-30,2001,"Monsters, Inc."
151932,6134,2610903,4,2005-12-30,2004,Collateral
166867,5762,1784540,4,2005-12-31,2000,Almost Famous


## compute_rmse Function

We define a compute_rmse function that takes in the predicted output and actual output, so that we can conveniently compute the RMSE of our predictions.

In [None]:
# Takes in two NumPy arrays
def compute_rmse(predicted, actual):
    return np.sqrt(np.mean((predicted - actual) * (predicted - actual)))

## Model Training and Testing

We implement the Iterative SVD algorithm in the IterativeSVD class below.

In [None]:
from scipy.sparse import lil_matrix, csr_matrix
from scipy.sparse.linalg import svds

class IterativeSVD:
    def __init__(self, k=10, num_epochs=10):
        # Initialize instance variables
        self.k = k
        self.num_epochs = num_epochs
        self.R = None
        self.global_mean = 0
        self.user_map = None
        self.movie_map = None
        self.user_means = []
        self.movie_means = []

    def fit(self, X, y):
        # Map user_id and movie_id to indices
        users = X['user_id'].unique()
        movies = X['movie_id'].unique()
        user_map = {user_id: idx for idx, user_id in enumerate(users)}
        movie_map = {movie_id: idx for idx, movie_id in enumerate(movies)}
        self.user_map = user_map
        self.movie_map = movie_map
        self.global_mean = np.mean(y)

        for u in users:
            self.user_means.append(np.mean(y[X['user_id'] == u]))
        for m in movies:
            self.movie_means.append(np.mean(y[X['movie_id'] == m]))
        self.user_means = np.array(self.user_means)
        self.movie_means = np.array(self.movie_means)

        # Create the sparse ratings matrix to save memory
        num_users = len(users)
        num_movies = len(movies)
        R = lil_matrix((num_users, num_movies))
        for (user, movie, rating) in zip(X['user_id'], X['movie_id'], y):
            R[user_map[user], movie_map[movie]] = rating - 0.5 * self.user_means[user_map[user]] - 0.5 * self.movie_means[movie_map[movie]]
        R = R.tocsr()

        # Main loop
        for epoch in range(self.num_epochs):
            # Perform SVD
            U, sigma, VT = svds(R, k=self.k)
            sigma = np.diag(sigma)

            # Reconstruct sparse matrix
            R_reconstructed = csr_matrix(U) @ csr_matrix(sigma) @ csr_matrix(VT)

            # Update the non-zero entries
            mask_non_zero = R != 0
            mask_zero = R == 0
            R = R.multiply(mask_non_zero) + R_reconstructed.multiply(mask_zero)
            self.R = R

            # Logging curent progress
            print(f"Epoch {epoch + 1} completed.")


    def predict(self, X):
        # Get the corresponding index for each user id and each movie_id
        user_indices = np.array([self.user_map.get(u, -1) for u in X['user_id']])
        item_indices = np.array([self.movie_map.get(i, -1) for i in X['movie_id']])

        # Initialize predictions
        predictions = np.full(len(X), self.global_mean)  # Set global mean as default prediction value
        valid_mask = (user_indices != -1) & (item_indices != -1)

        # Loop to predict each data point
        for idx in np.where(valid_mask)[0]:
            user_idx = user_indices[idx]
            item_idx = item_indices[idx]
            predictions[idx] = (
                self.R[user_idx, item_idx]
                + 0.5 * self.user_means[user_idx]
                + 0.5 * self.movie_means[item_idx]
            )

        return predictions


    def get_full_predictions(self):
        return self.R.toarray() + self.global_mean


Below is the code to select the best value of K. We did not run this piece of code due to limited computing power.

In [None]:
'''best_rmse = np.inf
best_params = None

# Candidate hyperparameter values
K_list = [10, 20, 100]

# Main loop
for K in K_list:
  model = IterativeSVD(K, 3)
  model.fit(non_validation.drop(columns=['rating']), non_validation['rating'])
  rmse = compute_rmse(validation['rating'].to_numpy(), model.predict(validation))
  if (rmse < best_rmse):
    best_rmse = rmse
    best_params = [K]'''

We train an Iterative SVD model with $K=2$ and 3 epochs, and then evaluate its test RMSE.

In [None]:
# Model training
iter_svd_100000 = IterativeSVD(2,3)
iter_svd_100000.fit(train[['movie_id', 'user_id']], train['rating'])

78428


  iter_svd_100000.fit(train[['movie_id', 'user_id']], train['rating'])


Epoch 1 completed.
120243000
Epoch 2 completed.
120243000
Epoch 3 completed.
120243000


In [None]:
# Evaluate the test RMSE
compute_rmse(test['rating'].to_numpy(), iter_svd_100000.predict(test))

0.9649977299206953

## Data Preparation for the Ensemble Model

Next, we prepare a .csv file of predicted values from ensemble_train by Iterative SVD, which will be used as part of the training input for the ensemble model.

In [None]:
predictions_iter_svd_train = iter_svd_100000.predict(ensemble_train)
predictions_iter_svd_train

array([3.64941684, 4.44109747, 3.55769938, ..., 3.79766259, 3.37804541,
       3.96089828])

In [None]:
ensemble_train['rating']

Unnamed: 0,rating
140476,5
52693,4
99958,4
96366,4
102343,5
...,...
141665,5
142463,3
68961,3
151628,3


In [None]:
predictions_df_train_iter_svd = pd.DataFrame(predictions_iter_svd_train, columns=['PredictedRatingsIterSVD'])
predictions_df_train_iter_svd

Unnamed: 0,PredictedRatingsIterSVD
0,3.649417
1,4.441097
2,3.557699
3,4.287699
4,4.336262
...,...
79995,3.888325
79996,3.145347
79997,3.797663
79998,3.378045


In [None]:
predictions_df_train_iter_svd.to_csv('iter_svd_train_predicted.csv')

Then, we prepare a .csv file of predicted values from the test set by Iterative SVD, which will be used as part of the testing input for the ensemble model.

In [None]:
predictions_iter_svd_test = iter_svd_100000.predict(test)
predictions_iter_svd_test

array([2.85359464, 3.37018092, 3.676725  , ..., 4.1829002 , 2.99296803,
       3.90000012])

In [None]:
predictions_df_test_iter_svd = pd.DataFrame(predictions_iter_svd_test, columns=['PredictedRatingsIterSVD'])
predictions_df_test_iter_svd

Unnamed: 0,PredictedRatingsIterSVD
0,2.853595
1,3.370181
2,3.676725
3,3.332523
4,3.670148
...,...
19995,4.035606
19996,3.676725
19997,4.182900
19998,2.992968


In [None]:
predictions_df_test_iter_svd.to_csv('iter_svd_test_predicted.csv')