# Imports

In [1]:
import json
from time import perf_counter

from firebase import firebase
import pandas as pd
import numpy as np

print("Libraries Imported")

Libraries Imported


# Read Data

In [2]:
try:
    fb_app = firebase.FirebaseApplication('https://watchthis-a7537-default-rtdb.firebaseio.com', None)
    result = fb_app.get('/ratings', None)

    with open("database_backup.json", "w") as db_file:
        json.dump(result, db_file)
    
    print("Database Successfully Read and Backed Up")
    print(f"Database Contains {len(result)} entries")
    
except:
    print("Database could not be read, using last backup")
    
    with open("database_backup.json", "w") as db_file:
        result = json.load(db_file)
        
    print(f"Backup Read - Contains {len(result)} entries")

Database Successfully Read and Backed Up
Database Contains 33 entries


# Format Data

In [3]:
ratings = []
for rating in result.values():
    row = (rating["userId"], rating["movieId"], rating["rating"], rating["timestamp"])
    ratings.append(row)

ratings_df = pd.DataFrame(columns=["userId", "movieId", "rating", "timestamp"], data=ratings)
ratings_df["rating"] = ratings_df["rating"].replace(-1, -7).replace(1, 7)

print("Data Formated - Unique Elements:")
print(ratings_df[["userId", "movieId"]].nunique())

Data Formated - Unique Elements:
userId      7
movieId    30
dtype: int64


# User and Movie Mappings

In [4]:
movie_mappings = (ratings_df['movieId']
                      .drop_duplicates()
                      .sort_values()
                      .reset_index(drop=True)
                      .reset_index()
                      .set_index("movieId")["index"]
                      .to_dict()
                 )

user_mappings = (ratings_df['userId']
                      .drop_duplicates()
                      .sort_values()
                      .reset_index(drop=True)
                      .reset_index()
                      .set_index("userId")["index"]
                      .to_dict()
                )

user_mappings_reverse = {value:key for key, value in user_mappings.items()}
movie_mappings_reverse = {value:key for key, value in movie_mappings.items()}

with open("movie_mappings.json", "w") as movie_file:
    json.dump(movie_mappings, movie_file)

with open("user_mappings.json", "w") as user_file:
    json.dump(user_mappings, user_file)

print("User and Movie Mappings Created and Saved")

User and Movie Mappings Created and Saved


# Preparing Data

In [5]:
# Apply Mappings
ratings_df["movieId"] = ratings_df["movieId"].replace(movie_mappings)
ratings_df["userId"] = ratings_df["userId"].replace(user_mappings)

# Change Type for efficiency
ratings_df["movieId"] = ratings_df["movieId"].astype("uint16")
ratings_df["userId"] = ratings_df["userId"].astype("uint16")
ratings_df["rating"] = ratings_df["rating"].astype("int8")

# Save Dataset
ratings_df.to_csv("watch_this_dataset.csv", index=False)

print("Data Ready and Saved")

Data Ready and Saved


# Pivot Matrix

In [6]:
ratings_sparse = ratings_df.pivot(index='userId', columns='movieId', values='rating')
ratings_sparse

movieId,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-7.0,,7.0,,,,,,,,...,,,,,7.0,,,,,
1,,,,,,,,7.0,,,...,,,,,,,,,,
2,,,,,7.0,,,,,,...,,,,,,,,,,
3,,,,,,,,,7.0,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,-7.0,7.0,,,,-7.0,,,,-7.0,...,,7.0,,-7.0,,7.0,,,7.0,7.0
6,,,,7.0,,,-7.0,,,,...,-7.0,,7.0,,,,7.0,7.0,,


In [7]:
print(ratings_df.info(memory_usage="deep"))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     33 non-null     uint16
 1   movieId    33 non-null     uint16
 2   rating     33 non-null     int8  
 3   timestamp  33 non-null     int64 
dtypes: int64(1), int8(1), uint16(2)
memory usage: 557.0 bytes
None


In [8]:
def stochastic_gradient_descent_l2(X_, alpha=0.01, lambda_=0.14, num_epochs=50, mini_batch_size=1, r=None, seed=42):
    
    # Pandas DataFrame to NumPy
    X = X_.to_numpy()

    if r is None:
        r = X.shape[1]
    
    np.random.seed(seed)
    
    # Initialize p and q
    variance = 0.1    
    p = np.random.normal(0, variance, (X.shape[0], r))
    q = np.random.normal(0, variance, (X.shape[1], r))
    
    train_rmses = []
    
    # Iterate only over non-nan indexes
    non_nan_indexes = list(zip(*np.where(~np.isnan(X))))
    
    if mini_batch_size == 0:
        mini_batch_size = len(non_nan_indexes) - 1
    
    for _ in range(num_epochs):
        
        # Stochastic part
        np.random.shuffle(non_nan_indexes)
        
        # Save p and q
        p_new = p.copy()
        q_new = q.copy()
        
        for index, (u, i) in enumerate(non_nan_indexes):
            
            # Update p and q every mini_batch_size iterations
            if index % mini_batch_size == 0:
                p = p_new.copy()
                q = q_new.copy()
            
            # Update p and q new with L2 regularization derivatives
            error = X[u][i] - np.dot(p[u], q[i]) 
            
            p_new[u] += alpha * (error * q[i] - lambda_ * (2 * p[u])) / mini_batch_size
            q_new[i] += alpha * (error * p[u] - lambda_ * (2 * q[i])) / mini_batch_size
        
        # Calculate RSME
        rmse = np.sqrt(np.nanmean((X - p @ q.T) ** 2))
        train_rmses.append(rmse)
        
    return p, q, train_rmses

In [9]:
def calc_rmse(train, test, p, q):
    errors = []
    
    X_approx = p @ q.T
    
    for _, (user_, movie_, rating) in test[["userId", "movieId", "rating"]].iterrows():
        user_index = np.argmax(train.index == user_)
        movie_index = np.argmax(train.columns == movie_)
        error = rating - X_approx[user_index, movie_index]
        errors.append(error)

    errors = np.array(errors)
    return np.sqrt(np.mean(errors ** 2))

# Training on Data

In [10]:
print("Fitting the Model")

start_time = perf_counter()
p, q, rmse_train = stochastic_gradient_descent_l2(ratings_sparse)
elapsed_time = perf_counter() - start_time

print(f"Model Fit successfully - RMSE on Data: {rmse_train[-1]:.2f} - Fitting Time: {elapsed_time:.2f}s")

with open("p_vector.json", "w") as p_file:
    json.dump(p.tolist(), p_file)

with open("q_vector.json", "w") as q_file:
    json.dump(q.tolist(), q_file)

print("Parameters Saved")

Fitting the Model
Model Fit successfully - RMSE on Data: 0.46 - Fitting Time: 0.16s
Parameters Saved


# Predictions

In [11]:
X_approx = p @ q.T
predictions = pd.DataFrame(X_approx)

predictions_replaced = predictions.rename(user_mappings_reverse).rename(movie_mappings_reverse, axis=1)
predictions_replaced

Unnamed: 0,tt0117951,tt0119217,tt0120689,tt0120815,tt0133093,tt0167261,tt0172495,tt0246578,tt0266697,tt0378194,...,tt1392170,tt1392190,tt1431045,tt1504320,tt1663202,tt1825683,tt2024544,tt2084970,tt2278388,tt4154796
big_weevle99,-6.781451,3.654665,6.659073,1.207573,-0.788283,-3.303112,-0.80233,-0.094382,-0.019336,-3.069736,...,-0.845201,3.577608,0.095467,-3.724834,6.634191,3.783735,1.019347,0.742881,3.592005,3.362712
fine_judge76,0.381039,-0.272711,-0.152748,-1.34722,-1.215995,0.625873,1.287446,6.197365,2.012823,0.267797,...,1.054041,-0.35426,-1.430988,0.115042,0.291044,-0.049458,-1.72987,-1.130907,-0.291964,-0.526516
fine_young76,0.614642,-0.979773,-0.747382,0.764408,6.252815,0.535771,-0.890267,-1.232333,-2.146984,0.905596,...,-0.770385,-0.882749,0.846056,0.855848,-0.444456,-0.727735,0.784203,0.96838,-1.114711,-0.875436
glad_berry17,-0.36457,0.868326,-0.08851,0.111129,-2.133922,-0.59964,0.356629,1.952417,6.449446,-0.802182,...,0.244005,1.009081,-0.022552,-0.803126,0.00645,0.627701,-0.221422,-0.104537,0.835572,0.747439
glad_donny85,0.145133,-1.099378,0.536445,-0.296638,1.716453,0.991851,0.029454,1.238219,-1.159608,1.10021,...,-0.094787,-1.217877,-0.239197,1.124977,0.655249,-1.294539,-0.263662,-0.03796,-1.161559,-1.279485
main_sharp88,-6.598948,6.581253,3.995693,1.729285,-1.660388,-6.583263,-1.618338,-0.540202,1.352832,-6.57749,...,-1.484073,6.516721,1.089904,-6.587427,4.011464,6.587648,2.015406,1.808816,6.588818,6.584878
slow_fagin83,-1.38263,0.922095,0.848314,6.600199,1.610711,-1.915275,-6.625682,-2.401843,-0.357885,-1.24388,...,-6.626947,1.983336,6.597274,-1.592672,0.489992,1.427508,6.624387,6.622307,1.33436,1.873329


# Recommendations

In [12]:
print("Computing Best Recommendations for Every User")
start_time = perf_counter()

recommendations = {}

for user, movies in predictions_replaced.iterrows():
    watched_movies = set(ratings_df[ratings_df["userId"] == user]["movieId"])
    
    predicted_recommendations = np.argsort(movies)[::-1]
    
    user_recommendations = predicted_recommendations[~predicted_recommendations.isin(watched_movies)]
    
    recommendations[user] = list(user_recommendations[:10].replace(movie_mappings_reverse))

elapsed_time = perf_counter() - start_time
print(f"Recommendations Computed - Computation Time: {elapsed_time:.3f}s")

Computing Best Recommendations for Every User
Recommendations Computed - Computation Time: 0.042s


In [14]:
with open("recommendations.json", "w") as recommendations_file:
    json.dump(recommendations, recommendations_file)

print("Recommendations Saved")

recommendations

Recommendations Saved


{'big_weevle99': ['tt0120689',
  'tt1663202',
  'tt0800369',
  'tt1825683',
  'tt0119217',
  'tt2278388',
  'tt1392190',
  'tt0416449',
  'tt0816692',
  'tt4154796'],
 'fine_judge76': ['tt0246578',
  'tt0266697',
  'tt0172495',
  'tt0382932',
  'tt1345836',
  'tt1392170',
  'tt1205489',
  'tt0167261',
  'tt1010048',
  'tt0117951'],
 'fine_young76': ['tt0133093',
  'tt1228705',
  'tt0382932',
  'tt2084970',
  'tt0378194',
  'tt1504320',
  'tt1431045',
  'tt1010048',
  'tt2024544',
  'tt0120815'],
 'glad_berry17': ['tt0266697',
  'tt0246578',
  'tt1392190',
  'tt0119217',
  'tt0468569',
  'tt0816692',
  'tt2278388',
  'tt0499549',
  'tt4154796',
  'tt0800369'],
 'glad_donny85': ['tt0382932',
  'tt0133093',
  'tt0246578',
  'tt1228705',
  'tt1504320',
  'tt0378194',
  'tt1010048',
  'tt0167261',
  'tt1205489',
  'tt1663202'],
 'main_sharp88': ['tt0416449',
  'tt2278388',
  'tt1825683',
  'tt0468569',
  'tt0499549',
  'tt4154796',
  'tt0119217',
  'tt0816692',
  'tt0800369',
  'tt1392190']

# Suggestions

In [15]:
print("Computing Suggestion to Rate for Every User")
start_time = perf_counter()

suggestions = {}

for user, movies in predictions_replaced.iterrows():
    watched_movies = set(ratings_df[ratings_df["userId"] == user]["movieId"])
    
    predicted_suggestions = np.argsort(np.abs(movies))
    
    user_suggestions = predicted_suggestions[~predicted_suggestions.isin(watched_movies)]
    
    suggestions[user] = list(user_suggestions[:10].replace(movie_mappings_reverse))

elapsed_time = perf_counter() - start_time
print(f"Suggestions Computed - Computation Time: {elapsed_time:.3f}s")

Computing Suggestion to Rate for Every User
Suggestions Computed - Computation Time: 0.052s


In [16]:
with open("suggestions.json", "w") as suggestions_file:
    json.dump(suggestions, suggestions_file)

print("Suggestions Saved")

suggestions

Suggestions Saved


{'big_weevle99': ['tt0266697',
  'tt0246578',
  'tt1431045',
  'tt2084970',
  'tt0133093',
  'tt0382932',
  'tt0172495',
  'tt1392170',
  'tt1345836',
  'tt2024544'],
 'fine_judge76': ['tt1825683',
  'tt1504320',
  'tt0468569',
  'tt0120689',
  'tt0378194',
  'tt0119217',
  'tt1663202',
  'tt2278388',
  'tt1392190',
  'tt0117951'],
 'fine_young76': ['tt1205489',
  'tt1663202',
  'tt0167261',
  'tt0117951',
  'tt1345836',
  'tt1825683',
  'tt0120689',
  'tt0120815',
  'tt1392170',
  'tt2024544'],
 'glad_berry17': ['tt1663202',
  'tt1431045',
  'tt1345836',
  'tt0120689',
  'tt2084970',
  'tt0120815',
  'tt2024544',
  'tt1392170',
  'tt0172495',
  'tt0117951'],
 'glad_donny85': ['tt0172495',
  'tt2084970',
  'tt1392170',
  'tt0117951',
  'tt1345836',
  'tt1431045',
  'tt2024544',
  'tt0120815',
  'tt0120689',
  'tt1663202'],
 'main_sharp88': ['tt0246578',
  'tt1431045',
  'tt0266697',
  'tt1392170',
  'tt0172495',
  'tt0133093',
  'tt0120815',
  'tt2084970',
  'tt0382932',
  'tt2024544']