In [1]:
from google.cloud import bigquery
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split

In [2]:
client = bigquery.Client(project="virtualization-and-cloud")

In [3]:
query = """
SELECT userId, movieId, rating
FROM `virtualization-and-cloud.movies.ratings`
"""
ratings_df = client.query(query).to_dataframe()

In [10]:
reader = Reader(rating_scale=(0.1, 5.0))

In [11]:
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

In [12]:
trainset, testset = train_test_split(data, test_size=0.2)

In [8]:
# Use SVD or SVD++
model = SVD()  # or SVDpp()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1d75a0ff2b0>

In [13]:
# Evaluate
predictions = model.test(testset)

NameError: name 'model' is not defined

In [10]:
from surprise import accuracy

print("RMSE:", accuracy.rmse(predictions))

RMSE: 0.7859
RMSE: 0.785879139334552


In [11]:
from surprise import dump

# Save the model to a file
dump.dump('svd_model.pkl', algo=model)

In [6]:
from surprise import SVD
from surprise.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_factors': [50, 100, 150],
    'n_epochs': [20, 30],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.1]
}

# Run grid search
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, n_jobs=6, joblib_verbose=2)
gs.fit(data)

# Output best RMSE and best hyperparameters
print("Best RMSE:", gs.best_score['rmse'])
print("Best params:", gs.best_params['rmse'])

# Use the best model
best_model = gs.best_estimator['rmse']
trainset = data.build_full_trainset()
best_model.fit(trainset)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed: 227.7min
[Parallel(n_jobs=6)]: Done 108 out of 108 | elapsed: 801.4min finished


Best RMSE: 0.7911706351666332
Best params: {'n_factors': 50, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.02}


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x260575d4d60>

In [9]:
import json

# Save best parameters to a JSON file
with open("best_svd_params.json", "w") as f:
    json.dump(gs.best_params['rmse'], f, indent=4)

In [7]:
trainset, testset = train_test_split(data, test_size=0.2)
best_model.fit(trainset)
predictions = best_model.test(testset)

from surprise import accuracy
print("Final RMSE on Test:", accuracy.rmse(predictions))

RMSE: 0.7823
Final RMSE on Test: 0.7822703370699458


In [8]:
import pickle

# Save model to file
with open("svd_best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[["userId", "movieId", "rating"]], reader)

In [11]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [15]:
from surprise import dump
_, loaded_model = dump.load('svd_model.pkl')

In [16]:
# You don't need to re-fit the model; it's already trained
predictions = loaded_model.test(testset)

# Calculate and print RMSE
print("RMSE on test set:", accuracy.rmse(predictions))

RMSE: 0.6999
RMSE on test set: 0.6999447550324025


In [15]:
import pickle
with open("svd_best_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [None]:
# You don't need to re-fit the model; it's already trained
predictions = loaded_model.test(testset)

# Calculate and print RMSE
print("RMSE on test set:", accuracy.rmse(predictions))

In [None]:
with open("svd_best_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)

In [21]:
all_movie_ids = ratings_df['movieId'].unique()

In [22]:
user_id = 1
user_rated = ratings_df[ratings_df['userId'] == user_id]['movieId'].values

In [23]:
movies_to_predict = [movie_id for movie_id in all_movie_ids if movie_id not in user_rated]

# Predict ratings for all unseen movies
predictions = [
    (movie_id, loaded_model.predict(user_id, movie_id).est)
    for movie_id in movies_to_predict
]

# Sort by predicted rating and take top 10
top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]

# Print or return the top 10 movie IDs and scores
print("Top 10 Recommendations for User 1:")
for movie_id, predicted_rating in top_n:
    print(f"Movie ID: {movie_id}, Predicted Rating: {predicted_rating:.2f}")

Top 10 Recommendations for User 1:
Movie ID: 1721, Predicted Rating: 5.00
Movie ID: 72641, Predicted Rating: 4.85
Movie ID: 182723, Predicted Rating: 4.85
Movie ID: 1917, Predicted Rating: 4.85
Movie ID: 217655, Predicted Rating: 4.83
Movie ID: 73881, Predicted Rating: 4.80
Movie ID: 165421, Predicted Rating: 4.78
Movie ID: 3916, Predicted Rating: 4.75
Movie ID: 215541, Predicted Rating: 4.74
Movie ID: 72998, Predicted Rating: 4.74


In [24]:
user_ids = ratings_df['userId'].unique()  # Get unique user IDs

In [25]:
recommendations = []

In [26]:
import pandas as pd

In [None]:
for user_id in user_ids:
    # Get all movie IDs (you can replace this with a list of movie IDs from your dataset)
    all_movie_ids = ratings_df['movieId'].unique()

    # Generate predictions for all movies for the current user
    predictions = [loaded_model.predict(user_id, movie_id) for movie_id in all_movie_ids]

    # Sort the predictions by predicted rating in descending order
    predictions_sorted = sorted(predictions, key=lambda x: x.est, reverse=True)

    # Select the top 100 predictions
    top_100_predictions = predictions_sorted[:100]

    # Store the recommendations in the DataFrame
    for rank, prediction in enumerate(top_100_predictions, 1):
        recommendations.append({
            'userId': user_id,
            'movieId': prediction.iid,
            'predictedRating': prediction.est,
            'rank': rank
        })

# Convert the recommendations list to a Pandas DataFrame
recommendations_df = pd.DataFrame(recommendations)

# Optional: Save to a CSV file (for review before pushing to BigQuery)
recommendations_df.to_csv('top_100_recommendations.csv', index=False)

In [5]:
all_movie_ids = ratings_df['movieId'].unique().tolist()
user_ids = ratings_df['userId'].unique().tolist()

In [6]:
def load_model():
    with open("svd_best_model.pkl", "rb") as f:
        return pickle.load(f)

In [7]:
def get_top_100_recommendations(user_id, movie_ids):
    model = load_model()  # Load model inside the process
    predictions = model.test([(user_id, movie_id, 0) for movie_id in movie_ids])
    top_100 = sorted(predictions, key=lambda x: x.est, reverse=True)[:100]
    return [
        {
            'userId': user_id,
            'movieId': pred.iid,
            'predictedRating': pred.est,
            'rank': rank
        }
        for rank, pred in enumerate(top_100, 1)
    ]

recommendations = []

In [8]:
from concurrent.futures import ProcessPoolExecutor, as_completed

In [None]:
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(get_top_100_recommendations, user_id, all_movie_ids) for user_id in user_ids]
    for future in as_completed(futures):
        recommendations.extend(future.result())

# Save to CSV
recommendations_df = pd.DataFrame(recommendations)
recommendations_df.to_csv('top_100_recommendations.csv', index=False)

In [17]:
from collections import defaultdict

def get_top_k(predictions, k=10):
    top_k = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_k[uid].append((iid, est))

    # Sort by estimated rating
    for uid, user_ratings in top_k.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_k[uid] = user_ratings[:k]

    return top_k

In [23]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # Map the predictions to each user
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()

    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items in top-k
        relevant_recommended = sum((true_r >= threshold) for (_, true_r) in top_k)
        # Total number of relevant items for this user
        total_relevant = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        if total_relevant > 0:
            precisions[uid] = relevant_recommended / k
            recalls[uid] = relevant_recommended / total_relevant
        else:
            # Skip this user in the average
            continue

    # Average across users who had at least one relevant item
    avg_precision = sum(precisions.values()) / len(precisions) if precisions else 0
    avg_recall = sum(recalls.values()) / len(recalls) if recalls else 0

    return avg_precision, avg_recall

In [None]:
prec, rec = precision_recall_at_k(predictions, k=10)

In [30]:
print(f"Precision@10: {prec:.4f}")
print(f"Recall@10: {rec:.4f}")

Precision@10: 0.5603
Recall@10: 0.7906
