In [42]:
import sys
import surprise

import pandas as pd
from utils.timer import Timer
from datasets.python_splitters import python_random_split
from evaluation.python_evaluation import (
    rmse,
    mae,
    rsquared,
    exp_var,
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
    get_top_k_items,
)
from models.surprise_utils import (
    predict,
    compute_ranking_predictions,
)
from utils.notebook_utils import store_metadata


print(f"System version: {sys.version}")
print(f"Surprise version: {surprise.__version__}")

System version: 3.11.5 (v3.11.5:cce6ba91b3, Aug 24 2023, 10:50:31) [Clang 13.0.0 (clang-1300.0.29.30)]
Surprise version: 1.1.3


# Explanation
The Singular Value Decomposition (SVD) algorithm used in the Surprise library is a matrix factorization technique commonly employed in recommendation systems. Here's a quick summary of how it works:

- Matrix Factorization: SVD is a technique from linear algebra used to decompose a matrix into three separate matrices. In the context of recommendation systems, it's used to decompose the user-item interaction matrix into two lower-dimensional matrices: one representing users and their latent factors and the other representing items and their latent factors.

- Latent Factors: The idea behind SVD is to represent users and items in a lower-dimensional latent space, where each dimension captures some underlying characteristics or preferences. For example, in a movie recommendation system, latent factors might represent genres, directors, or actors.

- Optimization: The decomposition process aims to find the best approximation of the original matrix by minimizing the error between the predicted ratings and the actual ratings in the training dataset. This optimization is typically achieved using techniques like stochastic gradient descent.

- Prediction: Once the model is trained, it can predict the rating a user would give to an item by taking the dot product of the user's latent factors and the item's latent factors.

- Recommendation: Recommendations can be generated by ranking the predicted ratings for unseen items and recommending the top-ranked items to users.

SVD and its variants have been popular in recommendation systems due to their effectiveness in capturing complex user-item interactions and providing accurate recommendations. The Surprise library provides an easy-to-use interface for implementing SVD and other collaborative filtering algorithms for recommendation tasks.

In [43]:
# Top k items to recommend
TOP_K = 10

In [44]:
# Specify the path to your CSV file
csv_file_path = "/Users/marianareyes/Desktop/ie_tower/chatbots-2/chatlib/datasets/ratings_final.csv"

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(csv_file_path, usecols=['userID', 'itemID', 'rating'])

data.head()

Unnamed: 0,userID,itemID,rating
0,19,2151,3
1,42,1583,3
2,8,1590,4
3,47,974,4
4,13,619,1


In [45]:
train, test = python_random_split(data, 0.75)

In [46]:
train_set = surprise.Dataset.load_from_df(
    train, reader=surprise.Reader("ml-100k")
).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x12b28db50>

In [47]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

with Timer() as train_time:
    svd.fit(train_set)

print(f"Took {train_time.interval} seconds for training.")

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Took 0.04484105599112809 seconds for training.


In [48]:
predictions = predict(svd, test, usercol="userID", itemcol="itemID")
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1,1583,2.96887
1,20,2019,2.974477
2,43,871,2.826589
3,28,942,2.864249
4,40,1154,2.90683


In [49]:
with Timer() as test_time:
    all_predictions = compute_ranking_predictions(
        svd, train, usercol="userID", itemcol="itemID", remove_seen=True
    )

print(f"Took {test_time.interval} seconds for prediction.")

Took 2.0243811139953323 seconds for prediction.


In [50]:
all_predictions.head()

Unnamed: 0,userID,itemID,prediction
750,12,602,2.806356
751,12,1909,3.204151
752,12,322,2.918021
753,12,476,3.272648
754,12,561,2.444986


In [51]:
eval_rmse = rmse(test, predictions)
eval_mae = mae(test, predictions)
eval_rsquared = rsquared(test, predictions)
eval_exp_var = exp_var(test, predictions)

eval_map = map_at_k(test, all_predictions, col_prediction="prediction", k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction="prediction", k=TOP_K)
eval_precision = precision_at_k(
    test, all_predictions, col_prediction="prediction", k=TOP_K
)
eval_recall = recall_at_k(test, all_predictions, col_prediction="prediction", k=TOP_K)


print(
    "RMSE:\t\t%f" % eval_rmse,
    "MAE:\t\t%f" % eval_mae,
    "rsquared:\t%f" % eval_rsquared,
    "exp var:\t%f" % eval_exp_var,
    sep="\n",
)

print("----")

print(
    "MAP:\t\t%f" % eval_map,
    "NDCG:\t\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)

RMSE:		1.406256
MAE:		1.205430
rsquared:	-0.024852
exp var:	-0.018494
----
MAP:		0.000000
NDCG:		0.000000
Precision@K:	0.000000
Recall@K:	0.000000


In [52]:
# Specify the user ID for which you want to compute predictions
user_id_to_predict = 17  # Replace 1 with the actual user ID you want to predict for

# Get the items the user has already seen
items_seen_by_user = data[data['userID'] == user_id_to_predict]['itemID'].tolist()

# Filter out the items the user has already seen from all predictions
unseen_items = [item for item in train_set.all_items() if item not in items_seen_by_user]

# Compute predictions for the specified user
user_predictions = [(item, svd.predict(user_id_to_predict, item).est) for item in unseen_items]

# Sort the predictions by rating value in descending order
sorted_predictions = sorted(user_predictions, key=lambda x: x[1], reverse=True)

# Get the top 10 results
top_10_results = sorted_predictions[:10]

# Create a DataFrame for the top 10 results
top_10_df = pd.DataFrame(columns=['userID', 'itemID', 'prediction'])
for item_id, rating in top_10_results:
    top_10_df = pd.concat([top_10_df, pd.DataFrame({'userID': [user_id_to_predict],
                                                     'itemID': [item_id],
                                                     'prediction': [rating]})], ignore_index=True)

print("Top 10 recommended items for user", user_id_to_predict)
print(top_10_df)

Top 10 recommended items for user 17
  userID itemID  prediction
0     17    369    3.283073
1     17    335    3.280204
2     17    465    3.270833
3     17    231    3.264843
4     17    200    3.248397
5     17     17    3.098913
6     17    422    3.028019
7     17    228    2.994309
8     17    457    2.976814
9     17    134    2.952377


In [53]:
print(items_seen_by_user)

[1929, 1737, 752, 941, 608, 1454, 2110, 216, 1148, 1576, 666, 861, 1027, 2118, 1911, 667, 2041, 1570, 242, 2094, 214, 454, 1532, 2022, 594, 833, 254, 2010, 405, 368, 323]


In [54]:
# Specify the path to your CSV file
csv_file_path = "/Users/marianareyes/Desktop/ie_tower/chatbots-2/chatlib/datasets/data_with_itemID.csv"

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(csv_file_path)

data.head()

Unnamed: 0,itemID,name,gender,age,year,major,nationality,languages,hobbies
0,1,John Smith,Male,20,2,Computer Science and Artificial Intelligence,USA,English,"Playing video games, reading, hiking"
1,2,Emily Johnson,Female,21,3,Business Administration,Canada,"English, French","Painting, playing guitar, photography"
2,3,Michael Williams,Male,22,4,Economics,UK,English,"Playing football, watching movies, traveling"
3,4,Sarah Brown,Female,20,1,Communication and Digital Media,Australia,English,"Writing, photography, dancing"
4,5,David Jones,Male,19,1,Architecture,Germany,"German, English","Drawing, playing piano, cooking"
