# **Recommendation System: Collaborative Filtering + Matrix Factorisation & Singular Value Decomposition (SVD)**

In [None]:
# get movielens dataset
! curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

# Install and Import Modules
!pip install scikit-surprise
from surprise import Dataset, Reader
from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import accuracy
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split



# extract zip file
import zipfile
with zipfile.ZipFile("ml-latest-small.zip", 'r') as zip_ref:
  zip_ref.extractall('data')

# import the dataset
import pandas as pd
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

# Take a look at movies_df
movies_df.head()

# Take a look at ratings_df
ratings_df.head()

# Load and process data
df = pd.merge(ratings_df, movies_df[['movieId', 'genres']], on = 'movieId', how = 'left')
df

user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

df['userId'] = user_encoder.fit_transform(df['userId'])
df['movieId'] = movie_encoder.fit_transform(df['movieId'])

df = df.join(pd.DataFrame(mlb.fit_transform(df.pop('genres').str.split('|')), columns = mlb.classes_, index = df.index ))
df.drop(columns = "(no genres listed)", inplace=True)
df

# build model with collaborative filtering
train_df, test_df = train_test_split(df, test_size = 0.2)
train_df

reader = Reader(rating_scale = (0.5, 5))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
trainset

model_svd = SVD()
model_svd.fit(trainset)

predictions_svd = model_svd.test(trainset.build_anti_testset())
accuracy.rmse(predictions_svd)

# Make Recommendations
def get_top_n_recommendations(user_id, n=5):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))

  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model_svd.test(user_movie_pairs)

  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]

  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)


  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]

  top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)

  return top_n_movies

user_id = 220
recommendations = get_top_n_recommendations(user_id)
top_n_movies_titles = movies_df[movies_df['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top 5 Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  2394k      0 --:--:-- --:--:-- --:--:-- 2400k
RMSE: 0.4787
2.466948022249676
2.5610694009690267
2.562302711607097
2.6161414961973732
2.6675655731923094
Top 5 Recommendations for User 220:
1.Anaconda (1997)
2.Speed 2: Cruise Control (1997)
3.Batman & Robin (1997)
4.Godzilla (1998)
5.Honey, I Blew Up the Kid (1992)


In [None]:
from surprise import accuracy
from sklearn.metrics import precision_recall_fscore_support

# Build test set
testset = list(test_df[['userId', 'movieId', 'rating']].itertuples(index=False, name=None))

# Predict ratings
predictions_svd = model_svd.test(testset)

# Calculate RMSE and MAE
rmse = accuracy.rmse(predictions_svd)
mae = accuracy.mae(predictions_svd)

# Function to calculate precision, recall, and F1-score
def calculate_precision_recall_f1(predictions, threshold=3.5):
    y_true = [pred.r_ui for pred in predictions]
    y_pred = [pred.est for pred in predictions]
    y_pred_binary = [1 if pred >= threshold else 0 for pred in y_pred]
    y_true_binary = [1 if true >= threshold else 0 for true in y_true]
    precision, recall, f1, _ = precision_recall_fscore_support(y_true_binary, y_pred_binary, average='binary')
    return precision, recall, f1

# Calculate precision, recall, and F1-score
precision, recall, f1 = calculate_precision_recall_f1(predictions_svd)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


RMSE: 0.8796
MAE:  0.6745
RMSE: 0.8795806411254578
MAE: 0.674516336852846
Precision: 0.7944574917801784
Recall: 0.6864448051948052
F1-Score: 0.7365120836054866


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
