In [None]:
# Import TensorFlow and hub
import tensorflow as tf
import tensorflow_hub as hub

# Plotting
import matplotlib.pyplot as plt

# some important packages
import os
import re
import numpy as np
import pandas as pd

# scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

# get movielens dataset
!curl https://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ml-latest-small.zip

# extract zip file
import zipfile
with zipfile.ZipFile("ml-latest-small.zip", 'r') as zip_ref:
    zip_ref.extractall('data')

# Load Universal Sentence Encoder
# hub.load() loads the module identified by model_url; the returned object is
# called directly on a batch of texts by embed() further down.
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(model_url)
print('Model Loaded')

def embed(texts):
    """Encode a batch of texts with the loaded sentence-encoder `model`.

    Args:
        texts: the strings to encode (this notebook passes a list of
            "title genres" strings, one per movie).

    Returns:
        Whatever `model` returns for `texts`; for the movie list used in this
        notebook that is a (9742, 512) tensor.
    """
    sentence_vectors = model(texts)
    return sentence_vectors

# Load dataset
movies_df = pd.read_csv('data/ml-latest-small/movies.csv')
ratings_df = pd.read_csv('data/ml-latest-small/ratings.csv')

# Preprocess data: keep only the columns used below, drop incomplete movie
# rows, and renumber the movies 0..N-1 so row position == embedding row.
movies_df = (
    movies_df.loc[:, ["movieId", "title", "genres"]]
    .dropna()
    .reset_index(drop=True)
)
ratings_df = ratings_df.loc[:, ["userId", "movieId", "rating"]]

# Combine title and genres for embeddings ("<title> <genres>")
movies_df['combined'] = movies_df['title'].str.cat(movies_df['genres'], sep=' ')
combined_texts = movies_df['combined'].tolist()

# Generate embeddings for movies (one row per entry of combined_texts)
embeddings = embed(combined_texts)
print("The embedding shape is:", embeddings.shape)

# Ensure movie IDs are properly aligned: movieId -> row position in
# movies_df, which is also the row of that movie in `embeddings`
movie_id_to_index = dict(zip(movies_df['movieId'].tolist(), range(len(movies_df))))

# Split the ratings into training and testing sets (80% / 20%, fixed seed)
train_df, test_df = train_test_split(ratings_df, random_state=42, test_size=0.2)

# Compute user profiles
# A profile is the rating-weighted mean of the embeddings of the movies a user
# rated. Only the training split is used, so the held-out ratings in test_df
# stay unseen by the profiles they are later evaluated against.
embedding_matrix = embeddings.numpy()  # convert the tensor once, not once per movie

user_profiles = {}
for user_id, user_ratings in train_df.groupby('userId'):
    # Map every rated movieId to its embedding row; IDs that have no
    # embedding come back as NaN.
    user_movie_indices = user_ratings['movieId'].map(movie_id_to_index)
    has_embedding = user_movie_indices.notna().to_numpy()
    if not has_embedding.any():
        continue  # nothing to build a profile from

    # Apply the same mask to indices and ratings so that every weight stays
    # paired with the movie it was given to.
    user_movie_indices = user_movie_indices.to_numpy()[has_embedding].astype(int)
    user_weights = user_ratings['rating'].to_numpy()[has_embedding]

    user_profiles[user_id] = np.average(
        embedding_matrix[user_movie_indices], axis=0, weights=user_weights
    )

# Fit Nearest Neighbours model on movie embeddings
# (fit() returns the estimator itself, so construction and fitting are chained;
# neighbours are ranked by cosine distance, 10 by default)
nn = NearestNeighbors(metric='cosine', n_neighbors=10).fit(embeddings)

def get_top_n_recommendations(user_id, n=5):
    """Return the titles of the `n` movies closest to a user's profile.

    Args:
        user_id: key into `user_profiles`.
        n: number of titles to return (default 5). Values larger than the
            number of movies are clamped to it.

    Returns:
        A list of titles ordered from most to least similar (ascending
        cosine distance to the user's profile). Empty when the user has no
        profile or `n` is not positive. Movies the user has already rated
        are not filtered out.
    """
    if user_id not in user_profiles or n <= 0:
        return []

    user_profile = user_profiles[user_id].reshape(1, -1)
    # kneighbors() raises when asked for more neighbours than fitted samples.
    n = min(n, len(movies_df))
    _, indices = nn.kneighbors(user_profile, n_neighbors=n)

    # indices[0] is ordered nearest-first and holds row positions of
    # movies_df (the model was fitted on one embedding per movies_df row), so
    # positional indexing keeps the ranking intact.
    return movies_df.iloc[indices[0]]['title'].tolist()

# Example usage: print a numbered list of recommendations for one user
user_id = 220
recommendations = get_top_n_recommendations(user_id)
print(f"Top {len(recommendations)} Recommendations for User {user_id}:")
for rank, title in enumerate(recommendations, start=1):
    print(f"{rank}. {title}")




  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  955k  100  955k    0     0  1288k      0 --:--:-- --:--:-- --:--:-- 1287k
Model Loaded
The embedding shape is: (9742, 512)
Top 5 Recommendations for User 220:
1. Bio-Dome (1996)
2. Orgazmo (1997)
3. Bowfinger (1999)
4. Darkman (1990)
5. Patlabor 2: The Movie (1993)


In [None]:
# Evaluate Nearest Neighbours with USE Model
# For every held-out rating whose user has a profile and whose movie has an
# embedding, predict a rating as 5 * cosine(user profile, movie embedding) and
# compare it with the true rating.
LIKE_THRESHOLD = 3.5  # ratings at or above this count as "liked" for precision/recall

embedding_matrix = embeddings.numpy()  # convert the tensor once, not once per row

test_ratings = []
predicted_ratings = []

# itertuples() avoids building a Series per row and keeps the ID columns integral.
for row in test_df.itertuples(index=False):
    user_id = row.userId
    movie_id = row.movieId

    if user_id in user_profiles and movie_id in movie_id_to_index:
        user_profile = user_profiles[user_id].reshape(1, -1)
        movie_index = movie_id_to_index[movie_id]
        movie_embedding = embedding_matrix[movie_index].reshape(1, -1)
        # NOTE(review): cosine similarity lies in [-1, 1], so this scaled value
        # can be negative and is only a rough stand-in for a rating.
        predicted_rating = cosine_similarity(user_profile, movie_embedding)[0][0] * 5  # Scale similarity to rating

        test_ratings.append(row.rating)
        predicted_ratings.append(predicted_rating)

# Calculate RMSE and MAE
# RMSE is taken as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed.
rmse_use = np.sqrt(mean_squared_error(test_ratings, predicted_ratings))
mae_use = mean_absolute_error(test_ratings, predicted_ratings)

# Calculate precision, recall, and F1-score on the binarised "liked" labels
precision_use, recall_use, f1_use, _ = precision_recall_fscore_support(
    [1 if r >= LIKE_THRESHOLD else 0 for r in test_ratings],
    [1 if r >= LIKE_THRESHOLD else 0 for r in predicted_ratings],
    average='binary'
)

print(f"RMSE (USE): {rmse_use}")
print(f"MAE (USE): {mae_use}")
print(f"Precision (USE): {precision_use}")
print(f"Recall (USE): {recall_use}")
print(f"F1-Score (USE): {f1_use}")

RMSE (USE): 1.5305606045269466
MAE (USE): 1.3102012039818995
Precision (USE): 0.6091954022988506
Recall (USE): 0.004295323770159657
F1-Score (USE): 0.008530500563334945


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
