<a href="https://colab.research.google.com/github/CodewithTanzeel/Movie-Recommandation-Systems/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD
import joblib
import pickle
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Movie DataFrame for Content based Filtering

In [4]:
ratings_cols = ['userId', 'movieId', 'rating', 'timestamp']
ratings_df = pd.read_csv('u.data',sep = '\t',header=None, names=ratings_cols)

movie_cols = [
    "movieId","title","release_date","video_release_date","imdb_url",
    "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
    "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
    "Romance","Sci-Fi","Thriller","War","Western"
]
movies_df = pd.read_csv('u.item',sep = '|',header=None, names=movie_cols, encoding='latin-1')


genre_cols = movie_cols[5:]
movies_with_genre = movies_df[genre_cols]





## User Dataframe for Collborative Filtering

In [5]:
users_df = pd.read_csv(
    'u.user',
    sep="|",
    header=None,
    names=["userId","age","gender","occupation","zip"]
)


In [6]:
# --- Quick sanity checks ---
print("ratings_df:", ratings_df.shape)
print("movies_with_genre:", movies_with_genre.shape)
print("users_df:", users_df.shape)

print("Unique users:", ratings_df["userId"].nunique())
print("Unique movies in ratings:", ratings_df["movieId"].nunique())
print("Rating range:", ratings_df["rating"].min(), "→", ratings_df["rating"].max())

ratings_df: (100000, 4)
movies_with_genre: (1682, 19)
users_df: (943, 5)
Unique users: 943
Unique movies in ratings: 1682
Rating range: 1 → 5


In [9]:
merge_df = pd.merge(ratings_df, movies_df, on='movieId')
display(merge_df.head())
print(f"Shape:  {merge_df.shape}")


Unnamed: 0,userId,movieId,rating,timestamp,title,release_date,video_release_date,imdb_url,unknown,Action,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,L.A. Confidential (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...,0,0,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,Heavyweights (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Heavyweights%...,0,0,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,Legends of the Fall (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?Legends%20of%...,0,0,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,Jackie Brown (1997),01-Jan-1997,,http://us.imdb.com/M/title-exact?imdb-title-11...,0,0,...,0,0,0,0,0,0,0,0,0,0


Shape:  (100000, 27)


###Are any NaNs present in rating, title, or genres?

In [10]:
print("NaNs per column:\n", merge_df.isnull().sum())

NaNs per column:
 userId                     0
movieId                    0
rating                     0
timestamp                  0
title                      0
release_date               9
video_release_date    100000
imdb_url                  13
unknown                    0
Action                     0
Adventure                  0
Animation                  0
Children                   0
Comedy                     0
Crime                      0
Documentary                0
Drama                      0
Fantasy                    0
Film-Noir                  0
Horror                     0
Musical                    0
Mystery                    0
Romance                    0
Sci-Fi                     0
Thriller                   0
War                        0
Western                    0
dtype: int64


### ---- Ratings distribution (should be 1–5 only) ----

In [11]:
print("Ratings distribution:\n", merge_df['rating'].value_counts().sort_index())

Ratings distribution:
 rating
1     6110
2    11370
3    27145
4    34174
5    21201
Name: count, dtype: int64


### ---- Movies with missing titles or genres ----

In [12]:
print("Missing titles:", merge_df['title'].isna().sum())

Missing titles: 0


In [13]:
print(merge_df.head())

   userId  movieId  rating  timestamp                       title  \
0     196      242       3  881250949                Kolya (1996)   
1     186      302       3  891717742    L.A. Confidential (1997)   
2      22      377       1  878887116         Heavyweights (1994)   
3     244       51       2  880606923  Legends of the Fall (1994)   
4     166      346       1  886397596         Jackie Brown (1997)   

  release_date  video_release_date  \
0  24-Jan-1997                 NaN   
1  01-Jan-1997                 NaN   
2  01-Jan-1994                 NaN   
3  01-Jan-1994                 NaN   
4  01-Jan-1997                 NaN   

                                            imdb_url  unknown  Action  ...  \
0    http://us.imdb.com/M/title-exact?Kolya%20(1996)        0       0  ...   
1  http://us.imdb.com/M/title-exact?L%2EA%2E+Conf...        0       0  ...   
2  http://us.imdb.com/M/title-exact?Heavyweights%...        0       0  ...   
3  http://us.imdb.com/M/title-exact?Legends%

### (since genre columns are binary, sum=0 means no genre assigned)

In [14]:
print("Movies with no genre info:", (merge_df[genre_cols].sum(axis=1) == 0).sum())

Movies with no genre info: 0


### ---- Cleanup (if needed) ----
 In ml-100k, every movie has at least 1 genre → nothing to drop

In [15]:
merge_df.reset_index(drop=True, inplace=True)
print(f"After Cleanup: {merge_df.shape}")

After Cleanup: (100000, 27)


### ---- Cleanup (if needed) ----
# In ml-100k, every movie has at least 1 genre → nothing to drop

In [16]:
merge_df.reset_index(drop=True, inplace=True)
print(f"After Cleanup: {merge_df.shape}")

After Cleanup: (100000, 27)


### ---- Convert timestamp ----

In [17]:
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit='s')

## ---- Quick check for anomalies in user behavior (optional sanity check) ----

In [18]:
ratings_df_sorted = ratings_df.sort_values(['userId', 'timestamp'])
avg_time_gap = ratings_df_sorted.groupby('userId')['timestamp'].apply(lambda x: x.diff().mean())

print(avg_time_gap.describe())
merge_df.to_pickle('sample_data/merge_df.pkl')

count                          943
mean     0 days 05:29:06.164137756
std      0 days 15:26:09.885964213
min      0 days 00:00:05.225806451
25%      0 days 00:00:20.907741251
50%      0 days 00:00:45.590909090
75%      0 days 01:53:37.921096774
max      7 days 10:08:21.909090909
Name: timestamp, dtype: object


# ---- Temporal Train/Validation/Test Split ----

### Step 1: Sort all ratings by user and timestamp

In [19]:
merge_df = merge_df.sort_values(by=['userId', 'timestamp'])
train_list, val_list, test_list = [], [], []

### Step 2: For each user, split their ratings

In [20]:

for user_id, user_data in merge_df.groupby('userId'):
    n_ratings = len(user_data)
    if n_ratings < 3:
        # If too few ratings, put all in train (to avoid empty test/val)
        train_list.append(user_data)
        continue

    # Define split sizes (80/10/10 rule)
    train_end = int(0.8 * n_ratings)
    val_end = int(0.9 * n_ratings)

    train_list.append(user_data.iloc[:train_end])
    if val_end > train_end:
        val_list.append(user_data.iloc[train_end:val_end])
    if val_end < n_ratings:
        test_list.append(user_data.iloc[val_end:])

train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)

Display

In [21]:
print(f"Train size: {train_df.shape}")
print(f"Val size: {val_df.shape}")
print(f"Test size: {test_df.shape}")



Train size: (79619, 27)
Val size: (9942, 27)
Test size: (10439, 27)


Saving the models By use of pickle

In [22]:
train_df.to_pickle('sample_data/train_df.pkl')
val_df.to_pickle('sample_data/val_df.pkl')
test_df.to_pickle('sample_data/test_df.pkl')

In [24]:
POSTIVE_THRESHOLD = 4.0

all_movie_ids = merge_df['movieId'].unique()

def create_pos_neg_samples(df, neg_ratio=1):
    samples = []
    for user_id, user_data in df.groupby('userId'):
        pos_items = user_data[user_data['rating'] >= POSTIVE_THRESHOLD]['movieId'].tolist()

        # mark positives
        for pos in pos_items:
            samples.append((user_id, pos, 1))  # label=1

        # negative sampling proportional to positives
        num_negatives = neg_ratio * len(pos_items)
        if num_negatives > 0:
            neg_items = np.setdiff1d(all_movie_ids , user_data['movieId'].unique())
            neg_samples = np.random.choice(neg_items, size=min(num_negatives, len(neg_items)), replace=False)

            for neg in neg_samples:
                samples.append((user_id, neg, 0))  # label=0

    return pd.DataFrame(samples, columns=['userId', 'movieId', 'label'])

In [25]:
# Apply to splits
train_samples = create_pos_neg_samples(train_df)
val_samples = create_pos_neg_samples(val_df)   # fewer negatives for validation
test_samples = create_pos_neg_samples(test_df)

Displaying and Saving Models For Train,Values,Test

In [26]:
print("Train samples:", train_samples.shape)
print("Val samples:", val_samples.shape)
print("Test samples:", test_samples.shape)

train_samples.to_pickle('sample_data/train_samples.pkl')
val_samples.to_pickle('sample_data/val_samples.pkl')
test_samples.to_pickle('sample_data/test_samples.pkl')

display(train_samples[train_samples['userId'] == 66])


Train samples: (91180, 3)
Val samples: (9588, 3)
Test samples: (9982, 3)


Unnamed: 0,userId,movieId,label
6976,66,258,1
6977,66,294,1
6978,66,300,1
6979,66,127,1
6980,66,50,1
6981,66,9,1
6982,66,471,1
6983,66,298,1
6984,66,237,1
6985,66,508,1


In [27]:
print(train_samples['label'].value_counts(normalize=True))

label
1    0.5
0    0.5
Name: proportion, dtype: float64


# #computing  “most popular” : Highest number of ratings + Highest average rating

In [28]:
# Group by title and compute average rating and count of ratings

movie_ratings = merge_df.groupby('title').agg(
    average_rating = ('rating', 'mean'),
    rating_count = ('rating', 'size')
).reset_index()

In [29]:
display(movie_ratings.head())

Unnamed: 0,title,average_rating,rating_count
0,'Til There Was You (1997),2.333333,9
1,1-900 (1994),2.6,5
2,101 Dalmatians (1996),2.908257,109
3,12 Angry Men (1957),4.344,125
4,187 (1997),3.02439,41


### Sort by Rating count descending, then average rating descending

In [31]:
movie_ratings_sorted = movie_ratings.sort_values(by=['rating_count','average_rating'],ascending=[False,False])

### Display the result

In [32]:
display(movie_ratings_sorted.head())

Unnamed: 0,title,average_rating,rating_count
1398,Star Wars (1977),4.358491,583
333,Contact (1997),3.803536,509
498,Fargo (1996),4.155512,508
1234,Return of the Jedi (1983),4.00789,507
860,Liar Liar (1997),3.156701,485


## Filter out movie with less than 50 ratings

In [33]:
popular_movies = movie_ratings_sorted[movie_ratings_sorted['rating_count'] >= 50]
display(popular_movies.head())
print(popular_movies.shape)
popular_movies.to_pickle('sample_data/popular_movies.pkl')

Unnamed: 0,title,average_rating,rating_count
1398,Star Wars (1977),4.358491,583
333,Contact (1997),3.803536,509
498,Fargo (1996),4.155512,508
1234,Return of the Jedi (1983),4.00789,507
860,Liar Liar (1997),3.156701,485


(605, 3)


### Get top movies based on Genres

In [34]:
genre_movie_ratings = merge_df.groupby(['title']).agg(
    avg_rating=('rating', 'mean'),
    rating_count=('rating', 'size')
).reset_index()

### Expand genres (since they are one-hot in u.item, not string list)

In [35]:
genre_cols = movies_df.columns[6:]   # from 'unknown' onward
movie_genres = movies_df[['movieId','title'] + list(genre_cols)]

### Merge ratings + genres

In [36]:
movie_with_ratings = pd.merge(movie_genres, genre_movie_ratings, on="title")

In [37]:
# For each genre, pick top 3 movies with ≥20 ratings
top_n_per_genre = {}
for genre in genre_cols:
    top_n_per_genre[genre] = (
        movie_with_ratings[movie_with_ratings[genre] == 1]
        .query("rating_count >= 20")
        .sort_values(["avg_rating", "rating_count"], ascending=[False, False])
        .head(3)
    )

with open('sample_data/top_n_per_genre.pkl', 'wb') as f:
    pickle.dump(top_n_per_genre, f)

display(top_n_per_genre.keys())
display(top_n_per_genre['Horror'].head())

dict_keys(['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,rating_count
184,185,Psycho (1960),0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,4.100418,239
182,183,Alien (1979),1,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,4.034364,291
207,208,Young Frankenstein (1974),0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,3.945,200


In [38]:
# Get top movies based on Genres

genre_movie_ratings = merge_df.groupby(['title']).agg(
    avg_rating=('rating', 'mean'),
    rating_count=('rating', 'size')
).reset_index()

# Expand genres (since they are one-hot in u.item, not string list)
genre_cols = movies_df.columns[6:]   # from 'unknown' onward
movie_genres = movies_df[['movieId','title'] + list(genre_cols)]

# Merge ratings + genres
movie_with_ratings = pd.merge(movie_genres, genre_movie_ratings, on="title")

# For each genre, pick top 3 movies with ≥20 ratings
top_n_per_genre = {}
for genre in genre_cols:
    top_n_per_genre[genre] = (
        movie_with_ratings[movie_with_ratings[genre] == 1]
        .query("rating_count >= 20")
        .sort_values(["avg_rating", "rating_count"], ascending=[False, False])
        .head(3)
    )

with open('sample_data/top_n_per_genre.pkl', 'wb') as f:
    pickle.dump(top_n_per_genre, f)

display(top_n_per_genre.keys())
display(top_n_per_genre['Horror'].head())

dict_keys(['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,avg_rating,rating_count
184,185,Psycho (1960),0,0,0,0,0,0,0,0,...,1,0,0,1,0,1,0,0,4.100418,239
182,183,Alien (1979),1,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,4.034364,291
207,208,Young Frankenstein (1974),0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,3.945,200


In [39]:
user_movie_matrix = merge_df.pivot_table(
    index='userId',
    columns='movieId',  # use 'movieId' if you want numeric columns
    values='rating'
)
#display(user_movie_matrix.head())
user_movie_filled = user_movie_matrix.fillna(0)
display(user_movie_filled.head())

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### extract number of latent features (hidden pattens e.g users who like "inception" also like "interstellar")

In [40]:
k =20
svd = TruncatedSVD(n_components=k)
svd_matrix = svd.fit_transform(user_movie_filled)

reconstructed_matrix = svd.inverse_transform(svd_matrix)

### Convert back to DataFrame for easy access

In [41]:
predicted_ratings = pd.DataFrame(
    reconstructed_matrix,
    index=user_movie_filled.index,
    columns=user_movie_filled.columns
)


### ensure predicted ratings are within the bound of 0.5 to 5

In [42]:
predicted_ratings = predicted_ratings.clip(lower=0.5, upper=5.0)
joblib.dump(svd, "sample_data/svd_model.pkl")

['sample_data/svd_model.pkl']

In [43]:
# Assume you already have:
# - svd (trained TruncatedSVD model)
# - user_movie_filled (users × movies ratings matrix)
# - merge_df (to map movieId → title)

# Step 1: Get movie embeddings in latent space (shape: [num_movies, k])
movie_embeddings = svd.components_.T   # transpose to get movies × latent factors

# Step 2: Create a mapping from movieId → title
movie_map = dict(zip(merge_df["movieId"], merge_df["title"]))

# Step 3: Function to get similar movies
def get_similar_movies(movie_name, top_n=10):
    # Find movieId for the given title
    movie_id = merge_df.loc[merge_df["title"] == movie_name, "movieId"].values
    if len(movie_id) == 0:
        raise ValueError(f"Movie '{movie_name}' not found in dataset.")
    movie_id = movie_id[0]

    # Get index of movieId in the pivot matrix
    try:
        movie_idx = list(user_movie_filled.columns).index(movie_id)
    except ValueError:
        raise ValueError(f"Movie '{movie_name}' not present in pivot table.")

    # Get cosine similarity scores for this movie
    similarities = cosine_similarity(
        [movie_embeddings[movie_idx]], movie_embeddings
    )[0]

    # Get top N similar movies (excluding itself)
    similar_idx = similarities.argsort()[::-1][1:top_n+1]

    # Map back to movieIds and titles
    similar_movie_ids = [user_movie_filled.columns[i] for i in similar_idx]
    similar_movies = [(movie_map[mid], similarities[i]) for i, mid in zip(similar_idx, similar_movie_ids)]

    return pd.DataFrame(similar_movies, columns=["Movie", "Similarity Score"])

# Example usage:
print(get_similar_movies("Toy Story (1995)", top_n=5))


                                          Movie  Similarity Score
0           Hunchback of Notre Dame, The (1996)          0.678881
1  Willy Wonka and the Chocolate Factory (1971)          0.657437
2                     Mr. Holland's Opus (1995)          0.636360
3               Star Trek: First Contact (1996)          0.576589
4              James and the Giant Peach (1996)          0.557392


## -------- Example: Recommend for one user -------- #

In [44]:
user_id = 1  # Try any user
user_predictions = predicted_ratings.loc[user_id].sort_values(ascending=False)

# Movies the user has already rated
already_rated = user_movie_matrix.loc[user_id][user_movie_matrix.loc[user_id].notna()].index

# Top 5 recommendations (by movieId)
recommended_ids = user_predictions.drop(labels=already_rated).head(5).index.tolist()

# Map movieId → title
recommended_titles = movies_df.set_index('movieId').loc[recommended_ids, 'title'].tolist()

#print("Top 5 SVD recommendations for user", user_id, ":", recommended_titles)
display(predicted_ratings)
predicted_ratings.to_pickle("sample_data/svd_predicted_ratings.pkl")

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.008034,2.150773,1.259207,3.251537,0.566817,0.579443,4.345130,2.895911,3.155485,1.974426,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
2,1.980681,0.500000,0.500000,0.500000,0.500000,0.500000,1.626339,0.506675,2.667495,0.667172,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
4,0.514219,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
5,3.521468,1.277700,0.500000,1.591976,0.500000,0.500000,2.725043,1.213051,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,2.452425,0.500000,0.500000,0.500000,0.500000,0.500000,1.440608,0.502359,1.792441,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
940,1.949803,0.500000,0.500000,1.955075,0.530658,0.500000,2.360671,1.691660,1.183960,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
941,2.250374,0.500000,0.500000,0.500000,0.500000,0.500000,1.761408,0.500000,0.755616,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
942,1.344377,0.500000,0.500000,0.500000,0.500000,0.500000,0.500000,1.496422,0.500000,0.500000,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5


In [45]:
from sklearn.preprocessing import MultiLabelBinarizer

# 1. Split genres into lists
movies_df['genre_list'] = movies_df.iloc[:, 5:].apply(
    lambda row: [col for col, val in row.items() if val == 1], axis=1
)

# 2. Multi-hot encoding (redundant since it's already multi-hot in ML100K,
# but this ensures consistency)
mlb = MultiLabelBinarizer()
genre_multi_hot = mlb.fit_transform(movies_df['genre_list'])

# 3. Build clean dataframe with movieId, title, and genres
genre_cols = [g for g in mlb.classes_ if g.lower() != "unknown"]

genre_df = pd.DataFrame(genre_multi_hot, columns=mlb.classes_, index=movies_df['movieId'])
# Drop 'unknown' if present
if 'unknown' in genre_df.columns:
    genre_df = genre_df.drop(columns=['unknown'])

movies_with_genre = movies_df[['movieId', 'title']].join(genre_df, on='movieId')

print("Number of genres considered:", len(genre_df.columns))
display(movies_with_genre.head())


Number of genres considered: 18


Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [46]:
# After: train_samples / val_samples / test_samples exist
# After: movies_with_genres exists and has one row per movieId with genre columns

# 1) Build ID maps
unique_users = sorted(merge_df['userId'].unique())
unique_movies = sorted(merge_df['movieId'].unique())
display(len(unique_users))
display(len(unique_movies))
user2idx = {u:i for i,u in enumerate(unique_users)}
movie2idx = {m:i for i,m in enumerate(unique_movies)}

# 2) Apply maps to each split
def add_index_cols(df):
  out = df.copy()
  out['user_idx']  = out['userId'].map(user2idx)
  out['item_idx']  = out['movieId'].map(movie2idx)
  return out

train_samples = add_index_cols(train_samples)
val_samples = add_index_cols(val_samples)
test_samples = add_index_cols(test_samples)

# 3) Keep a genre matrix aligned with item_idx
genre_cols = [c for c in movies_with_genre.columns if c not in ['movieId','title']]
# Create an array where row i corresponds to item_idx i
movies_with_genre['item_idx'] = movies_with_genre['movieId'].map(movie2idx)
genre_matrix = (
    movies_with_genre
    .sort_values('item_idx')
    [genre_df.columns]
    .to_numpy()
)

num_users  = len(unique_users)
num_items  = len(unique_movies)
genre_dim  = len(genre_cols)

print("Users:", num_users, "Movies:", num_items, "Genre dim:", genre_dim)

943

1682

Users: 943 Movies: 1682 Genre dim: 18


# DataFrames

In [47]:
movies_with_genre.to_pickle("sample_data/movies_with_genre.pkl")
train_samples.to_pickle("sample_data/train_samples.pkl")
val_samples.to_pickle("sample_data/val_samples.pkl")
test_samples.to_pickle("sample_data/test_samples.pkl")

In [48]:
# Numpy
np.save("sample_data/genre_matrix.npy", genre_matrix)

# Dicts
with open("sample_data/user2idx.pkl", "wb") as f:
    pickle.dump(user2idx, f)

with open("sample_data/movie2idx.pkl", "wb") as f:
    pickle.dump(movie2idx, f)

In [50]:
# ---- User Tower ---- #
user_input = keras.Input(shape=(1,), name="user_idx")

# 1) Embedding Layer: learns a dense representation for each user
user_embedding = layers.Embedding(
    input_dim=num_users,   # total unique users
    output_dim=32,         # embedding size (tunable hyperparameter)
    name="user_embedding"
)(user_input)

# 2) Flatten embedding (convert [batch, 1, 32] → [batch, 32])
user_vec = layers.Flatten()(user_embedding)

# 3) Dense layers for non-linear transformations (can capture deeper taste patterns)
user_vec = layers.Dense(64, activation="relu")(user_vec)
user_vec = layers.Dense(32, activation="relu")(user_vec)

# 4) Final User Tower Model
user_tower = keras.Model(user_input, user_vec, name="UserTower")
user_tower.summary()


In [51]:
from tensorflow import keras
from tensorflow.keras import layers

# ---- Item Tower ---- #
item_input = keras.Input(shape=(1,), name="item_idx")
genre_input = keras.Input(shape=(genre_dim,), name="genre_vec")

# 1) Item Embedding (learns latent representation of items by ID)
item_embedding = layers.Embedding(
    input_dim=num_items,
    output_dim=32,
    name="item_embedding"
)(item_input)

item_embedding = layers.Flatten()(item_embedding)

# 2) Genre Dense Layers (transform multi-hot genre vector into dense features)
genre_vec = layers.Dense(64, activation="relu")(genre_input)
genre_vec = layers.Dense(32, activation="relu")(genre_vec)

# 3) Combine Item Embedding + Genre Vector
combined = layers.Concatenate()([item_embedding, genre_vec])

# 4) Non-linear transformations
item_vec = layers.Dense(64, activation="relu")(combined)
item_vec = layers.Dense(32, activation="relu")(item_vec)

# 5) Final Item Tower Model
item_tower = keras.Model(
    inputs=[item_input, genre_input],
    outputs=item_vec,
    name="ItemTower"
)

item_tower.summary()


In [52]:
u_in = keras.Input(shape=(1,), name = "user_idx")
i_in =  keras.Input(shape=(1,),  name="item_idx")
g_in = keras.Input(shape=(genre_dim,), name="genre_vec")

u_vec = user_tower(u_in)  #batch size = 32
i_vec = item_tower([i_in, g_in]) #batch size = 32



#Similarity Scores: dot product
logits = layers.Dot(axes = 1, name ="dot")([u_vec,i_vec])

p = layers.Activation("sigmoid", name="p")(logits)

two_tower = keras.Model(inputs=[u_in, i_in, g_in], outputs=p, name="TwoTower")
two_tower.summary()

In [53]:
def df_to_arrays(df, genre_matrix):
    # 1) base indices
    u = df["user_idx"].astype("int32").values
    i = df["item_idx"].astype("int32").values
    y = df["label"].astype("float32").values
    # 2) look up genre vectors aligned by item_idx
    g = genre_matrix[i]  # shape: (batch, genre_dim)
    x = {"user_idx": u, "item_idx": i, "genre_vec": g}
    return x, y

train_x, train_y = df_to_arrays(train_samples, genre_matrix)
val_x,   val_y   = df_to_arrays(val_samples,   genre_matrix)
test_x,  test_y  = df_to_arrays(test_samples,  genre_matrix)

print({k: v.shape for k,v in train_x.items()}, train_y.shape)


{'user_idx': (91180,), 'item_idx': (91180,), 'genre_vec': (91180, 18)} (91180,)


In [54]:
user_tower.save("sample_data/user_tower.keras")
item_tower.save("sample_data/item_tower.keras")
two_tower.save("sample_data/two_tower.keras")

In [55]:
two_tower.compile(optimizer = keras.optimizers.Adam(1e-3),
                  loss="binary_crossentropy",
                    metrics=[keras.metrics.AUC(name="auc"), "accuracy"])


history = two_tower.fit(
    train_x, train_y,
    validation_data=(val_x, val_y),
    epochs=5,
    batch_size=256,
    shuffle=True,
    verbose=1
)

Epoch 1/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 15ms/step - accuracy: 0.6477 - auc: 0.7516 - loss: 0.6245 - val_accuracy: 0.7158 - val_auc: 0.7408 - val_loss: 0.6367
Epoch 2/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7962 - auc: 0.8465 - loss: 0.5664 - val_accuracy: 0.7044 - val_auc: 0.7529 - val_loss: 0.6430
Epoch 3/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8004 - auc: 0.8503 - loss: 0.5639 - val_accuracy: 0.7263 - val_auc: 0.7467 - val_loss: 0.6430
Epoch 4/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8006 - auc: 0.8574 - loss: 0.5552 - val_accuracy: 0.7284 - val_auc: 0.7502 - val_loss: 0.6342
Epoch 5/5
[1m357/357[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8060 - auc: 0.8602 - loss: 0.5509 - val_accuracy: 0.7300 - val_auc: 0.7589 - val_loss: 0.6409


In [56]:
# --- 1. Evaluate Two-Tower model ---
test_loss, test_auc, test_acc = two_tower.evaluate(test_x, test_y, verbose=1)
print(f"\nTwo-Tower Test -> Loss: {test_loss:.4f}, AUC: {test_auc:.4f}, Accuracy: {test_acc:.4f}")

# --- 2. Precompute Item Embeddings ---
all_item_idx = np.arange(num_items, dtype="int32")
all_genres   = genre_matrix[all_item_idx]

item_encoder = keras.Model(item_tower.inputs, item_tower.outputs)
all_item_vecs = item_encoder.predict(
    {"item_idx": all_item_idx, "genre_vec": all_genres},
    batch_size=512, verbose=0
)  # shape: (num_items, embedding_dim)


# --- 3. Normalization helper (to 0.5 - 5.0 rating scale) ---
def normalize_scores(scores, min_rating=0.5, max_rating=5.0):
    min_score, max_score = scores.min(), scores.max()
    norm_scores = (scores - min_score) / (max_score - min_score + 1e-8)
    return norm_scores * (max_rating - min_rating) + min_rating


# --- 4. Two-Tower Recommender ---
def recommend_two_tower(user_id, top_k=5, exclude_rated=True):
    if user_id not in user2idx:
        return pd.DataFrame({"Title": [], "TwoTower_PredictedRating": []})

    # 1) Get user embedding
    uidx = user2idx[user_id]
    u_vec = user_tower.predict(np.array([uidx]), verbose=0)  # (1, emb_dim)

    # 2) Dot product with all items
    scores = (u_vec @ all_item_vecs.T).ravel()
    scores = normalize_scores(scores)

    # 3) Exclude already-rated items
    if exclude_rated:
        rated_items = merge_df.loc[merge_df["userId"]==user_id, "movieId"].unique()
        rated_idx   = np.array([movie2idx[m] for m in rated_items if m in movie2idx])
        scores[rated_idx] = -1e9

    # 4) Get Top-K
    top_idx = np.argpartition(scores, -top_k)[-top_k:]
    top_idx = top_idx[np.argsort(scores[top_idx])][::-1]

    top_movie_ids = [unique_movies[i] for i in top_idx]
    top_titles = movies_df.set_index("movieId").loc[top_movie_ids, "title"].tolist()

    return pd.DataFrame({
        "Title": top_titles,
        "TwoTower_PredictedRating": scores[top_idx].round(2)
    }).reset_index(drop=True)


# --- 5. SVD Recommender ---
def recommend_svd(user_id, top_k=5):
    user_predictions = predicted_ratings.loc[user_id].sort_values(ascending=False)

    # Exclude movies already rated by the user
    already_rated = user_movie_matrix.loc[user_id][user_movie_matrix.loc[user_id].notna()].index
    recommendations = user_predictions.drop(labels=already_rated).head(top_k)

    # Map movieId → title for readability
    movieid_to_title = movies_df.set_index("movieId")["title"].to_dict()
    titles = [movieid_to_title[mid] for mid in recommendations.index]

    return pd.DataFrame({
        "Title": titles,
        "SVD_PredictedRating": recommendations.values.round(2)
    }).reset_index(drop=True)



# --- 6. Comparison Helper ---
def compare_recommendations(user_id, top_k=5):
    svd_df = recommend_svd(user_id, top_k)
    twotower_df = recommend_two_tower(user_id, top_k)

    # Side-by-side top-k (not merged by title, just rank-based comparison)
    comparison = pd.concat([svd_df, twotower_df], axis=1)
    return comparison


# --- 7. Demo for one user ---
user_id = 1
display(f"---- Comparison for User {user_id} ----")
display(compare_recommendations(user_id, top_k=5))


[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7292 - auc: 0.7585 - loss: 0.6457

Two-Tower Test -> Loss: 0.6501, AUC: 0.7537, Accuracy: 0.7247


'---- Comparison for User 1 ----'

Unnamed: 0,Title,SVD_PredictedRating,Title.1,TwoTower_PredictedRating
0,Trainspotting (1996),4.01,"English Patient, The (1996)",3.2
1,Sense and Sensibility (1995),3.41,E.T. the Extra-Terrestrial (1982),3.18
2,E.T. the Extra-Terrestrial (1982),3.29,Air Force One (1997),3.0
3,"Close Shave, A (1995)",3.23,Liar Liar (1997),2.91
4,Schindler's List (1993),3.18,Casablanca (1942),2.59


In [None]:
# save
np.save("sample_data/all_item_vecs.npy", all_item_vecs)


In [None]:
%%writefile app_gradio.py
import gradio as gr
import pandas as pd
import numpy as np
import pickle
import joblib
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity

# ===============================
# Load Data + Models
# ===============================
def load_models():
    base_path = "sample_data/"

    with open(base_path + "popular_movies.pkl", "rb") as f:
        popular_movies = pickle.load(f)

    with open(base_path + "top_n_per_genre.pkl", "rb") as f:
        top_n_per_genre = pickle.load(f)

    with open(base_path + "svd_predicted_ratings.pkl", "rb") as f:
        svd_preds = pickle.load(f)

    with open(base_path + "movie2idx.pkl", "rb") as f:
        movie2idx = pickle.load(f)

    with open(base_path + "user2idx.pkl", "rb") as f:
        user2idx = pickle.load(f)

    with open(base_path + "movies_with_genre.pkl", "rb") as f:
        movies_with_genre = pickle.load(f)

    # Models
    user_tower = tf.keras.models.load_model(base_path + "user_tower.keras")
    item_tower = tf.keras.models.load_model(base_path + "item_tower.keras")
    all_item_vecs = np.load(base_path + "all_item_vecs.npy")

    svd_model = joblib.load(base_path + "svd_model.pkl")

    return (popular_movies, top_n_per_genre, svd_preds, movie2idx, user2idx,
            movies_with_genre, user_tower, item_tower, all_item_vecs, svd_model)

(popular_movies, top_n_per_genre, svd_preds, movie2idx, user2idx,
 movies_with_genre, user_tower, item_tower, all_item_vecs, svd_model) = load_models()

# ===============================
# Helper Functions
# ===============================
def recommend_popular(n=10):
    return popular_movies.head(n).to_dict()

def recommend_by_genre(genre, n=10):
    return top_n_per_genre[genre].head(n).to_dict()

def recommend_two_tower(user_id, n=10):
    if user_id not in user2idx:
        return {}

    user_idx = np.array([user2idx[user_id]])
    user_vec = user_tower.predict(user_idx, verbose=0)
    scores = np.dot(all_item_vecs, user_vec.T).flatten()
    top_idx = np.argsort(scores)[::-1][:n]

    inv_movie2idx = {v: k for k, v in movie2idx.items()}
    top_movie_ids = [inv_movie2idx[i] for i in top_idx]

    titles = movies_with_genre.set_index("movieId").loc[top_movie_ids, "title"].values
    return {titles[i]: float(scores[top_idx][i]) for i in range(n)}

def get_similar_movies(movie_name, n=10):
    if movie_name not in movies_with_genre['title'].values:
        return {}
    movie_id = movies_with_genre.loc[movies_with_genre['title'] == movie_name, 'movieId'].values[0]
    if movie_id not in svd_preds.columns:
        return {}
    movie_idx = svd_preds.columns.get_loc(movie_id)
    movie_vec = svd_model.components_[:, movie_idx].reshape(1, -1)
    sims = cosine_similarity(movie_vec, svd_model.components_.T).flatten()
    top_idx = sims.argsort()[::-1][1:n+1]
    sim_movie_ids = svd_preds.columns[top_idx]
    titles = movies_with_genre.set_index("movieId").loc[sim_movie_ids, "title"].values
    return {titles[i]: float(sims[top_idx][i]) for i in range(n)}

# ===============================
# Gradio Interface
# ===============================
with gr.Blocks() as demo:
    gr.Markdown("## 🎬 Movie Recommender System")

    with gr.Tab("Popular Movies"):
        n_pop = gr.Slider(5, 20, 10, label="Number of movies")
        btn_pop = gr.Button("Show Popular")
        output_pop = gr.JSON()
        btn_pop.click(recommend_popular, inputs=n_pop, outputs=output_pop)

    with gr.Tab("Two-Tower Recommendations"):
        user_id = gr.Number(value=1, label="Enter User ID")
        n_rec = gr.Slider(5, 20, 10, label="Number of movies")
        btn_tower = gr.Button("Recommend")
        output_tower = gr.JSON()
        btn_tower.click(recommend_two_tower, inputs=[user_id, n_rec], outputs=output_tower)

    with gr.Tab("Movie Similarity (SVD)"):
        movie_name = gr.Textbox("Enter Movie Name (exact match)")
        n_sim = gr.Slider(5, 20, 10, label="Number of similar movies")
        btn_sim = gr.Button("Find Similar Movies")
        output_sim = gr.JSON()
        btn_sim.click(get_similar_movies, inputs=[movie_name, n_sim], outputs=output_sim)

demo.launch(share=True)
