<a href="https://colab.research.google.com/github/Emaan10/Elevvo-ML.internshipTasks/blob/main/Movie%20Recommendation%20System/Elevvo_Task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

In [None]:
import os
os.environ['KAGGLE_USERNAME'] = "emaankhaleeq"
os.environ['KAGGLE_KEY'] = "a57bf46b5a7ebe66c37630bffe31992f"
!kaggle datasets list -s movielens

ref                                                             title                                                  size  lastUpdated                 downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  -----------------------------------------------  ----------  --------------------------  -------------  ---------  ---------------  
grouplens/movielens-20m-dataset                                 MovieLens 20M Dataset                             204953792  2018-08-15 23:09:34.430000          82896        667  0.7058824        
sherinclaudia/movielens                                         Movielens                                           6109178  2019-01-21 13:20:43.830000           7347         18  0.8125           
prajitdatta/movielens-100k-dataset                              MovieLens 100K Dataset                              4998818  2017-01-05 12:37:32.947000          33019        220  0.625            
ayushimishra280

In [None]:
# Download the MovieLens 100K archive from Kaggle into the working directory.
!kaggle datasets download -d prajitdatta/movielens-100k-dataset
# Extract the archive into ./movielens; -o overwrites existing files without prompting.
!unzip -o movielens-100k-dataset.zip -d movielens

Dataset URL: https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset
License(s): CC0-1.0
movielens-100k-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  movielens-100k-dataset.zip
  inflating: movielens/ml-100k/README  
  inflating: movielens/ml-100k/allbut.pl  
  inflating: movielens/ml-100k/mku.sh  
  inflating: movielens/ml-100k/u.data  
  inflating: movielens/ml-100k/u.genre  
  inflating: movielens/ml-100k/u.info  
  inflating: movielens/ml-100k/u.item  
  inflating: movielens/ml-100k/u.occupation  
  inflating: movielens/ml-100k/u.user  
  inflating: movielens/ml-100k/u1.base  
  inflating: movielens/ml-100k/u1.test  
  inflating: movielens/ml-100k/u2.base  
  inflating: movielens/ml-100k/u2.test  
  inflating: movielens/ml-100k/u3.base  
  inflating: movielens/ml-100k/u3.test  
  inflating: movielens/ml-100k/u4.base  
  inflating: movielens/ml-100k/u4.test  
  inflating: movielens/ml-100k/u5.base  
  inflating: m

In [None]:
# Ratings file: tab-separated, column names supplied here. The timestamp
# column is read and then discarded because nothing below uses it.
ratings = pd.read_csv(
    "movielens/ml-100k/u.data",
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
).drop(columns="timestamp")

# Item file: pipe-separated, latin-1 encoded. Names are supplied for 24
# columns, but only the first two (movie id and title) are loaded.
item_columns = ["movieId", "title", *(f"col{i}" for i in range(2, 24))]
movies = pd.read_csv(
    "movielens/ml-100k/u.item",
    sep="|",
    names=item_columns,
    usecols=[0, 1],
    encoding="latin-1",
)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


Train/Test Split

In [None]:
# Hold out 20% of the ratings for evaluation; the fixed seed keeps the split reproducible.
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Cold-start filter: keep only test rows whose user AND movie both occur in
# the training set, since the recommenders below are built from train only.
seen_in_train = (
    test["userId"].isin(train["userId"].unique())
    & test["movieId"].isin(train["movieId"].unique())
)
test = test.loc[seen_in_train]

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (80000, 3)
Test shape: (19969, 3)


User-Item Matrix (Train only)

In [None]:
# Dense users x movies table built from the training ratings only.
# Missing (user, movie) combinations are filled with 0, which the rest of the
# notebook treats as "not rated".
user_item_matrix = (
    train
    .pivot_table(values="rating", index="userId", columns="movieId")
    .fillna(0)
)

# Row / column labels, plus a sparse copy of the same values.
user_ids, movie_ids = user_item_matrix.index, user_item_matrix.columns
R = csr_matrix(user_item_matrix.to_numpy())

print("User-item matrix shape:", user_item_matrix.shape)

User-item matrix shape: (943, 1653)


Similarity Matrices

In [None]:
def _similarity_frame(vectors, labels):
    """Return the pairwise cosine similarity of the rows of ``vectors``.

    Gives back both the raw array and the same values wrapped in a DataFrame
    whose index and columns are ``labels``.
    """
    sims = cosine_similarity(vectors)
    return sims, pd.DataFrame(sims, index=labels, columns=labels)


# Users are the rows of the matrix; transposing makes movies the rows instead.
user_similarity, user_sim_df = _similarity_frame(user_item_matrix, user_ids)
item_similarity, item_sim_df = _similarity_frame(user_item_matrix.T, movie_ids)

Recommendation Functions

In [None]:
def recommend_user_based(user_id, k=5, n_neighbors=20):
    """Recommend up to ``k`` unseen movies using user-based collaborative filtering.

    Each candidate movie is scored with a similarity-weighted *sum* of the
    ratings given by the ``n_neighbors`` users most similar to ``user_id``
    (the sum is not normalised, so movies rated by many neighbours score higher).

    Reads the notebook-level ``user_item_matrix`` (users x movies, 0 = unrated)
    and ``user_sim_df`` (user x user similarity).

    Parameters
    ----------
    user_id : label in ``user_item_matrix.index``
    k : int, number of recommendations to return (default 5)
    n_neighbors : int, number of most-similar users to consult (default 20)

    Returns
    -------
    list of movie ids, best first; ``[]`` if ``user_id`` is not in the matrix.
    """
    if user_id not in user_item_matrix.index:
        return []
    own_ratings = user_item_matrix.loc[user_id]
    seen_movies = set(own_ratings[own_ratings > 0].index)

    # Most similar users, excluding the user themselves.
    neighbour_ids = (
        user_sim_df[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(n_neighbors)
        .index
    )
    weights = user_sim_df.loc[user_id, neighbour_ids]

    scores = {}
    for other, weight in weights.items():
        other_ratings = user_item_matrix.loc[other]
        # Only walk the movies this neighbour actually rated instead of every
        # column of the matrix.
        for movie, rating in other_ratings[other_ratings > 0].items():
            if movie not in seen_movies:
                scores[movie] = scores.get(movie, 0) + weight * rating

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [movie for movie, _ in ranked[:k]]

def recommend_item_based(user_id, k=5, n_neighbors=20):
    """Recommend up to ``k`` unseen movies using item-based collaborative filtering.

    For every movie the user has rated, the ``n_neighbors`` most similar movies
    are looked up in ``item_sim_df``; each unseen one accumulates
    ``rating * similarity``. Movies the user already rated are skipped, so the
    number of neighbours that actually contribute can be smaller than
    ``n_neighbors``.

    Reads the notebook-level ``user_item_matrix`` (users x movies, 0 = unrated)
    and ``item_sim_df`` (movie x movie similarity).

    Parameters
    ----------
    user_id : label in ``user_item_matrix.index``
    k : int, number of recommendations to return (default 5)
    n_neighbors : int, similar movies considered per rated movie (default 20)

    Returns
    -------
    list of movie ids, best first; ``[]`` if ``user_id`` is not in the matrix.
    """
    if user_id not in user_item_matrix.index:
        return []
    user_ratings = user_item_matrix.loc[user_id]
    rated = user_ratings[user_ratings > 0]
    seen_movies = set(rated.index)

    scores = {}
    for movie, rating in rated.items():
        neighbour_ids = (
            item_sim_df[movie]
            .sort_values(ascending=False)
            .head(n_neighbors)
            .index
        )
        # One row lookup per rated movie instead of a scalar .loc call for
        # every (movie, neighbour) pair.
        weights = item_sim_df.loc[movie, neighbour_ids]
        for candidate, weight in weights.items():
            if candidate not in seen_movies:
                scores[candidate] = scores.get(candidate, 0) + rating * weight

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return [movie for movie, _ in ranked[:k]]

# n_components -> (matrix the model was fitted on, reconstructed score matrix).
_SVD_CACHE = {}


def _svd_reconstruction(n_components=20):
    """Return the low-rank reconstruction of ``R``, fitting the SVD at most once.

    The result is cached per ``n_components``. The cache entry is discarded
    when the notebook-level ``R`` has been replaced by a new object, so
    rebuilding the user-item matrix never serves stale scores.
    """
    cached = _SVD_CACHE.get(n_components)
    if cached is None or cached[0] is not R:
        svd = TruncatedSVD(n_components=n_components, random_state=42)
        latent_matrix = svd.fit_transform(R)
        cached = (R, np.dot(latent_matrix, svd.components_))
        _SVD_CACHE[n_components] = cached
    return cached[1]


def recommend_svd(user_id, k=5, n_components=20):
    """Recommend up to ``k`` unseen movies from a truncated-SVD reconstruction.

    The sparse matrix ``R`` is factorised with ``TruncatedSVD`` and multiplied
    back into a full users x movies score matrix; the user's row is ranked
    after removing movies they already rated. The factorisation does not depend
    on the user, so it is computed once and reused across calls instead of
    being refitted for every user.

    Reads the notebook-level ``user_item_matrix``, ``R`` and ``movie_ids``.

    Parameters
    ----------
    user_id : label in ``user_item_matrix.index``
    k : int, number of recommendations to return (default 5)
    n_components : int, number of latent factors (default 20)

    Returns
    -------
    list of movie ids, best first; ``[]`` if ``user_id`` is not in the matrix.
    """
    if user_id not in user_item_matrix.index:
        return []
    user_index = user_item_matrix.index.get_loc(user_id)
    scores = _svd_reconstruction(n_components)[user_index]

    user_ratings = user_item_matrix.loc[user_id]
    seen_movies = set(user_ratings[user_ratings > 0].index)
    movie_score_pairs = [
        (movie_ids[i], score)
        for i, score in enumerate(scores)
        if movie_ids[i] not in seen_movies
    ]
    ranked = sorted(movie_score_pairs, key=lambda pair: pair[1], reverse=True)
    return [movie for movie, _ in ranked[:k]]

 Precision@K

In [None]:
def precision_at_k(user_id, rec_movie_ids, test_df, k=5, threshold=3):
    """Precision@k of a recommendation list against held-out ratings.

    A movie is relevant when ``test_df`` contains a rating by ``user_id`` with
    ``rating >= threshold``. Only the first ``k`` recommendations are scored,
    so a longer list can never push the result above 1. A list shorter than
    ``k`` is still divided by ``k``.

    Parameters
    ----------
    user_id : value matched against ``test_df["userId"]``
    rec_movie_ids : iterable of recommended movie ids, best first
    test_df : DataFrame with ``userId``, ``movieId`` and ``rating`` columns
    k : int, cut-off rank; must be positive (default 5)
    threshold : minimum rating counted as relevant (default 3)

    Returns
    -------
    float in [0, 1], or ``None`` when the user has no relevant test items.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")
    is_relevant = (test_df["userId"] == user_id) & (test_df["rating"] >= threshold)
    actual = set(test_df.loc[is_relevant, "movieId"])
    if not actual:  # no relevant items in test
        return None
    # Score the top-k only; the original counted hits over the whole list.
    top_k = list(rec_movie_ids)[:k]
    hits = sum(1 for movie in top_k if movie in actual)
    return hits / k

 Evaluation

In [None]:
results = []

# Build the movieId -> title lookup once. The original rebuilt
# movies.set_index("movieId") twice for every recommended movie in the loop.
title_by_movie = movies.set_index("movieId")["title"]


def _to_titles(rec_ids):
    """Map recommended movie ids to titles, skipping ids with no known title."""
    return [title_by_movie.loc[m] for m in rec_ids if m in title_by_movie.index]


for user_id in test["userId"].unique()[:50]:  # first 50 users for demo
    rec_user = recommend_user_based(user_id, k=5)
    rec_item = recommend_item_based(user_id, k=5)
    rec_svd = recommend_svd(user_id, k=5)

    # precision_at_k returns None for users with no relevant test items.
    p_user = precision_at_k(user_id, rec_user, test, k=5)
    p_item = precision_at_k(user_id, rec_item, test, k=5)
    p_svd = precision_at_k(user_id, rec_svd, test, k=5)

    results.append([
        user_id,
        _to_titles(rec_user),
        p_user,
        _to_titles(rec_item),
        p_item,
        _to_titles(rec_svd),
        p_svd
    ])

results_df = pd.DataFrame(results, columns=[
    "userId",
    "User-based Recommendations", "User-based Precision@5",
    "Item-based Recommendations", "Item-based Precision@5",
    "SVD Recommendations", "SVD Precision@5"
])

print("\nPrecision@5 Comparison Table (first 10 users):")
print(results_df.head(10))

# nanmean ignores NaN entries when averaging each precision column.
print("\nMean Precision@5:")
print("User-based:", np.nanmean(results_df["User-based Precision@5"]))
print("Item-based:", np.nanmean(results_df["Item-based Precision@5"]))
print("SVD:", np.nanmean(results_df["SVD Precision@5"]))


Precision@5 Comparison Table (first 10 users):
   userId                         User-based Recommendations  \
0     877  [Pulp Fiction (1994), Star Wars (1977), Raider...   
1     815  [Silence of the Lambs, The (1991), Jaws (1975)...   
2      94  [Empire Strikes Back, The (1980), Shawshank Re...   
3     416  [Usual Suspects, The (1995), Star Wars (1977),...   
4     500  [Silence of the Lambs, The (1991), Fargo (1996...   
5     259  [Fargo (1996), Pulp Fiction (1994), Reservoir ...   
6     598  [Titanic (1997), English Patient, The (1996), ...   
7     886  [Princess Bride, The (1987), Empire Strikes Ba...   
8     837  [Fargo (1996), Independence Day (ID4) (1996), ...   
9     521  [Braveheart (1995), Silence of the Lambs, The ...   

   User-based Precision@5                         Item-based Recommendations  \
0                     0.2  [Raiders of the Lost Ark (1981), Indiana Jones...   
1                     0.2  [Silence of the Lambs, The (1991), Pulp Fictio...   
2      