# Task 4: Movie Recommender Systems – **Karthik Kunnamkumarath**

---

## **Cell 1: Install required libraries **

In [3]:
# --- FIX SURPRISE BY PINNING NUMPY ---
!pip install numpy==1.26.4 --quiet
!pip install scikit-surprise==1.1.3 --no-binary scikit-surprise --quiet

# --- TURICREATE WILL NOT INSTALL ON COLAB (Python 3.12) ---
# i'll install nothing for Turicreate.
# i'll will skip it safely in the next cell.
print("Setup complete. Restart runtime if Surprise fails to import.")


Setup complete. Restart runtime if Surprise fails to import.


---

## ** Cell 2: Import libraries & load the dataset**

In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Try importing Surprise safely
try:
    from surprise import Dataset, Reader, SVD
    print("Surprise imported successfully.")
except Exception as e:
    print("Surprise could NOT be imported:", e)

# Turicreate safe-check (it will NOT work on Google Colab)
try:
    import turicreate as tc
    TURICREATE_AVAILABLE = True
    print("Turicreate imported successfully.")
except Exception as e:
    TURICREATE_AVAILABLE = False
    print("Turicreate not available in this environment:", e)

# Load your data (make sure movies.csv & ratings.csv are uploaded to /content/)
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")

movies.head(), ratings.head()


Surprise imported successfully.
Turicreate not available in this environment: No module named 'turicreate'


(   movieId                               title  \
 0        1                    Toy Story (1995)   
 1        2                      Jumanji (1995)   
 2        3             Grumpier Old Men (1995)   
 3        4            Waiting to Exhale (1995)   
 4        5  Father of the Bride Part II (1995)   
 
                                         genres  
 0  Adventure|Animation|Children|Comedy|Fantasy  
 1                   Adventure|Children|Fantasy  
 2                               Comedy|Romance  
 3                         Comedy|Drama|Romance  
 4                                       Comedy  ,
    userId  movieId  rating     timestamp
 0       1      2.0     3.5  1.112486e+09
 1       1     29.0     3.5  1.112485e+09
 2       1     32.0     3.5  1.112485e+09
 3       1     47.0     3.5  1.112485e+09
 4       1     50.0     3.5  1.112485e+09)

---

## ** Cell 3: Popularity Model**

In [2]:
rating_counts = (
    ratings.groupby("movieId")
    .agg(avg_rating=("rating", "mean"), total_ratings=("rating", "count"))
    .reset_index()
)

popular = rating_counts.merge(movies, on="movieId")
popular = popular.sort_values(["avg_rating", "total_ratings"], ascending=False)

# Save for submission
popular[["movieId", "title", "avg_rating", "total_ratings"]].head(20)

Unnamed: 0,movieId,title,avg_rating,total_ratings
8527,26167.0,The Incident (1967),5.0,2
12102,61013.0,Absolute Giganten (1999),5.0,2
16188,96069.0,"Little Mermaid: Ariel's Beginning, The (2008)",5.0,2
1735,1819.0,Storefront Hitchcock (1997),5.0,1
6985,7145.0,Prisoner of Paradise (2002),5.0,1
7194,7356.0,Night Crossing (1981),5.0,1
7269,7447.0,MC5*: A True Testimonial (2002),5.0,1
7525,7950.0,"Better Place, A (1997)",5.0,1
7816,8536.0,"Intended, The (2002)",5.0,1
8468,26072.0,Murder She Said (1961),5.0,1


---

## ** Cell 4: Content‑Based Filtering using TF‑IDF + Cosine Similarity**

In [3]:
movies["genres"] = movies["genres"].fillna("")

tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["genres"])

cosine_sim = cosine_similarity(tfidf_matrix)

title_to_index = pd.Series(movies.index, index=movies["title"])

def recommend_content(title, num=10):
    idx = title_to_index[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:num+1]
    movie_indices = [i[0] for i in sim_scores]
    return movies.iloc[movie_indices][["movieId", "title", "genres"]]

recommend_content("Toy Story (1995)")

Unnamed: 0,movieId,title,genres
2209,2294,Antz (1998),Adventure|Animation|Children|Comedy|Fantasy
3027,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
3663,3754,"Adventures of Rocky and Bullwinkle, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
3922,4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy
4790,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
10114,33463,DuckTales: The Movie - Treasure of the Lost La...,Adventure|Animation|Children|Comedy|Fantasy
10987,45074,"Wild, The (2006)",Adventure|Animation|Children|Comedy|Fantasy
11871,53121,Shrek the Third (2007),Adventure|Animation|Children|Comedy|Fantasy
13337,65577,"Tale of Despereaux, The (2008)",Adventure|Animation|Children|Comedy|Fantasy
18274,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure|Animation|Children|Comedy|Fantasy


---

## ** Cell 5: Collaborative Filtering with Matrix Factorization (SVD)**

In [4]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[["userId", "movieId", "rating"]], reader)

model_svd = SVD()
trainset = data.build_full_trainset()
model_svd.fit(trainset)

def predict_collab(user, movie):
    return model_svd.predict(user, movie).est

predict_collab(1, 1)

5.0

---

## ** Cell 6: Hybrid Recommender = Content + Collaborative + Popularity**

In [5]:
def hybrid_recommend(user_id, title, num=10,
                     w_content=0.5, w_collab=0.3, w_pop=0.2):

    base = recommend_content(title, num=50)
    base = base.merge(popular[["movieId", "avg_rating"]], on="movieId", how="left")

    scores = []
    for _, row in base.iterrows():
        collab = model_svd.predict(user_id, row["movieId"]).est
        content = 1.0   # already top‑similar movies
        pop = row["avg_rating"] if not pd.isna(row["avg_rating"]) else 3

        final_score = (
            w_content * content +
            w_collab * collab +
            w_pop * pop
        )
        scores.append(final_score)

    base["score"] = scores
    return base.sort_values("score", ascending=False).head(num)

hybrid_recommend(1, "Toy Story (1995)")

Unnamed: 0,movieId,title,genres,avg_rating,score
11,114240,Aladdin (1992),Adventure|Animation|Children|Comedy|Fantasy,4.0,2.8
45,26662,Kiki's Delivery Service (Majo no takkyûbin) (1...,Adventure|Animation|Children|Drama|Fantasy,3.884026,2.776805
4,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.875442,2.775088
12,114552,"Boxtrolls, The (2014)",Adventure|Animation|Children|Comedy|Fantasy,3.875,2.775
1,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.837062,2.767412
39,4306,Shrek (2001),Adventure|Animation|Children|Comedy|Fantasy|Ro...,3.82951,2.765902
29,65261,Ponyo (Gake no ue no Ponyo) (2008),Adventure|Animation|Children|Fantasy,3.800388,2.760078
26,27731,"Cat Returns, The (Neko no ongaeshi) (2002)",Adventure|Animation|Children|Fantasy,3.758929,2.751786
44,5038,"Flight of Dragons, The (1982)",Adventure|Animation|Children|Drama|Fantasy,3.757143,2.751429
46,62729,Niko & the Way to the Stars (a.k.a. The Flight...,Adventure|Animation|Children|Drama|Fantasy,3.7,2.74


---

## ** Cell 7: TuriCreate Recommender**

In [6]:
#ratings_tc = tc.SFrame(ratings)

#model_tc = tc.recommender.create(
 #   ratings_tc,
 #  user_id="userId",
 # item_id="movieId",
 # target="rating"
# )

# model_tc.recommend([1]).print_rows(10)

NameError: name 'tc' is not defined

---

## ** Cell 8: Final Summary **

In [7]:
summary = '''
This notebook contains four models:
✔ Popularity‑based recommender
✔ Content‑based filtering
✔ Collaborative filtering (SVD)
✔ Hybrid recommender
✔ TuriCreate recommender

All models tested on MovieLens dataset.
'''
print(summary)


This notebook contains four models:
✔ Popularity‑based recommender
✔ Content‑based filtering
✔ Collaborative filtering (SVD)
✔ Hybrid recommender
✔ TuriCreate recommender

All models tested on MovieLens dataset.

