In [2]:
!pip install scikit-learn
!pip install scikit-surprise




In [4]:
# Step 1: Import libraries
import pandas as pd
import numpy as np


# Step 2: Load CSV files using absolute path
movies = pd.read_csv('/Users/franckvenance/Desktop/ml-20m/movies.csv')
ratings = pd.read_csv('/Users/franckvenance/Desktop/ml-20m/ratings.csv')
tags = pd.read_csv('/Users/franckvenance/Desktop/ml-20m/tags.csv')

# Step 3: Preview the data
print("Movies:")
display(movies.head())

print("Ratings:")
display(ratings.head())

print("Tags:")
display(tags.head())


Movies:


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Ratings:


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


Tags:


Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [6]:
# Step 2: Popularity-Based Recommender

# 1. Group by movieId to get count and average rating
movie_stats = ratings.groupby('movieId').agg({
    'rating': ['count', 'mean']
}).reset_index()

# Rename columns
movie_stats.columns = ['movieId', 'num_ratings', 'avg_rating']

# 2. Merge with movie titles
popular_movies = pd.merge(movie_stats, movies, on='movieId')

# 3. Sort by number of ratings (descending)
top_movies = popular_movies.sort_values(by='num_ratings', ascending=False)

# 4. Display top 10
top_movies[['title', 'num_ratings', 'avg_rating']].head(10)


Unnamed: 0,title,num_ratings,avg_rating
293,Pulp Fiction (1994),67310,4.174231
352,Forrest Gump (1994),66172,4.029
315,"Shawshank Redemption, The (1994)",63366,4.44699
587,"Silence of the Lambs, The (1991)",63299,4.177057
476,Jurassic Park (1993),59715,3.664741
257,Star Wars: Episode IV - A New Hope (1977),54502,4.190672
108,Braveheart (1995),53769,4.042534
583,Terminator 2: Judgment Day (1991),52244,3.931954
2486,"Matrix, The (1999)",51334,4.187186
523,Schindler's List (1993),50054,4.310175


In [8]:
# Step 1: Merge tags into a single string per movie
# Convert non-string tags (e.g., NaN/float) to empty strings before joining
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x.dropna().astype(str))).reset_index()

# Step 2: Merge with movies to get titles + genres + tags
movies_with_tags = pd.merge(movies, tags_grouped, on='movieId', how='left')

# Step 3: Fill missing tag fields with empty strings
movies_with_tags['tag'] = movies_with_tags['tag'].fillna('')

# Step 4: Combine genres and tags into one "metadata" field
movies_with_tags['metadata'] = movies_with_tags['genres'] + ' ' + movies_with_tags['tag']

# Preview
movies_with_tags[['title', 'metadata']].head()


Unnamed: 0,title,metadata
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy Wa...
1,Jumanji (1995),Adventure|Children|Fantasy time travel adapted...
2,Grumpier Old Men (1995),Comedy|Romance old people that is actually fun...
3,Waiting to Exhale (1995),Comedy|Drama|Romance chick flick revenge chara...
4,Father of the Bride Part II (1995),Comedy Diane Keaton family sequel Steve Martin...


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the metadata column
tfidf_matrix = tfidf.fit_transform(movies_with_tags['metadata'])

# Shape of the resulting matrix
print("TF-IDF matrix shape:", tfidf_matrix.shape)


TF-IDF matrix shape: (27278, 23865)


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between all movies
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Check shape
print("Cosine similarity matrix shape:", cosine_sim.shape)


Cosine similarity matrix shape: (27278, 27278)


In [13]:
def normalize_title(title):
    if ", The" in title:
        title = "The " + title.replace(", The", "")
    return title.strip()

# Create a normalized title column
movies_with_tags['title_normalized'] = movies_with_tags['title'].apply(normalize_title)


In [14]:
# Use the normalized title as the lookup index
indices = pd.Series(movies_with_tags.index, index=movies_with_tags['title_normalized']).drop_duplicates()


In [18]:
def recommend_similar_movies(title, n=10):
    title = title.strip()
    if title not in indices:
        return f"'{title}' not found in the movie dataset."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    
    return movies_with_tags[['title_normalized', 'genres']].iloc[movie_indices]


In [20]:
# Show movies that contain "Matrix" to find the correct title
movies_with_tags[movies_with_tags['title'].str.contains("Matrix", case=False, na=False)][['title']].head(10)


Unnamed: 0,title
2486,"Matrix, The (1999)"
6260,"Matrix Reloaded, The (2003)"
6822,"Matrix Revolutions, The (2003)"
9417,"Animatrix, The (2003)"


In [22]:
def normalize_title(title):
    if ", The" in title:
        title = "The " + title.replace(", The", "")
    return title.strip()

# Create a normalized title column
movies_with_tags['title_normalized'] = movies_with_tags['title'].apply(normalize_title)


In [24]:
recommend_similar_movies("The Matrix (1999)", 10)


Unnamed: 0,title_normalized,genres
6260,The Matrix Reloaded (2003),Action|Adventure|Sci-Fi|Thriller|IMAX
6822,The Matrix Revolutions (2003),Action|Adventure|Sci-Fi|Thriller|IMAX
6680,Avalon (2001),Drama|Fantasy|Sci-Fi
2021,Tron (1982),Action|Adventure|Sci-Fi
196,Strange Days (1995),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
2586,The Thirteenth Floor (1999),Drama|Sci-Fi|Thriller
2515,eXistenZ (1999),Action|Sci-Fi|Thriller
1684,Dark City (1998),Adventure|Film-Noir|Sci-Fi|Thriller
27260,Parallels (2015),Sci-Fi
22287,Captive Women (1000 Years from Now) (3000 A.D....,Sci-Fi


In [26]:
print("The Matrix (1999)" in movies_with_tags['title_normalized'].values)


True


In [28]:
# ⚠️ Note: Due to resource constraints, this collaborative filtering model is demonstrated using 1% of the MovieLens 20M dataset. The full pipeline remains compatible with the complete dataset.”
# # # Collaborative Filtering (with scikit-surprise)
# # This model recommends movies based on user ratings behavior, rather than content. It finds users similar to you, and recommends what they liked.

In [30]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Define format: userId, itemId, rating
reader = Reader(rating_scale=(0.5, 5.0))

# # Load dataset from pandas
# data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# # Train-test split
# trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
# ✅ NEW: Sample 1% of the data for faster training/testing
small_data = ratings.sample(frac=0.01, random_state=42)  # ~200,000 rows
data = Dataset.load_from_df(small_data[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)


In [32]:
# Matrix Factorization with SVD

In [34]:
from surprise import SVD
from surprise import accuracy

# Initialize the SVD algorithm
svd = SVD()

# Train the model on the training set
svd.fit(trainset)

# Predict ratings on the test set
predictions = svd.test(testset)

# Evaluate model performance
print("RMSE:", accuracy.rmse(predictions))

# Predict a single rating
pred = svd.predict(uid=1, iid=1)
print("Predicted Rating:", pred.est)


RMSE: 0.9458
RMSE: 0.9458496494770156
Predicted Rating: 4.107380071186034


In [36]:
# Predict a rating for user 1 on movie 1
pred = svd.predict(uid=1, iid=1)
print("Predicted Rating:", pred.est)


Predicted Rating: 4.107380071186034


In [None]:
# Hybrid Model
# Hybrid Function

In [38]:
def hybrid_recommender(user_id, movie_title, top_n=10):
    # Clean title (in case it's normalized)
    movie_title = movie_title.strip()
    
    # Check if movie exists in content-based index
    if movie_title not in indices:
        return f"'{movie_title}' not found in the dataset."

    # Step 1: Get content-similar movies
    idx = indices[movie_title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:50]  # Limit for performance
    
    # Step 2: Predict ratings for similar movies
    hybrid_results = []
    for movie_idx, sim_score in sim_scores:
        movie_id = movies_with_tags.iloc[movie_idx]['movieId']
        pred = svd.predict(user_id, movie_id)
        hybrid_results.append((movie_id, pred.est))

    # Step 3: Sort by predicted rating
    hybrid_results.sort(key=lambda x: x[1], reverse=True)
    top_movie_ids = [movie[0] for movie in hybrid_results[:top_n]]

    # Step 4: Return titles
    return movies[movies['movieId'].isin(top_movie_ids)][['title', 'genres']]


In [40]:
# Try a hybrid recommendation
hybrid_recommender(user_id=1, movie_title="The Matrix (1999)", top_n=10)


Unnamed: 0,title,genres
196,Strange Days (1995),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
537,Blade Runner (1982),Action|Sci-Fi|Thriller
1684,Dark City (1998),Adventure|Film-Noir|Sci-Fi|Thriller
2586,"Thirteenth Floor, The (1999)",Drama|Sci-Fi|Thriller
3612,"Road Warrior, The (Mad Max 2) (1981)",Action|Adventure|Sci-Fi
6392,28 Days Later (2002),Action|Horror|Sci-Fi
12261,I Am Legend (2007),Action|Horror|Sci-Fi|Thriller|IMAX
12746,WALL·E (2008),Adventure|Animation|Children|Romance|Sci-Fi
15534,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
23680,Dawn of the Planet of the Apes (2014),Sci-Fi
