<a href="https://colab.research.google.com/github/EmiljaB/NLP_Projects/blob/Movies_Recomender/Movie_Recommenders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Build a Movie Recommender system with the following methods:
### Emilja Beneja
#### Popularity
#### Content Filter
#### Collaborative Filter
#### Matrix Factorization
*NOTE: I used the 1M dataset because the runtime would disconnect even when switched to GPU, it reached the RAM limit.*

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

### Load and Preprocess Data

In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip -d ./ml-1m/

--2024-11-16 21:30:29--  https://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2024-11-16 21:30:29 (18.8 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ./ml-1m/ml-1m/
  inflating: ./ml-1m/ml-1m/movies.dat  
  inflating: ./ml-1m/ml-1m/ratings.dat  
  inflating: ./ml-1m/ml-1m/README    
  inflating: ./ml-1m/ml-1m/users.dat  


In [None]:
import pandas as pd

# Load datasets
ratings_file = './ml-1m/ml-1m/ratings.dat'
movies_file = './ml-1m/ml-1m/movies.dat'
ratings = pd.read_csv(ratings_file, sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'], encoding='latin1')
movies = pd.read_csv(movies_file, sep='::', engine='python', names=['movieId', 'title', 'genres'], encoding='latin1')

# Create user-movie rating matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Align movies in both datasets
common_movie_ids = user_movie_matrix.columns.intersection(movies['movieId'])

# Filter movies and user_movie_matrix
movies_filtered = movies[movies['movieId'].isin(common_movie_ids)].reset_index(drop=True)
user_movie_matrix = user_movie_matrix[common_movie_ids]

print(f"Number of common movies: {len(common_movie_ids)}")


Number of common movies: 3706


### Create Metadata for Content Filtering

In [None]:
# Create metadata by merging movies and tags (simulated since 1M dataset lacks tags.csv)
movies['metadata'] = movies['genres']  # Use genres as metadata

print("\nMetadata for Content Filtering:")
print(movies.head())



Metadata for Content Filtering:
   movieId                               title                        genres  \
0        1                    Toy Story (1995)   Animation|Children's|Comedy   
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2        3             Grumpier Old Men (1995)                Comedy|Romance   
3        4            Waiting to Exhale (1995)                  Comedy|Drama   
4        5  Father of the Bride Part II (1995)                        Comedy   

                       metadata  
0   Animation|Children's|Comedy  
1  Adventure|Children's|Fantasy  
2                Comedy|Romance  
3                  Comedy|Drama  
4                        Comedy  


### Popularity-Based Recommender

In [None]:
# Popularity model: top movies by average rating
popularity_model = ratings.groupby('movieId')['rating'].mean().sort_values(ascending=False).reset_index()

# Merge with movie titles
popularity_model = pd.merge(popularity_model, movies[['movieId', 'title']], on='movieId')
popularity_model.columns = ['movieId', 'avg_rating', 'title']

# Top 10 popular movies
print("\nTop 10 Popular Movies:")
print(popularity_model.head(10))



Top 10 Popular Movies:
   movieId  avg_rating                                      title
0      989         5.0  Schlafes Bruder (Brother of Sleep) (1995)
1     3881         5.0                   Bittersweet Motel (2000)
2     1830         5.0                    Follow the Bitch (1998)
3     3382         5.0                     Song of Freedom (1936)
4      787         5.0         Gate of Heavenly Peace, The (1995)
5     3280         5.0                           Baby, The (1973)
6     3607         5.0                   One Little Indian (1973)
7     3233         5.0                       Smashing Time (1967)
8     3172         5.0                    Ulysses (Ulisse) (1954)
9     3656         5.0                               Lured (1947)


### Content-Based Filtering with TF-IDF and Truncated SVD

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Fill missing metadata with empty strings
movies['metadata'] = movies['metadata'].fillna('')

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['metadata'])

# Dimensionality reduction using Truncated SVD
svd = TruncatedSVD(n_components=10, random_state=42)
latent_matrix_1 = svd.fit_transform(tfidf_matrix)

print("\nLatent Matrix 1 Shape (Content-Based):", latent_matrix_1.shape)




Latent Matrix 1 Shape (Content-Based): (3883, 10)


### Collaborative Filtering on User-Movie Matrix

In [None]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

# Create user-movie matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Convert to sparse matrix to save memory
sparse_matrix = csr_matrix(user_movie_matrix.values)

# Dimensionality reduction using Truncated SVD
svd = TruncatedSVD(n_components=50, random_state=42)
latent_matrix_2 = svd.fit_transform(sparse_matrix)

print("\nLatent Matrix 2 Shape (Collaborative-Based):", latent_matrix_2.shape)



Latent Matrix 2 Shape (Collaborative-Based): (6040, 50)


### Matrix Factorization with scikit-learn

In [None]:
!pip install scikit-learn




In [None]:
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Create user-movie rating matrix
user_movie_matrix = ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Convert the DataFrame to a numpy array
X = user_movie_matrix.values

# Apply Truncated SVD
svd = TruncatedSVD(n_components=50, random_state=42)
matrix_factorization = svd.fit_transform(X)

print("\nMatrix Factorization Shape:", matrix_factorization.shape)



Matrix Factorization Shape: (6040, 50)


### Recommending Movies using Matrix Factorization

In [None]:
import numpy as np

# Function to recommend movies to a user based on predicted ratings
def recommend_movies(user_id, n_recommendations=5):
    user_index = user_movie_matrix.index.get_loc(user_id)
    user_ratings = X_reconstructed[user_index]

    # Get indices of movies the user hasn't rated yet
    rated_movies = np.where(user_movie_matrix.iloc[user_index].values > 0)[0]  # Use numpy where
    unrated_movies = np.setdiff1d(np.arange(user_movie_matrix.shape[1]), rated_movies)

    # Get predicted ratings for unrated movies
    predicted_ratings = user_ratings[unrated_movies]

    # Get top N recommendations
    top_indices = predicted_ratings.argsort()[::-1][:n_recommendations]
    movie_indices = unrated_movies[top_indices]

    # Get movie IDs and titles
    recommended_movie_ids = user_movie_matrix.columns[movie_indices]
    recommended_movies = movies[movies['movieId'].isin(recommended_movie_ids)]

    return recommended_movies['title'].values

# Example usage
print("\nRecommendations for User 1:")
print(recommend_movies(1))



Recommendations for User 1:
['Babe (1995)' 'Shawshank Redemption, The (1994)' 'Lion King, The (1994)'
 'Fantasia (1940)' 'Little Mermaid, The (1989)']


### Hybrid Model
Combine the latent features from both content-based filtering and collaborative filtering to create a hybrid model.

##### Content-Based Latent Matrix

In [None]:
# Use metadata for aligned movies
movies_filtered['metadata'] = movies_filtered['title'] + ' ' + movies_filtered['genres']
movies_filtered['metadata'] = movies_filtered['metadata'].fillna('')

# TF-IDF and SVD
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_filtered['metadata'])

svd_content = TruncatedSVD(n_components=50, random_state=42)
latent_matrix_1 = svd_content.fit_transform(tfidf_matrix)

print("\nContent-Based Latent Matrix Shape:", latent_matrix_1.shape)




Content-Based Latent Matrix Shape: (3706, 50)


##### Collaborative Filtering Latent Matrix

In [None]:
from scipy.sparse import csr_matrix

# Sparse matrix for efficiency
sparse_matrix = csr_matrix(user_movie_matrix.values)

# SVD for collaborative filtering
svd_collab = TruncatedSVD(n_components=50, random_state=42)
movie_latent_features = svd_collab.fit_transform(sparse_matrix.T)

print("\nCollaborative Latent Matrix Shape:", movie_latent_features.shape)




Collaborative Latent Matrix Shape: (3706, 50)


##### Combine Latent Matrices
*Here are some extra steps to align the matrix sizes.*

In [None]:
# Align movies
common_movie_ids = user_movie_matrix.columns.intersection(movies['movieId'])
movies_filtered = movies[movies['movieId'].isin(common_movie_ids)].reset_index(drop=True)
user_movie_matrix = user_movie_matrix[common_movie_ids]

In [None]:
# Reindex movies_filtered to match the order of movie IDs in user_movie_matrix
movies_filtered = movies_filtered.set_index('movieId').loc[user_movie_matrix.columns].reset_index()

# Verify the alignment
assert all(movies_filtered['movieId'] == user_movie_matrix.columns), "Movies are not aligned!"


In [None]:
# Combine content-based and collaborative filtering features
hybrid_matrix = np.hstack([latent_matrix_1, movie_latent_features])

print("\nHybrid Matrix Shape:", hybrid_matrix.shape)



Hybrid Matrix Shape: (3706, 100)


###  Implementing Hybrid Recommendations

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between movies using the hybrid matrix
cosine_sim = cosine_similarity(hybrid_matrix)

# Build a reverse mapping of movie titles to indices
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# Function to get hybrid recommendations based on movie title
def hybrid_recommendations(title, n_recommendations=5):
    idx = indices[title]

    # Compute similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar movies
    sim_scores = sim_scores[1:n_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]

    # Return the top n most similar movies
    return movies['title'].iloc[movie_indices].values

# Example usage
print("\nHybrid Recommendations for 'Toy Story (1995)':")
print(hybrid_recommendations('Toy Story (1995)'))



Hybrid Recommendations for 'Toy Story (1995)':
['Bad Seed, The (1956)' 'Babe (1995)' 'Rounders (1998)'
 'Hour of the Pig, The (1993)' 'Tie Me Up! Tie Me Down! (1990)']


### Collaborative Filtering with Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute cosine similarity between users
user_similarity = cosine_similarity(user_movie_matrix)

# Convert to DataFrame for ease of use
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

# Function to recommend movies based on similar users
def collaborative_recommendations(user_id, n_recommendations=5):
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:]

    # Get ratings from similar users
    similar_users_ratings = ratings[ratings['userId'].isin(similar_users)]

    # Aggregate ratings
    movie_scores = similar_users_ratings.groupby('movieId')['rating'].mean()

    # Exclude movies already rated by the user
    user_rated_movies = ratings[ratings['userId'] == user_id]['movieId']
    movie_scores = movie_scores[~movie_scores.index.isin(user_rated_movies)]

    # Get top N recommendations
    top_movies = movie_scores.sort_values(ascending=False).head(n_recommendations)

    # Return movie titles
    recommended_movies = movies[movies['movieId'].isin(top_movies.index)]
    return recommended_movies['title'].values

# Example usage
print("\nCollaborative Recommendations for User 1:")
print(collaborative_recommendations(1))



Collaborative Recommendations for User 1:
['Gate of Heavenly Peace, The (1995)'
 'Schlafes Bruder (Brother of Sleep) (1995)' 'Follow the Bitch (1998)'
 'Ulysses (Ulisse) (1954)' 'Smashing Time (1967)']


### Content-Based Filtering with Cosine Similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Prepare metadata (combine title and genres)
movies['metadata'] = movies['title'] + ' ' + movies['genres']

# Fill missing values
movies['metadata'] = movies['metadata'].fillna('')

# Apply TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['metadata'])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Build a reverse mapping of movie titles to indices
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# Function to get content-based recommendations
def content_recommendations(title, n_recommendations=5):
    idx = indices[title]

    # Compute similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the most similar movies
    sim_scores = sim_scores[1:n_recommendations+1]
    movie_indices = [i[0] for i in sim_scores]

    # Return the top n most similar movies
    return movies['title'].iloc[movie_indices].values

# Example usage
print("\nContent-Based Recommendations for 'Toy Story (1995)':")
print(content_recommendations('Toy Story (1995)'))



Content-Based Recommendations for 'Toy Story (1995)':
['Toy Story 2 (1999)' "We're Back! A Dinosaur's Story (1993)"
 'Story of Us, The (1999)' 'L.A. Story (1991)' 'Balto (1995)']


### Popularity-Based Recommender

In [None]:
# Calculate average ratings and number of ratings for each movie
movie_ratings = ratings.groupby('movieId').agg({'rating': ['mean', 'count']})
movie_ratings.columns = ['mean_rating', 'rating_count']

# Merge with movies DataFrame
popularity_df = movies.merge(movie_ratings, on='movieId')

# Recommend top N popular movies
def popularity_recommendations(n_recommendations=10):
    top_movies = popularity_df.sort_values(by=['mean_rating', 'rating_count'], ascending=False).head(n_recommendations)
    return top_movies['title'].values

# Example usage
print("\nTop 10 Popular Movies:")
print(popularity_recommendations())



Top 10 Popular Movies:
['Gate of Heavenly Peace, The (1995)' 'Smashing Time (1967)'
 'Schlafes Bruder (Brother of Sleep) (1995)' 'Follow the Bitch (1998)'
 'Ulysses (Ulisse) (1954)' 'Baby, The (1973)' 'Song of Freedom (1936)'
 'One Little Indian (1973)' 'Lured (1947)' 'Bittersweet Motel (2000)']
