In [53]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
movies = pd.read_csv("ml-25m/movies.csv")
ratings = pd.read_csv("ml-25m/ratings.csv")

## Preprocessing

In [55]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [56]:
movies['combined_features'] = movies['title'] + ' ' + movies['genres']

In [57]:
movies.head()

Unnamed: 0,movieId,title,genres,combined_features
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


In [58]:
movies.isnull().sum()

movieId              0
title                0
genres               0
combined_features    0
dtype: int64

In [59]:
duplicates = movies[movies.duplicated(subset=['title'], keep=False)]
duplicates
# duplicates_sorted = duplicates.sort_values(by='title')
# len(duplicates_sorted)

Unnamed: 0,movieId,title,genres,combined_features
580,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin (1992) Adventure|Animation|Children|Co...
1710,1788,Men with Guns (1997),Action|Drama,Men with Guns (1997) Action|Drama
2553,2644,Dracula (1931),Horror,Dracula (1931) Horror
2759,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller,Saturn 3 (1980) Adventure|Sci-Fi|Thriller
3454,3553,Gossip (2000),Drama|Thriller,Gossip (2000) Drama|Thriller
...,...,...,...,...
61525,206125,Lost & Found (2018),Comedy|Drama,Lost & Found (2018) Comedy|Drama
61697,206674,Camino (2016),Comedy,Camino (2016) Comedy
61714,206712,American Woman (2019),(no genres listed),American Woman (2019) (no genres listed)
61800,206925,The Plague (2006),Documentary,The Plague (2006) Documentary


In [60]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [61]:
movies.drop_duplicates(subset=['title'], keep='first', inplace=True)


In [62]:
import re

def clean_title(title):
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title

In [63]:
movies["clean_title"] = movies["combined_features"].apply(clean_title)

In [64]:
movies

Unnamed: 0,movieId,title,genres,combined_features,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...,Toy Story 1995 AdventureAnimationChildrenComed...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy,Jumanji 1995 AdventureChildrenFantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance,Grumpier Old Men 1995 ComedyRomance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance,Waiting to Exhale 1995 ComedyDramaRomance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy,Father of the Bride Part II 1995 Comedy
...,...,...,...,...,...
62418,209157,We (2018),Drama,We (2018) Drama,We 2018 Drama
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul (2001) Documentary,Window of the Soul 2001 Documentary
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems (2018) Comedy|Drama,Bad Poems 2018 ComedyDrama
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing (2001) (no genres listed),A Girl Thing 2001 no genres listed


## Content Based Recommendation System

In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(movies["clean_title"])

In [66]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend(watched_titles, k=5):
    # Clean titles
    watched_titles = [clean_title(title) for title in watched_titles]
    
    # Transform watched titles to TF-IDF vectors
    watched_vecs = vectorizer.transform(watched_titles)
    
    # Compute cosine similarity between watched titles and all movies
    similarity = cosine_similarity(watched_vecs, tfidf)

    
    indices = []
    for sim_row in similarity:
        idx = np.argpartition(sim_row, -k)[-k:]
        indices.extend(idx)
    # Remove duplicates and self-matches
    indices = np.unique(indices)
    results = movies.iloc[indices].iloc[::1]
    results
    return results

# Call recommend function with watched titles and inspect the recommendations
watched_titles = ["The Shawshank Redemption"]
recommendations = recommend(watched_titles, 10)
# print("Recommendations:")
recommendations


Unnamed: 0,movieId,title,genres,combined_features,clean_title
314,318,"Shawshank Redemption, The (1994)",Crime|Drama,"Shawshank Redemption, The (1994) Crime|Drama",Shawshank Redemption The 1994 CrimeDrama
14858,78729,24: Redemption (2008),Action|Adventure|Crime|Drama|Thriller,24: Redemption (2008) Action|Adventure|Crime|D...,24 Redemption 2008 ActionAdventureCrimeDramaTh...
17970,93838,The Raid: Redemption (2011),Action|Crime,The Raid: Redemption (2011) Action|Crime,The Raid Redemption 2011 ActionCrime
19152,99735,Road to Redemption (2001),Comedy|Drama,Road to Redemption (2001) Comedy|Drama,Road to Redemption 2001 ComedyDrama
19924,103366,Redemption (Hummingbird) (2013),Action|Crime|Thriller,Redemption (Hummingbird) (2013) Action|Crime|T...,Redemption Hummingbird 2013 ActionCrimeThriller
23254,117539,Pop Redemption (2013),Comedy,Pop Redemption (2013) Comedy,Pop Redemption 2013 Comedy
30707,136861,The Mark: Redemption (2013),Action|Fantasy|Sci-Fi,The Mark: Redemption (2013) Action|Fantasy|Sci-Fi,The Mark Redemption 2013 ActionFantasySciFi
32931,142038,Redemption (2013),(no genres listed),Redemption (2013) (no genres listed),Redemption 2013 no genres listed
40800,160287,Redemption Trail (2013),Drama,Redemption Trail (2013) Drama,Redemption Trail 2013 Drama
54738,190263,The Redemption of the Devil (2015),Documentary,The Redemption of the Devil (2015) Documentary,The Redemption of the Devil 2015 Documentary


In [67]:
watched_titles = [
     "The Shawshank Redemption",
    "The Godfather",
    "The Dark Knight",
    "Pulp Fiction",
    "The Lord of the Rings: The Return of the King"
 ]

recommendations = recommend(watched_titles , 10)
recommendations


Unnamed: 0,movieId,title,genres,combined_features,clean_title
292,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,Pulp Fiction (1994) Comedy|Crime|Drama|Thriller,Pulp Fiction 1994 ComedyCrimeDramaThriller
314,318,"Shawshank Redemption, The (1994)",Crime|Drama,"Shawshank Redemption, The (1994) Crime|Drama",Shawshank Redemption The 1994 CrimeDrama
840,858,"Godfather, The (1972)",Crime|Drama,"Godfather, The (1972) Crime|Drama",Godfather The 1972 CrimeDrama
1190,1221,"Godfather: Part II, The (1974)",Crime|Drama,"Godfather: Part II, The (1974) Crime|Drama",Godfather Part II The 1974 CrimeDrama
2026,2116,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy,"Lord of the Rings, The (1978) Adventure|Animat...",Lord of the Rings The 1978 AdventureAnimationC...
4887,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,"Lord of the Rings: The Fellowship of the Ring,...",Lord of the Rings The Fellowship of the Ring T...
5840,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,"Lord of the Rings: The Two Towers, The (2002) ...",Lord of the Rings The Two Towers The 2002 Adve...
7028,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,"Lord of the Rings: The Return of the King, The...",Lord of the Rings The Return of the King The 2...
9160,27307,Night of the Day of the Dawn of the Son of the...,Comedy|Horror,Night of the Day of the Dawn of the Son of the...,Night of the Day of the Dawn of the Son of the...
10481,41822,Real Fiction (Shilje sanghwang) (2000),Crime|Drama,Real Fiction (Shilje sanghwang) (2000) Crime|D...,Real Fiction Shilje sanghwang 2000 CrimeDrama


## Recommendation based on content based system for a user

In [68]:
def get_movie_ids_for_user(user_id, ratings_df, threshold=3):
    """
    Get movie IDs for a particular user where the ratings are above a threshold.
    
    Args:
    - user_id (int): The ID of the user.
    - ratings_df (pandas DataFrame): DataFrame containing user ratings.
    - threshold (float): The minimum rating threshold (default=3).

    Returns:
    - list of int: List of movie IDs for the user where the ratings are above the threshold.
    """
    # Filter ratings for the specified user
    user_ratings = ratings[ratings['userId'] == user_id]

    # Filter movie IDs where the ratings are above the threshold
    movie_ids = user_ratings[user_ratings['rating'] > threshold]['movieId'].tolist()

    return movie_ids


In [69]:
def generate_recommendations(user_id, ratings_df, movies_df, vectorizer, tfidf, k=10):
    """
    Generate recommendations for a user based on their previously watched movies.
    
    Args:
    - user_id (int): The ID of the user.
    - ratings_df (pandas DataFrame): DataFrame containing user ratings.
    - movies_df (pandas DataFrame): DataFrame containing movie information.
    - vectorizer (TfidfVectorizer): TF-IDF vectorizer object.
    - tfidf (sparse matrix): TF-IDF matrix.
    - k (int): Number of recommendations to generate (default=10).

    Returns:
    - pandas DataFrame: DataFrame containing recommended movies for the user.
    """
    # Get movie IDs for the user where the ratings are above a threshold (e.g., 3)
    movie_ids = get_movie_ids_for_user(user_id, ratings_df)

    # Get movie titles corresponding to the movie IDs
    watched_titles = movies_df[movies_df['movieId'].isin(movie_ids)]['title'].tolist()
    print(watched_titles)
    # Generate recommendations based on watched titles
    recommendations = recommend(watched_titles, k=30)

    # Filter out movies that the user has already watched
    recommended_movies = recommendations[~recommendations['title'].isin(watched_titles)]

    return recommended_movies.head(5)


In [70]:
# Assuming 'user_id' is the ID of the user for whom you want to generate recommendations
user_id = 2
# Assuming 'ratings_df' is the DataFrame containing user ratings
# Assuming 'movies_df' is the DataFrame containing movie information
# Assuming 'vectorizer' and 'tfidf' are the TF-IDF vectorizer and matrix objects
recommendations = generate_recommendations(user_id, ratings, movies, vectorizer, tfidf)
print("Recommendations for user", user_id, ":")
recommendations


['Toy Story (1995)', 'Braveheart (1995)', 'Apollo 13 (1995)', 'Rob Roy (1995)', 'French Kiss (1995)', 'Star Wars: Episode IV - A New Hope (1977)', 'Shawshank Redemption, The (1994)', 'Tommy Boy (1995)', 'Clear and Present Danger (1994)', 'Forrest Gump (1994)', 'Lion King, The (1994)', 'Fugitive, The (1993)', 'Much Ado About Nothing (1993)', 'Rudy (1993)', "Schindler's List (1993)", 'Shadowlands (1993)', 'Terminator 2: Judgment Day (1991)', 'Rock, The (1996)', 'Godfather, The (1972)', 'My Fair Lady (1964)', "It's a Wonderful Life (1946)", 'Monty Python and the Holy Grail (1975)', 'Star Wars: Episode V - The Empire Strikes Back (1980)', 'Princess Bride, The (1987)', 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', 'Star Wars: Episode VI - Return of the Jedi (1983)', 'Dead Poets Society (1989)', 'Better Off Dead... (1985)', 'Back to the Future (1985)', 'Highlander (1986)', 'High Noon (1952)', 'Ben-Hur (1959)', 'Indiana Jones and the Last Crusade (1989)', '

Unnamed: 0,movieId,title,genres,combined_features,clean_title
13,14,Nixon (1995),Drama,Nixon (1995) Drama,Nixon 1995 Drama
25,26,Othello (1995),Drama,Othello (1995) Drama,Othello 1995 Drama
42,43,Restoration (1995),Drama,Restoration (1995) Drama,Restoration 1995 Drama
54,55,Georgia (1995),Drama,Georgia (1995) Drama,Georgia 1995 Drama
67,68,French Twist (Gazon maudit) (1995),Comedy|Romance,French Twist (Gazon maudit) (1995) Comedy|Romance,French Twist Gazon maudit 1995 ComedyRomance


## Collaborative filtering based recommendation system using Surprise 

In [74]:
def filter_trusted_users(ratings_df, min_ratings=500):
    """
    Filter out users who have rated more than a certain number of movies.
    
    Args:
    - ratings_df (pandas DataFrame): DataFrame containing user ratings.
    - min_ratings (int): Minimum number of ratings required (default=50).

    Returns:
    - pandas DataFrame: DataFrame containing ratings from trusted users.
    """
    # Group ratings by user and count the number of ratings for each user
    user_ratings_count = ratings_df.groupby('userId').size().reset_index(name='ratings_count')

    # Filter users who have rated more than min_ratings movies
    trusted_users = user_ratings_count[user_ratings_count['ratings_count'] > min_ratings]['userId']

    # Filter ratings for trusted users
    trusted_ratings = ratings_df[ratings_df['userId'].isin(trusted_users)]

    return trusted_ratings

# Call the function to filter trusted users and get their ratings
trusted_ratings = filter_trusted_users(ratings, min_ratings=500)


In [75]:
trusted_ratings.count()

userId       8935061
movieId      8935061
rating       8935061
timestamp    8935061
dtype: int64

In [76]:
from collections import defaultdict
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

from surprise import Dataset, SVD

In [94]:
def create_movie_id_to_name_mapping(movies_df):
    """Create a dictionary mapping movie IDs to movie names from a movies DataFrame.

    Args:
        movies_df (pandas DataFrame): DataFrame containing movie IDs and movie names.

    Returns:
        A dictionary mapping movie IDs to movie names.
    """
    movie_id_to_name = dict(zip(movies_df['movieId'], movies_df['title']))
    return movie_id_to_name

In [98]:
movie_id_to_name = create_movie_id_to_name_mapping(movies)

In [104]:
def get_top_n_movie_names_for_user(predictions, user_id, n=10, movies_df=None):
    """Return the top-N movie names recommended for a specific user from a set of predictions.

    Args:
        predictions (list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        user_id (str or int): The ID of the user for whom recommendations are desired.
        n (int): The number of recommendations to output for the user. Default is 10.
        movies_df (pandas DataFrame): DataFrame containing movie IDs and movie names.

    Returns:
        A list of movie names of size n, representing the top-N recommendations for the specified user.
    """
    # Filter predictions for the specified user
    user_predictions = [(uid, iid, true_r, est) for uid, iid, true_r, est, _ in predictions if uid == user_id]

    # Sort the predictions for the user by estimated rating
    user_predictions.sort(key=lambda x: x[3], reverse=True)

    # Get the top-N recommended movie IDs
    top_n_movie_ids = [iid for _, iid, _, _ in user_predictions[:n]]

    # Map movie IDs to movie names if movies_df is provided
    if movies_df is not None:
        top_n_movie_names = [movies_df[movies_df['movieId'] == movie_id]['title'].iloc[0] for movie_id in top_n_movie_ids]
    else:
        top_n_movie_names = top_n_movie_ids  # Just use movie IDs if movies_df is not provided

    return top_n_movie_names


In [78]:
# Load ratings from a pandas DataFrame
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(trusted_ratings[['userId', 'movieId', 'rating']], reader)

In [79]:
# Perform train-test split
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

In [81]:
# Initialize SVD algorithm with custom parameters
algo = SVD(n_factors=150, n_epochs=200, lr_all=0.01, reg_all=0.1, verbose=True)

In [82]:
# Fit the algorithm on the training set
algo.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16e8231d6d0>

In [89]:

predictions = algo.test(testset)
predictions

[Prediction(uid=46918, iid=40815, r_ui=4.5, est=3.89947684265283, details={'was_impossible': False}),
 Prediction(uid=72315, iid=200918, r_ui=2.5, est=3.456168745375171, details={'was_impossible': False}),
 Prediction(uid=143568, iid=30850, r_ui=4.5, est=3.5602461847691766, details={'was_impossible': False}),
 Prediction(uid=116788, iid=6705, r_ui=0.5, est=2.4189912621105756, details={'was_impossible': False}),
 Prediction(uid=154806, iid=588, r_ui=3.0, est=3.3308014560037744, details={'was_impossible': False}),
 Prediction(uid=27266, iid=7347, r_ui=4.0, est=3.265460336900632, details={'was_impossible': False}),
 Prediction(uid=62522, iid=106540, r_ui=1.5, est=2.166401643710022, details={'was_impossible': False}),
 Prediction(uid=56282, iid=6303, r_ui=3.5, est=3.277848881932498, details={'was_impossible': False}),
 Prediction(uid=158655, iid=3387, r_ui=2.0, est=2.469439512497876, details={'was_impossible': False}),
 Prediction(uid=24600, iid=153, r_ui=1.0, est=2.6569892587628674, detai

In [106]:
top_n_recommendations = get_top_n_movie_names_for_user(predictions, 47708, n=10, movies_df=movies)

print("Top 10 movie recommendations for user 47708:")
for movie_name in top_n_recommendations:
    print(movie_name)

Top 10 movie recommendations for user 47708:
Godfather: Part II, The (1974)
Apocalypse Now (1979)
Fog of War: Eleven Lessons from the Life of Robert S. McNamara, The (2003)
City of God (Cidade de Deus) (2002)
One Flew Over the Cuckoo's Nest (1975)
Persona (1966)
Andrei Rublev (Andrey Rublyov) (1969)
400 Blows, The (Les quatre cents coups) (1959)
Kingdom, The (Riget) (1994)
Memento (2000)


## Evaluation
- The evaluation has been done on three metrices `precision_at_k1` , `recall_at_k` , `average_precision_at_k`

In [107]:
# Function to calculate Precision@K, Recall@K, and MAP@K
def precision_at_k(actual, predicted, k):
    predicted_k = predicted[:k]
    relevant_k = [item for item in predicted_k if item in actual]
    precision = len(relevant_k) / k if k > 0 else 0
    return precision

def recall_at_k(actual, predicted, k):
    predicted_k = predicted[:k]
    relevant_k = [item for item in predicted_k if item in actual]
    recall = len(relevant_k) / len(actual) if len(actual) > 0 else 0
    return recall

def average_precision_at_k(actual, predicted, k):
    precision_sum = 0
    num_relevant_items = min(k, len(actual))
    num_retrieved_relevant_items = 0

    for i, item in enumerate(predicted[:k]):
        if item in actual:
            num_retrieved_relevant_items += 1
            precision_sum += num_retrieved_relevant_items / (i + 1)

    average_precision = precision_sum / num_relevant_items if num_relevant_items > 0 else 0
    return average_precision

# Function to evaluate recommendations
def evaluate_recommendations(actual, predicted, k):
    precision = precision_at_k(actual, predicted, k)
    recall = recall_at_k(actual, predicted, k)
    map_at_k = average_precision_at_k(actual, predicted, k)
    return precision, recall, map_at_k