In [1]:
# Libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Dataset
movies = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\movies.csv', usecols=['movieId','title','genres'], dtype={'movieId':'int32','title':'str','genres':'str'})
ratings = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\ratings.csv', usecols=['userId', 'movieId', 'rating'], dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})
tags = pd.read_csv(r'C:\Users\HP\OneDrive\Desktop\tags.csv')

In [3]:
print(movies.shape)
print(ratings.shape)
print(tags.shape)


(9742, 3)
(100836, 3)
(3683, 4)


In [4]:
movies.info()
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int32 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int32(1), object(2)
memory usage: 190.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int32  
 1   movieId  100836 non-null  int32  
 2   rating   100836 non-null  float32
dtypes: float32(1), int32(2)
memory usage: 1.2 MB


In [5]:
print('\n',movies.isnull().sum())
print('\n',ratings.isnull().sum())
print('\n',tags.isnull().sum())


 movieId    0
title      0
genres     0
dtype: int64

 userId     0
movieId    0
rating     0
dtype: int64

 userId       0
movieId      0
tag          0
timestamp    0
dtype: int64


In [6]:
# Merge movies and tags
grouped_tags = tags.groupby('movieId')['tag'].apply(list)
grouped_tags_df = pd.DataFrame(grouped_tags)
movie_tag = pd.merge(movies, grouped_tags_df, left_on='movieId', right_index=True, how='left')

In [7]:
# Preprocess movie data
movie_tag['tag'] = movie_tag['tag'].apply(lambda x: x if isinstance(x, list) else [])
movie_tag['genres'] = movie_tag['genres'].apply(lambda x: x.split('|'))
movie_tag['genres_tags'] = movie_tag['genres'].apply(lambda x: ' '.join(x)) + ' ' + movie_tag['tag'].apply(lambda x: ' '.join(x))
movie_tag['year'] = movie_tag['title'].apply(lambda x: re.findall('\((\d{4})\)', x))
movie_tag['year'] = movie_tag['year'].apply(lambda x: x[0] if x else '')
movie_tag['genres_tags_year'] = movie_tag['genres_tags'] + ' ' + movie_tag['year']
movie_tag['title'] = movie_tag['title'].str.replace('\(\d{4}\)', '', regex=True).str.strip()


In [8]:
# Compute TF-IDF matrix
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_tag['genres_tags_year'])


In [9]:
# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of indices and movie titles
indices = pd.Series(movie_tag.index, index=movie_tag['title']).drop_duplicates()

In [10]:
def recommend_popular_movies(n, user_preferences=None):
    movie_ratings = ratings.groupby('movieId')['rating'].mean()
    popular_movies = movie_ratings.sort_values(ascending=False).head(n)
    if user_preferences:
        popular_movies = movie_tag.loc[movie_tag['movieId'].isin(popular_movies.index) & movie_tag['genres'].apply(lambda x: any(pref in x for pref in user_preferences)), 'title']
    else:
        popular_movies = movie_tag.loc[movie_tag['movieId'].isin(popular_movies.index), 'title']
    return popular_movies

In [11]:
# Get popular movies and recommendations for a new user with preferences:
popular_movies = recommend_popular_movies(10)
print("Popular Movies:")
print(popular_movies)
print()

Popular Movies:
4246    Open Hearts (Elsker dig for evigt)
4251                             Lady Jane
7656        Paper Birds (Pájaros de papel)
8107                   Act of Killing, The
8148                  Justice League: Doom
8154               Bill Hicks: Revelations
9083                              Jump In!
9094                                 Human
9096                          L.A. Slasher
9122                       Formula of Love
Name: title, dtype: object



In [12]:
user_movie_choices = []  # Empty list for a new user
user_preferences = ['Action', 'Adventure']  # User preferences (optional)

recommendations = recommend_popular_movies(3, user_preferences)  # Recommend popular movies to a new user with preferences

if recommendations.empty:
    recommendations = popular_movies.sample(3)  # If no recommendations based on preferences, select random popular movies

print("Recommendations:")
print(recommendations)

Recommendations:
9094                                 Human
4246    Open Hearts (Elsker dig for evigt)
8107                   Act of Killing, The
Name: title, dtype: object


In [13]:
def get_recommendations(movie_choices, user_preferences, user_ratings=None, cosine_sim=cosine_sim, num_recommendations=3):
    movie_indices = []
    for choice in movie_choices:
        if choice in indices:
            idx = indices[choice]
            movie_indices.append(idx)
    genre_indices = movie_tag[movie_tag['genres'].apply(lambda x: any(pref in x for pref in user_preferences))].index
    movie_indices.extend(genre_indices)
    movie_indices = list(set(movie_indices))
    if user_ratings is not None:
        for movie, rating in user_ratings.items():
            if movie in indices:
                idx = indices[movie]
                movie_indices.append(idx)
                ratings.loc[len(ratings)] = [1, idx, rating]  # Add the user ratings to the ratings DataFrame
    if len(movie_indices) == 0:
        return "No movies found for the given choices and preferences."
    tfidf_matrix_updated = tfidf.fit_transform(movie_tag['genres_tags_year']) # Update TF-IDF matrix with new ratings
    cosine_sim_updated = linear_kernel(tfidf_matrix_updated, tfidf_matrix_updated) # Update cosine similarity matrix
    sim_scores = []
    for idx in movie_indices:
        sim_scores.extend(list(enumerate(cosine_sim_updated[idx])))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:(num_recommendations + 1)]
        movie_indices = [i[0] for i in sim_scores]
        recommended_movies = movie_tag.loc[movie_indices, 'title']
    return recommended_movies


In [14]:
movie_tag["title"]

0                                Toy Story
1                                  Jumanji
2                         Grumpier Old Men
3                        Waiting to Exhale
4              Father of the Bride Part II
                       ...                
9737    Black Butler: Book of the Atlantic
9738                 No Game No Life: Zero
9739                                 Flint
9740          Bungo Stray Dogs: Dead Apple
9741          Andrew Dice Clay: Dice Rules
Name: title, Length: 9742, dtype: object

In [15]:
user_movie_choices = ['Jumanji', 'The Dark Knight', 'Human', 'Grumpier Old Men']
recommendations = get_recommendations(user_movie_choices, user_preferences)
print("Recommendations:")
print(recommendations)

Recommendations:
8239                                Planes
8171    Captain America II: Death Too Soon
8137                              Oblivion
Name: title, dtype: object
