In [25]:
#Libraries
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


In [10]:
#Dataset
# Folder that holds the ml-latest-small CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = r'D:\FastSemesters\semester6\Recommender_System(E)\project\temp\ml-latest-small'

# Only the columns used later are read, with compact integer/float dtypes.
movies = pd.read_csv(
    DATA_DIR + r'\movies.csv',
    usecols=['movieId', 'title', 'genres'],
    # The column is called 'genres'; the previous key 'genre' matched no column,
    # so the requested dtype was never applied to it.
    dtype={'movieId': 'int32', 'title': 'str', 'genres': 'str'},
)
ratings = pd.read_csv(
    DATA_DIR + r'\ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
)
tags = pd.read_csv(
    DATA_DIR + r'\tags.csv',
    usecols=['userId', 'movieId', 'tag'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'tag': 'str'},
)
#links=pd.read_csv(r'C:\Users\sarim\OneDrive\Desktop\RS\Project\Recommendation\Datasets\links.csv')



In [11]:
# Row/column counts of every loaded table (links.csv is not loaded, so it is skipped).
for frame in (movies, ratings, tags):
    print(frame.shape)

# Missing-value count per column; the leading '\n' prints a blank line before each report.
for frame in (movies, ratings):
    print('\n', frame.isnull().sum())

(9742, 3)
(100836, 3)
(3683, 3)

 movieId    0
title      0
genres     0
dtype: int64

 userId     0
movieId    0
rating     0
dtype: int64


In [12]:
# Number of missing entries in each column of the tags table.
tags.isnull().sum()

userId     0
movieId    0
tag        0
dtype: int64

In [13]:
# Peek at the first five tag rows.
tags.head(5)

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA


In [14]:
# Gather every tag given to a movie into one Python list, indexed by movieId.
grouped_tags = tags.groupby('movieId')['tag'].apply(list)
grouped_tags_df = grouped_tags.to_frame()

# Left join keeps every movie; movies nobody tagged come back with NaN in 'tag'.
movie_tag = movies.merge(grouped_tags_df, how='left', left_on='movieId', right_index=True)


def _as_tag_list(value):
    """Return the value unchanged when it is a list, otherwise an empty list."""
    if isinstance(value, list):
        return value
    return []


def _split_genres(genre_text):
    """Break a pipe-delimited genre string into a list of genre names."""
    return genre_text.split('|')


movie_tag['tag'] = movie_tag['tag'].map(_as_tag_list)
movie_tag['genres'] = movie_tag['genres'].map(_split_genres)
movie_tag.head()


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]","[moldy, old]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",[]
4,5,Father of the Bride Part II (1995),[Comedy],"[pregnancy, remake]"


In [15]:
# Check that the merged frame has no missing values left in any column.
movie_tag.isnull().sum()

movieId    0
title      0
genres     0
tag        0
dtype: int64

In [16]:
# Flatten the genre list and the tag list of each movie into one space-separated
# text field: "<genres joined by spaces> <tags joined by spaces>".
movie_tag['genres_tags'] = (
    movie_tag['genres'].map(' '.join)
    .str.cat(movie_tag['tag'].map(' '.join), sep=' ')
)

In [17]:
def _first_year(title):
    """Return the first four-digit number wrapped in parentheses in `title`, or ''."""
    # Raw string: in a plain literal '\(' and '\d' are invalid escape sequences,
    # which Python reports with a warning at compile time.
    found = re.search(r'\((\d{4})\)', title)
    return found.group(1) if found else ''


# NOTE: this reads the "(YYYY)" marker from the title, so it has to run before the
# cell that strips that marker out of movie_tag['title']; afterwards every year is ''.
movie_tag['year'] = movie_tag['title'].apply(_first_year)

# Append the year to the genre/tag text so it becomes one more token for TF-IDF.
movie_tag['genres_tags_year'] = movie_tag['genres_tags'] + ' ' + movie_tag['year']


In [24]:
# Remove the "(YYYY)" marker from each title and trim the leftover whitespace.
# The pattern is a raw string so the backslashes reach the regex engine as written
# (in a plain literal '\(' and '\d' are invalid escape sequences and raise a warning).
movie_tag['title'] = movie_tag['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
movie_tag.head(5)

Unnamed: 0,movieId,title,genres,tag,genres_tags,year,genres_tags_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","[pixar, pixar, fun]",Adventure Animation Children Comedy Fantasy pi...,1995,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji,"[Adventure, Children, Fantasy]","[fantasy, magic board game, Robin Williams, game]",Adventure Children Fantasy fantasy magic board...,1995,Adventure Children Fantasy fantasy magic board...
2,3,Grumpier Old Men,"[Comedy, Romance]","[moldy, old]",Comedy Romance moldy old,1995,Comedy Romance moldy old 1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",[],Comedy Drama Romance,1995,Comedy Drama Romance 1995
4,5,Father of the Bride Part II,[Comedy],"[pregnancy, remake]",Comedy pregnancy remake,1995,Comedy pregnancy remake 1995


# MULTIPLE MOVIES

In [34]:
# Compute TF-IDF matrix over the combined genre / tag / year text of every movie
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_tag['genres_tags_year'])

# Compute cosine similarity matrix (one row and one column per movie).
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# returned by linear_kernel is already the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Create a reverse map of indices and movie titles: title -> row position.
# Series.drop_duplicates() would compare the *values* (the row positions, which
# are already unique) and therefore leave repeated titles in place, so the
# de-duplication is done on the index instead. With the year stripped from the
# titles, films that share a name would otherwise make indices[title] return a
# Series rather than a single position. The first occurrence is kept.
indices = pd.Series(movie_tag.index, index=movie_tag['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [39]:
def get_recommendations(titles, cosine_sim=cosine_sim, indices=indices, top_n=3):
    """Recommend the movies most similar to each of the given titles.

    Parameters
    ----------
    titles : str or iterable of str
        Title(s) to look up in `indices`. A single string is treated as a
        one-element list instead of being iterated character by character.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie.
    indices : mapping of title -> row position
        Reverse lookup from title to the row in `cosine_sim` / `movie_tag`.
    top_n : int, default 3
        Number of recommendations produced for each input title.

    Returns
    -------
    pandas.Series
        Titles taken from ``movie_tag['title']``, `top_n` per input title, in
        input order. Results for different inputs are concatenated as-is, so a
        movie can appear more than once.

    Raises
    ------
    KeyError
        If a title is not present in `indices`.
    """
    if isinstance(titles, str):
        titles = [titles]

    movie_indices = []
    for title in titles:
        idx = indices[title]
        # A title that occurs several times in the lookup yields a Series;
        # fall back to its first occurrence so the row lookup stays 1-D.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]

        scores = np.asarray(cosine_sim[idx])
        # Stable descending order: ties keep ascending row order, exactly as
        # sorted(..., reverse=True) ordered them before.
        ranked = np.argsort(-scores, kind='stable')
        # Drop the query movie explicitly. Skipping "position 0" is wrong when
        # another movie ties with it and sorts first, because the query itself
        # would then be recommended.
        ranked = ranked[ranked != idx][:top_n]
        movie_indices.extend(ranked.tolist())

    return movie_tag['title'].iloc[movie_indices]

In [36]:
# Display the cleaned title column (pandas truncates the middle of long Series).
movie_tag.loc[:, "title"]

0                                Toy Story
1                                  Jumanji
2                         Grumpier Old Men
3                        Waiting to Exhale
4              Father of the Bride Part II
                       ...                
9737    Black Butler: Book of the Atlantic
9738                 No Game No Life: Zero
9739                                 Flint
9740          Bungo Stray Dogs: Dead Apple
9741          Andrew Dice Clay: Dice Rules
Name: title, Length: 9742, dtype: object

In [42]:
# Titles to recommend from; each must match an entry of movie_tag['title'] exactly.
titles = [
    "Jumanji",
    "Toy Story",
    "Grumpier Old Men",
    "Andrew Dice Clay: Dice Rules",
]
recommendations = get_recommendations(titles)
print(recommendations)

53      Indian in the Cupboard, The
6254            Night at the Museum
9692                    Tomb Raider
1757                  Bug's Life, A
2355                    Toy Story 2
12                            Balto
60      French Twist (Gazon maudit)
152                        Mallrats
157                     Nine Months
885                    Hear My Song
2105                Problem Child 2
2847                What About Bob?
Name: title, dtype: object
