### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

### Content Based Approach

In [2]:
movie_tag=pd.read_csv("CSVs\\genome-scores.csv")
movie_tag.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.029
1,1,2,0.02375
2,1,3,0.05425
3,1,4,0.06875
4,1,5,0.16


In [3]:
tag=pd.read_csv("CSVs\\genome-tags.csv")
tag.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [4]:
movie_tag.describe()

Unnamed: 0,movieId,tagId,relevance
count,14862530.0,14862530.0,14862530.0
mean,39970.87,564.5,0.1153631
std,48149.3,325.6254,0.155432
min,1.0,1.0,0.00025
25%,3680.75,282.75,0.023
50%,7880.0,564.5,0.0545
75%,71249.0,846.25,0.139
max,187595.0,1128.0,1.0


In [5]:
#Keeping Only relevant tags with movies (with a relevance score of greater than 0.65)
movie_tag=movie_tag[movie_tag.relevance>0.65].reset_index()
movie_tag.drop('index',axis=1,inplace=True)
movie_tag.head()

Unnamed: 0,movieId,tagId,relevance
0,1,19,0.66825
1,1,29,0.907
2,1,62,0.66025
3,1,63,0.955
4,1,64,0.98875


#### Generating Keywords 

In [6]:
tmp_df=movie_tag.copy()
tmp_df.head()

Unnamed: 0,movieId,tagId,relevance
0,1,19,0.66825
1,1,29,0.907
2,1,62,0.66025
3,1,63,0.955
4,1,64,0.98875


In [7]:
#Merging the 2 datasets to link correct tag with correct movie Id
tmp_df=tmp_df.merge(tag,left_on="tagId",right_on="tagId")
tmp_df.drop("tagId",axis=1,inplace=True)

#Generating Keywords seperated by , using the tags
tmp_df["Keywords"]=tmp_df.groupby(['movieId'])['tag'].transform(lambda x : ','.join(x))
tmp_df.drop(["tag","relevance"],axis=1,inplace=True)
tmp_df=tmp_df.drop_duplicates().reset_index()
tmp_df.drop("index",axis=1,inplace=True)

tmp_df.head()

Unnamed: 0,movieId,Keywords
0,1,"action,adventure,animals,animated,animation,ca..."
1,6,"action,dialogue,great,great ending,great movie..."
2,9,"action,gunfight,action packed,chase,good actio..."
3,10,"action,adventure,fun movie,good,original,big b..."
4,15,"action,adventure,fun movie,original,big budget..."


In [8]:
tmp_df=tmp_df.sort_values(by=['movieId']).reset_index()
tmp_df.drop('index',inplace=True,axis=1)
tmp_df.head()

Unnamed: 0,movieId,Keywords
0,1,"action,adventure,animals,animated,animation,ca..."
1,2,"adventure,animals,childhood,children,entertain..."
2,3,"original,comedy,good sequel,gunfight,romance,s..."
3,4,"original,adultery,chick flick,girlie movie,rom..."
4,5,"family,feel-good,original,destiny,comedy,good ..."


#### Creating the Cosine similarity Matrix

In [9]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(tmp_df['Keywords'])
count.get_feature_names()

['007',
 '100',
 '11',
 '13',
 '18th',
 '1920s',
 '1930s',
 '1950s',
 '1960s',
 '1970s',
 '1980s',
 '19th',
 '2005',
 '250',
 '3d',
 '70mm',
 '80s',
 'aardman',
 'abortion',
 'absurd',
 'abuse',
 'accent',
 'acting',
 'action',
 'actor',
 'actress',
 'adaptation',
 'adapted',
 'adaption',
 'addiction',
 'adolescence',
 'adoption',
 'adultery',
 'adventure',
 'affectionate',
 'afi',
 'africa',
 'afterlife',
 'age',
 'aging',
 'aids',
 'airplane',
 'airport',
 'alan',
 'alaska',
 'alcatraz',
 'alcoholism',
 'alien',
 'aliens',
 'allegory',
 'almodovar',
 'alone',
 'alter',
 'alternate',
 'amazing',
 'ambiguity',
 'america',
 'american',
 'americans',
 'amnesia',
 'amy',
 'and',
 'android',
 'androids',
 'angeles',
 'animal',
 'animals',
 'animated',
 'animation',
 'anime',
 'anne',
 'antarctica',
 'anti',
 'apocalypse',
 'apocalyptic',
 'appealing',
 'archaeology',
 'argentina',
 'arms',
 'arnold',
 'art',
 'arthur',
 'artificial',
 'artist',
 'artistic',
 'artists',
 'arts',
 'artsy',
 

In [11]:
count_matrix.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim

array([[1.        , 0.28564159, 0.14744196, ..., 0.13055067, 0.35086059,
        0.34747055],
       [0.28564159, 1.        , 0.09225312, ..., 0.20421099, 0.11554229,
        0.20024535],
       [0.14744196, 0.09225312, 1.        , ..., 0.06324555, 0.2191785 ,
        0.1860521 ],
       ...,
       [0.13055067, 0.20421099, 0.06324555, ..., 1.        , 0.19802951,
        0.11766968],
       [0.35086059, 0.11554229, 0.2191785 , ..., 0.19802951, 1.        ,
        0.36894943],
       [0.34747055, 0.20024535, 0.1860521 , ..., 0.11766968, 0.36894943,
        1.        ]])

In [13]:
movie_info=pd.read_csv("CSVs\\movies.csv")
movie_info.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [84]:
def get_recommendations(movie_name):
    #Defining the Threshold for Cosine Similarity
    threshold=0.7
    
    #Getting the index of the input movie in the dataset, cannot use movie id as some movie ids are missing in between
    index=int(movie_info[movie_info["title"]==movie_name].index.values[0])
    
    #Creating a dictionary of recommendation index and their Cosine similarity
    recommendations={i:j for i,j in enumerate(cosine_sim[index]) if j>threshold}
    movies=recommendations.keys()
    lst=[]
    
    #Getting names of the recommendations
    for i in movies:
        name=movie_info[movie_info.index==i]["title"].values[0]
        lst.append(name)
    return lst

In [85]:
get_recommendations("Toy Story (1995)")

['Toy Story (1995)',
 'All Dogs Go to Heaven (1989)',
 'Topaz (1969)',
 'Happy, Texas (1999)',
 'D.O.A. (1988)',
 'Sidewalks of New York (2001)',
 'Dark Passage (1947)',
 'Lone Wolf and Cub: Sword of Vengeance (Kozure Ôkami: Kowokashi udekashi tsukamatsuru) (1972)',
 'Buddy (2003)',
 "Casque d'or (1952)",
 'Silent Witness (Do Not Disturb) (1999)',
 'End Game (2006)',
 'Stargate: The Ark of Truth (2008)']

In [None]:
movie=pd.read_csv("CSVs\\links.csv")
movie.head()

In [None]:
user_tag=pd.read_csv("CSVs\\tags.csv")
user_tag.head()

In [None]:
user_movie=pd.read_csv("CSVs\\ratings.csv")
user_movie.head()