In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics.pairwise import cosine_similarity as cos
from sklearn.metrics.pairwise import linear_kernel

In [7]:
os.chdir(r'C:\Users\HP\data')

In [8]:
def load_data():
    return pd.read_csv('movies_metadata.csv',low_memory=False)

In [9]:
data=load_data()

In [10]:
#Simple recomendation:
# Decide on the ranking metrics of each movie:
#m: this is the minimum number of votes for a movie to be in the rank.
#C:This is Avarage ranking for all movies 
#R:This is the ranking for a movie
#V: this is the vote count for each movie

def filter_data(data,p):
    
    return data.loc[data.copy()["vote_count"]>(data["vote_count"].quantile(0.9))]

def weighted_average(x,m=160,C=160):
    v=x["vote_count"]
    R=x["vote_average"]
    return ((v/(v+m))*R)+((m/(m+v))*C)


def get_params(data):
    
    
    m=data["vote_count"].quantile(0.90)
    C=data["vote_average"].mean()
    R=data["vote_average"]
    v=data["vote_count"]
    
    return (m,v,R,C)



In [None]:
#Applying the filter to the data we are considering movies in the 90th percentile to recomend
q_data=filter_data(data,0.90)
#Assignng a weighted score to each movie
q_data['scores']=q_data.apply(weighted_average,axis=1)
#Sorting the movie based on the Score in descending order.
q_data=q_data.sort_values('scores',ascending=False)


In [67]:
 q_data[['title', 'vote_count', 'vote_average', 'scores']].head()

Unnamed: 0,title,vote_count,vote_average,scores
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385


## Content-based Recomendations :recommending based on the movie metadata

In [69]:
#Recommending movies or entities based on the similarity between the movie metadata
#We are required to generate vector representations of the word.

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):


In [11]:
metadata=load_data()

In [13]:
metadata["overview"].fillna(' ',inplace=True)

vectorizer=TfidfVectorizer(stop_words='english')


matrix=vectorizer.fit_transform(metadata["overview"].sample(n=10000))

In [14]:
similarity_matrix=linear_kernel(matrix,matrix)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.float


In [15]:
#Below function will allow one to make mappings from the index of a title to the title itself.
index=pd.Series(metadata.index,metadata["title"])
reverse_index=pd.Series(metadata['title'],metadata.index)

In [16]:
def recomendations(name,matrix):
    #This is a function taking in the name of the movie as input and returns 10 most similar movies in the list.
    
    #method below helps get the index of a given matrix.
    Index=index[name]
    
    data= list(enumerate(matrix[Index]))
    
    data=sorted(data,key=lambda x:x[1],reverse=True)
    
    #get the 10 most similar scores
    
    data=data[1:11]
    
    
    movie_indexes=[i[0] for i in similarity_matrix]
    
    movies=[]
    scores=[]
    
    for position,score in enumerate(data):
        
        movies.append(reverse_index[position])
        scores.append(score)
        
    
    return (movies,scores)
 

In [21]:
 recomendations('Toy Story',similarity_matrix)

(['Toy Story',
  'Jumanji',
  'Grumpier Old Men',
  'Waiting to Exhale',
  'Father of the Bride Part II',
  'Heat',
  'Sabrina',
  'Tom and Huck',
  'Sudden Death',
  'GoldenEye'],
 [(2390, 0.1721350830576923),
  (8334, 0.15158476111142524),
  (9395, 0.10911781835039958),
  (1129, 0.10765543047818224),
  (6387, 0.09552516990673901),
  (7707, 0.0913412029882319),
  (6861, 0.08903769213417036),
  (3967, 0.08590453248455349),
  (9291, 0.08491083130522996),
  (8547, 0.0848455223477083)])

In [18]:
## Similarity based on crew and cast data

credits=pd.read_csv('credits.csv')
keywords=pd.read_csv('keywords.csv')



In [17]:
#removing rows with bad ids
metadata=metadata.drop([19730,29503,35587])

In [18]:
keywords['id']=keywords['id'].astype('int')
credits['id']=credits['id'].astype('int')
metadata['id']=metadata['id'].astype('int')

In [19]:
##Merging the two datasets.

metadata=metadata.merge(keywords,on="id")
metadata=metadata.merge(credits,on="id")

In [20]:
#Below is the library used to convert the String representations int objects
from ast import literal_eval 

In [21]:
#The data in the cast column is in formated string form we need to convert it to an object

features=['cast','keywords','crew','genres']

for i in features:
    metadata[i]=metadata[i].apply(literal_eval)

In [4]:
#This is a subroutine to get a director from the pandas collection.
def get_director(data):
     
    
    for i in data:
        if(i['job']=="Director"):
            return i['name']
        return np.NaN

    
    
#This is a method to get the top three entitie's attribute name not the object has first to be converted to a object literal
#using the literal_eval() method in python.
def get_list(data):
    
    names=[i['name'] for i in data]
    
    if(len(names)>3):
        names=names[:3]
    
    return names      


#Role of this method is to pre-process the data converting it to lower case and removing spaces.
def clean_data(data):
    
    if isinstance(data,list):
        
        return [str.lower(i.replace(" ","")) for i in data]
    
    if isinstance(data,str):
        
        return str.lower(data.replace(" ",""))

#The below function joins all the keywords in the data frame to create a soup that will serve as input to
#our vectorizer we join the keywords and separates them using spaces.

def create_soup(data):
    return ' '.join(data["keywords"])+" "+" ".join(data["cast"])+" "+data["director"]+" "+" ".join(data["genres"])
    

    
#method takes in a soup of words does some preprocessing and returns a sparse matrix: which is a vector representation of the text.
def extract_feature(data):
    vectorizer=CountVectorizer(stop_words="english")
    
    return vectorizer.fit_transform(data.head(10000))

def cosine_similarity(matrix1,matrix2):
    
    return cos(matrix1,matrix2)


    

In [42]:
metadata['director']=metadata['crew'].apply(get_director)

features=["cast","keywords","genres"]


In [26]:
 #Apply the clean data function to our attributes:
    
for i in features:
    metadata[i]=metadata[i].apply(clean_data)

In [32]:
metadata.fillna(" ",inplace=True)

In [46]:
#creating the soup column of the dataset
#removing NaN values and replacing them with empty strings

 
metadata["soup"]=metadata.apply(create_soup,axis=1)


TypeError: ('can only concatenate str (not "float") to str', 'occurred at index 1')

In [48]:
matrix=extract_feature(metadata['soup'].head(100))

In [49]:
# calculating the similarity matrix of the data
similarity=(cosine_similarity(matrix,matrix))

In [50]:
def metadata_recomendation(name,matrix=similarity):
    #getting the index of the data from the name
    _index=index[name]
    
    #After getting the index of the data, search for that entry in the matrix along with their positions
    data=list(enumerate(matrix[_index]))
    
    #sort the values of the data
    data=sorted(data,key=lambda x:x[1],reverse=True)
    
    #we now have the sorted values of the data lets return a list of the 10 most similar movies along with their scores.
    data=data[1:21]
    
    #we require a mapping from the indexes of the movie to the actual names now.
    locations=[i[0] for i in data]
    scores=[i[1] for i in data]
    movies=[]
    
    for i in locations:
        movies.append(reverse_index[i])
    
     
    return(movies,scores)
    
    

In [479]:
metadata_recomendation('The Godfather: Part II',similarity_matrix)

(['The Godfather',
  'The Godfather: Part III',
  "Dolan's Cadillac",
  'Italianamerican',
  'It Runs in the Family',
  'Stuff and Dough',
  'The Las Vegas Story',
  'I Shot a Man in Vegas',
  'Casino',
  'Black Moon Rising',
  'Easy Riders, Raging Bulls',
  'Mulholland Falls',
  'Lay the Favorite',
  "Lookin' to Get Out",
  'Love In Bloom',
  'Showgirls',
  'Leaving Las Vegas',
  'Vegas Vacation',
  'Family Business',
  'Finding Amanda'],
 [0.45878927645099377,
  0.28428058969818715,
  0.17133541601876187,
  0.15465826747651162,
  0.14126769800907402,
  0.13398219455317564,
  0.11936859836685157,
  0.11781581635300199,
  0.11569521547772915,
  0.10655677710049197,
  0.10191327986669267,
  0.1009174556628181,
  0.09907956282702146,
  0.09884515445521125,
  0.09759046583374047,
  0.09600186608528902,
  0.09396002419627492,
  0.09333274977892501,
  0.09190058412488028,
  0.09143888766075761])

In [52]:
#Recommeding using overview : By Tonny Kamau

In [212]:
metadata["soup"]

0        jealousy toy boy tomhanks timallen donrickles ...
1        boardgame disappearance basedonchildren'sbook ...
2        fishing bestfriend duringcreditsstinger walter...
3        basedonnovel interracialrelationship singlemot...
4        baby midlifecrisis confidence stevemartin dian...
                               ...                        
46623    tragiclove leilahatami kouroshtahami elhamkord...
46624    artist play pinoy angelaquino perrydizon hazel...
46625     erikaeleniak adambaldwin juliedupage Mark L. ...
46626     iwanmosschuchin nathalielissenko pavelpavlov ...
46627                                       Daisy Asquith 
Name: soup, Length: 46628, dtype: object

In [55]:
metadata.shape[0]**2

2174170384

In [75]:
import numpy as npy
from sklearn.metrics.pairwise import cosine_similarity as cos

In [79]:
a=[9,5]

In [78]:
b=[9,7]


In [76]:

def euclidean(a,b):
    distance=((a[0]-b[0])**2)+((a[1]-b[1])**2)
    return 1/(1+distance)
def manhattan(a,b):
    distance=(npy.abs(a[0]-b[0])+npy.abs(a[1]-b[1]))
    return 1/(1+distance)

def cosine(a,b):
     return cos(a,b)
    

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [103]:
td=TfidfVectorizer(stop_words="english")

In [199]:
matrix=td.fit_transform(metadata["overview"].head(5000))

In [200]:
d_matrix=cosine_similarity(matrix,matrix)

In [201]:
index=pd.Series(metadata['title'],metadata.index)
reverse_index=pd.Series(metadata.index,metadata["title"])

In [202]:
 index[1]


'Jumanji'

In [203]:
 def recomend(name,matrix=d_matrix):
        
       
     
        
        entry=list(enumerate(entry))
        
        entry=sorted(entry,reverse=True)
        
        locations=[i[0] for i in entry]
        
        
        
        #returns a list of top 10 similar movies

In [204]:
 name='Jumanji'

In [205]:
 INDEX=reverse_index[name]

In [206]:
entry=d_matrix[INDEX]
print(entry)

[0.0175863  1.         0.04667068 ... 0.01442189 0.         0.        ]


In [207]:
entry=list(enumerate(entry))
entry

[(0, 0.01758630381549889),
 (1, 1.0),
 (2, 0.04667068203072072),
 (3, 0.0),
 (4, 0.0),
 (5, 0.05532833728259237),
 (6, 0.0),
 (7, 0.0),
 (8, 0.11458164220246671),
 (9, 0.0),
 (10, 0.008360996938391286),
 (11, 0.0),
 (12, 0.0),
 (13, 0.008953456107436301),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.0316062271222706),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0),
 (21, 0.0),
 (22, 0.00685969896734157),
 (23, 0.0),
 (24, 0.0),
 (25, 0.023207700030449654),
 (26, 0.02520028031911836),
 (27, 0.0068464337784237324),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.04336796612539656),
 (32, 0.0),
 (33, 0.0061895277609972975),
 (34, 0.005066868752124644),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.0),
 (42, 0.0),
 (43, 0.02379510005504401),
 (44, 0.0),
 (45, 0.0),
 (46, 0.020393292487838305),
 (47, 0.00886818304625511),
 (48, 0.0),
 (49, 0.018164343831667474),
 (50, 0.005687005144465109),
 (51, 0.0),
 (52, 0.02783946786759399),
 (53, 0.0),
 (54, 0.0),
 (55, 0.0),
 (56, 0.

In [208]:
entry=sorted(entry,key=lambda x:x[1],reverse=True)
entry

[(1, 1.0),
 (2506, 0.15763246368780734),
 (1526, 0.13694307151440083),
 (3899, 0.1297964927910645),
 (3083, 0.117027123242585),
 (8, 0.11458164220246671),
 (4110, 0.11375480028260032),
 (363, 0.10480066569570962),
 (1971, 0.0985156663573922),
 (2269, 0.09818159413210871),
 (4929, 0.0965707896208801),
 (4693, 0.09416079041660216),
 (997, 0.09392334138872402),
 (975, 0.09171881791771651),
 (1998, 0.08833322216838999),
 (4251, 0.08681732036163521),
 (2441, 0.08123422641367839),
 (1631, 0.0812226271601824),
 (3454, 0.080050212132591),
 (96, 0.07700605986963306),
 (853, 0.07664472667207921),
 (2351, 0.07612375097288691),
 (908, 0.07602250695625216),
 (3836, 0.07547193175826594),
 (591, 0.07531822880880083),
 (4121, 0.07511577897481612),
 (1983, 0.07482353705147204),
 (1765, 0.07452215134945381),
 (4111, 0.0730094823155289),
 (3595, 0.07122817330664806),
 (2710, 0.07121476339481961),
 (2047, 0.07110482077365125),
 (2231, 0.06991703461728181),
 (2016, 0.06919301576110116),
 (1990, 0.067874689

In [209]:
entry=entry[1:20]
entry

[(2506, 0.15763246368780734),
 (1526, 0.13694307151440083),
 (3899, 0.1297964927910645),
 (3083, 0.117027123242585),
 (8, 0.11458164220246671),
 (4110, 0.11375480028260032),
 (363, 0.10480066569570962),
 (1971, 0.0985156663573922),
 (2269, 0.09818159413210871),
 (4929, 0.0965707896208801),
 (4693, 0.09416079041660216),
 (997, 0.09392334138872402),
 (975, 0.09171881791771651),
 (1998, 0.08833322216838999),
 (4251, 0.08681732036163521),
 (2441, 0.08123422641367839),
 (1631, 0.0812226271601824),
 (3454, 0.080050212132591),
 (96, 0.07700605986963306)]

In [210]:
locations=[]
for i in entry:
    locations.append(i[0])
    

In [211]:
metadata['title'].iloc[locations]

2506                       eXistenZ
1526             The Innocent Sleep
3899             Dungeons & Dragons
3083               Any Given Sunday
8                      Sudden Death
4110                      Manhunter
363                        Maverick
1971                    BASEketball
2269                 Glen or Glenda
4929                       Motorama
4693          Sidewalks of New York
997            D3: The Mighty Ducks
975                          Picnic
1998                      Peter Pan
4251          The Anniversary Party
2441                      Avalanche
1631    The Man Who Knew Too Little
3454                 Guys and Dolls
96                         Shopping
Name: title, dtype: object