# Project-2 --> Movie-Recommendation-System

In [1]:
# importing all libraries
import numpy as np
import pandas as pd

import nltk
from nltk.stem.porter import PorterStemmer; # for stemming

from sklearn.feature_extraction.text import CountVectorizer # for Vectorization
from sklearn.metrics.pairwise import cosine_similarity # calculating cosine distances b/w every movies
import pickle # to save the model

# Step-1 : Data Collection

In [2]:
movies=pd.read_csv('movies.csv')

In [3]:
movies.shape # size of the data frame

(3127, 11)

In [4]:
# checking some samples of the dataframe
movies.head()

Unnamed: 0,name,date,lang,dur,genre,direc,act,c_rate,u_rate,desc,poster
0,Radhe,"13 May, 2021",['Hindi'],2 hrs 15 mins,Action Thriller Crime,Prabhudeva,"['Salman Khan', 'Disha Patani', 'Randeep Hooda']",3.0,3.0,This is definitely not the most wanted Bhai fi...,"https://static.toiimg.com/thumb/msid-80380429,..."
1,99 Songs,"16 Apr, 2021","['Hindi', 'Tamil', 'Telugu']",2 hrs 13 mins,Romance Music,Vishwesh Krishnamoorthy,"['Ehan Bhat', 'Edilsy Vargas', 'Lisa Ray']",3.0,3.3,A budding musician takes up the challenge to c...,"https://static.toiimg.com/thumb/msid-68883903,..."
2,Bansuri: The Flute,"16 Apr, 2021",['Hindi'],1 hr 48 mins,Drama,Hari Viswanathan,"['Rituparna Sengupta', 'Anurag Kashyap', 'Upen...",2.5,2.6,"In a nutshell, ‘Bansuri’ has its heart at the ...","https://static.toiimg.com/thumb/msid-75670489,..."
3,The Big Bull,"08 Apr, 2021",['Hindi'],2 hrs 35 mins,Crime Drama,Kookie V. Gulati,"['Abhishek Bachchan', 'Ileana D Cruz', 'Nikita...",3.0,3.4,"Overall, ‘The Big Bull’ is a decent attempt to...","https://static.toiimg.com/thumb/msid-71147012,..."
4,Koi Jaane Na,"02 Apr, 2021",['Hindi'],2 hrs 9 mins,Thriller,Amin Hajee,"['Amyra Dastur', 'Kunal Kapoor', 'Elli Avrram']",2.0,2.1,The most convincing part about this one is its...,"https://static.toiimg.com/thumb/msid-67540004,..."


In [5]:
# checking attributes of the dataframe
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3127 entries, 0 to 3126
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   name    3127 non-null   object 
 1   date    3125 non-null   object 
 2   lang    3034 non-null   object 
 3   dur     2698 non-null   object 
 4   genre   2008 non-null   object 
 5   direc   3068 non-null   object 
 6   act     2276 non-null   object 
 7   c_rate  2859 non-null   float64
 8   u_rate  3125 non-null   float64
 9   desc    1882 non-null   object 
 10  poster  3125 non-null   object 
dtypes: float64(2), object(9)
memory usage: 268.9+ KB


# Step-2 : Feature Selection

In [6]:
# filtering out only useful columns that are important for our results (we are considering only categorical data)
movies=movies[['name','genre','direc','act','desc','poster']]

In [7]:
movies.head()

Unnamed: 0,name,genre,direc,act,desc,poster
0,Radhe,Action Thriller Crime,Prabhudeva,"['Salman Khan', 'Disha Patani', 'Randeep Hooda']",This is definitely not the most wanted Bhai fi...,"https://static.toiimg.com/thumb/msid-80380429,..."
1,99 Songs,Romance Music,Vishwesh Krishnamoorthy,"['Ehan Bhat', 'Edilsy Vargas', 'Lisa Ray']",A budding musician takes up the challenge to c...,"https://static.toiimg.com/thumb/msid-68883903,..."
2,Bansuri: The Flute,Drama,Hari Viswanathan,"['Rituparna Sengupta', 'Anurag Kashyap', 'Upen...","In a nutshell, ‘Bansuri’ has its heart at the ...","https://static.toiimg.com/thumb/msid-75670489,..."
3,The Big Bull,Crime Drama,Kookie V. Gulati,"['Abhishek Bachchan', 'Ileana D Cruz', 'Nikita...","Overall, ‘The Big Bull’ is a decent attempt to...","https://static.toiimg.com/thumb/msid-71147012,..."
4,Koi Jaane Na,Thriller,Amin Hajee,"['Amyra Dastur', 'Kunal Kapoor', 'Elli Avrram']",The most convincing part about this one is its...,"https://static.toiimg.com/thumb/msid-67540004,..."


# Step-3 : Data Preprocessing

In [8]:
# checking missing values for each column
movies.isnull().sum()

name         0
genre     1119
direc       59
act        851
desc      1245
poster       2
dtype: int64

In [9]:
# filling up missing values
movies.fillna('missing',inplace=True)

In [10]:
movies.isnull().sum()

name      0
genre     0
direc     0
act       0
desc      0
poster    0
dtype: int64

In [11]:
# removing spaces for each director names

movies.loc[movies['direc'].notnull(), 'direc'] = movies.loc[movies['direc'].notnull(), 'direc'].astype(str).str.replace(' ','')
movies['direc'] = movies['direc'].replace(np.nan, '', regex=True) # replacing nan value 

In [12]:
movies['direc']

0                   Prabhudeva
1       VishweshKrishnamoorthy
2              HariViswanathan
3               KookieV.Gulati
4                    AminHajee
                 ...          
3122              CurtisHanson
3123        GaspardUllielActor
3124         AishwaryaRaiActor
3125                  TimStory
3126          OmarGoodingActor
Name: direc, Length: 3127, dtype: object

In [13]:
# converting colums into lists

movies['desc']=movies['desc'].apply(lambda x:x.split())
movies['genre']=movies['genre'].apply(lambda x:x.split())
movies['direc']=movies['direc'].apply(lambda x:x.split())

In [14]:
# removing spaces for each actor name
movies['actor']=movies['act'].apply(lambda x: [name if len(name.split(" ")) == 1 else name.split(" ")[0] + name.split(" ")[1] for name in x.replace("[", "").replace("]", "").replace("'", "").split(", ")])

In [15]:
# concating desc,genre,actor,director into a single column named 'tags'
movies['tags']=movies['desc'] + movies['genre'] + movies['actor'] + movies['direc']
movies=movies.rename(columns={'name': 'title'})# renaming 'name' column into 'title' column

In [16]:
movies=movies[['title','tags','poster']] # final df we required 

In [17]:
movies.sample(5)

Unnamed: 0,title,tags,poster
2232,Hitman: Agent 47,"[The, action, is, slick, and, indeed,, very, s...","https://static.toiimg.com/thumb/msid-61243713,..."
2630,Machine Gun Preacher,"[What, really, makes, up, for, all, in, this, ...","https://static.toiimg.com/thumb/msid-61236070,..."
338,Padmaavat,"[‘Padmaavat’, is, an, entertaining,, large, ca...","https://static.toiimg.com/thumb/msid-61979171,..."
2655,What's Your Number?,"[missing, missing, missing, AnnaFarisActor]","https://static.toiimg.com/thumb/msid-61914123,..."
436,The Wishing Tree,"[There’s, a, lot, for, the, little, ones, to, ...","https://static.toiimg.com/thumb/msid-61303425,..."


In [18]:
movies['tags']=movies['tags'].apply(lambda x:" ".join(x)) # converting tag lists into String

In [19]:
movies['tags']=movies['tags'].apply(lambda x:x.lower()) # converting into lower case

In [20]:
# final data frame
movies.head()

Unnamed: 0,title,tags,poster
0,Radhe,this is definitely not the most wanted bhai fi...,"https://static.toiimg.com/thumb/msid-80380429,..."
1,99 Songs,a budding musician takes up the challenge to c...,"https://static.toiimg.com/thumb/msid-68883903,..."
2,Bansuri: The Flute,"in a nutshell, ‘bansuri’ has its heart at the ...","https://static.toiimg.com/thumb/msid-75670489,..."
3,The Big Bull,"overall, ‘the big bull’ is a decent attempt to...","https://static.toiimg.com/thumb/msid-71147012,..."
4,Koi Jaane Na,the most convincing part about this one is its...,"https://static.toiimg.com/thumb/msid-67540004,..."


# Step-4 : Stemming 'tags' column (converting all words to it's root word)

In [21]:
ps=PorterStemmer()

In [22]:
# for each text in tags apply stemming on it
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [23]:
movies['tags']=movies['tags'].apply(stem)

# Step-5 : Vectorization (counting most common 10000 words among all tags)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
cv=CountVectorizer(max_features=10000,stop_words='english')

In [26]:
vectors=cv.fit_transform(movies['tags']).toarray()

In [27]:
vectors.shape

(3127, 10000)

# Step-6 : Calculating Similarity (calculating angular distance between two vectors of movies)

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

In [29]:
similarity=cosine_similarity(vectors)

In [30]:
movies[movies['title']=='Radhe'].index[0]

0

In [31]:
sorted(list(enumerate(similarity[0])),reverse=True,key=lambda x:x[1])[1:11]

[(1401, 0.4055535528269063),
 (352, 0.3504383220252312),
 (2156, 0.3077935056255462),
 (77, 0.30348848933344197),
 (320, 0.29821002598961344),
 (2350, 0.29346959282671103),
 (2205, 0.27668578554642986),
 (1171, 0.27529888064467406),
 (1966, 0.2699527623995085),
 (1787, 0.2564945880212885)]

In [32]:
def recommend(movie):
    ind=movies[movies['title']==movie].index[0] # fetching the selected movie index
    distances=similarity[ind] # calculating similarity with other movies
    
    # sort the distances to find the most similar 10 movies 
    movie_list=sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:11]
    for i in movie_list:
        print(movies.iloc[i[0]].title)

# Step-7 : Checking Outputs

In [33]:
recommend('Radhe')

Wanted
Tiger Zinda Hai
Money Monster
Dabangg 3
Baaghi 2
The November Man
Everly
Bodyguard
Maze Runner: The Death Cure
21 Bridges


# Step-8 : Saving the model using pickle

In [34]:
import pickle

In [35]:
pickle.dump(movies,open('movie.pkl','wb'))

In [36]:
pickle.dump(similarity,open('similarity.pkl','wb'))

In [38]:
import gzip
with open('similarity.pkl', 'rb') as f:
    data = pickle.load(f)

with gzip.open('compressed_similarity.pkl.gz', 'wb') as f:
    pickle.dump(data, f)