In [354]:
import numpy as np
import pandas as pd
import ast                  #Used in tag creation

In [379]:
newMovies= pd.read_csv('NewMovies.csv')

In [380]:
newMovies.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"['Action', 'Adventure', 'Fantasy', 'ScienceFic..."
1,285,Pirates of the Caribbean: At World's End,"['Adventure', 'Fantasy', 'Action', 'ocean', 'd..."
2,206647,Spectre,"['Action', 'Adventure', 'Crime', 'spy', 'based..."
3,49026,The Dark Knight Rises,"['Action', 'Crime', 'Drama', 'Thriller', 'dcco..."
4,49529,John Carter,"['Action', 'Adventure', 'ScienceFiction', 'bas..."


In [382]:
#Creating a string out of tags
newMovies['tags']= newMovies['tags'].apply(lambda x: " ".join(x))
newMovies['tags']= newMovies['tags'].apply(lambda x: x.replace(' ',''))

In [383]:
newMovies['tags']= newMovies['tags'].apply(lambda x: x.lower())

In [384]:
newMovies.iloc[0].tags

"['action','adventure','fantasy','sciencefiction','cultureclash','future','spacewar','spacecolony','society','spacetravel','futuristic','romance','space','alien','tribe','alienplanet','cgi','marine','soldier','battle','loveaffair','antiwar','powerrelations','mindandsoul','3d','in','the','22nd','century,','a','paraplegic','marine','is','dispatched','to','the','moon','pandora','on','a','unique','mission,','but','becomes','torn','between','following','orders','and','protecting','an','alien','civilization.','ingeniousfilmpartners','twentiethcenturyfoxfilmcorporation','duneentertainment','lightstormentertainment','samworthington','zoesaldana','sigourneyweaver','jamescameron']"

In [360]:
#'action','actions' are similar but will act as different tag
# We will nltk library to solve this issue
!pip install nltk



In [385]:
newMovies.shape

(4805, 3)

In [386]:
import nltk

In [387]:
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()

In [388]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [389]:
newMovies['tags']= newMovies['tags'].apply(stem)

In [390]:
#Now we need to vectorise our tags by the method of Text Vectorization- Bag of Words
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=3000, stop_words='english')

In [391]:
vector= cv.fit_transform(newMovies['tags']).toarray()

In [392]:
vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [393]:
vector.shape

(4805, 3000)

In [394]:
cv.get_feature_names()

['000',
 '10',
 '11',
 '12',
 '13',
 '14',
 '1492pictures',
 '15',
 '16',
 '17',
 '18',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1980s',
 '19th',
 '19thcentury',
 '20',
 '20th',
 '24',
 '30',
 '3d',
 '40',
 '40acres',
 '50',
 'aaroneckhart',
 'abandoned',
 'ability',
 'able',
 'abuse',
 'academy',
 'accepts',
 'accident',
 'accidentally',
 'account',
 'accused',
 'ace',
 'act',
 'action',
 'actions',
 'activities',
 'actor',
 'actors',
 'actress',
 'actually',
 'adam',
 'adamsandler',
 'adaptation',
 'addiction',
 'adopted',
 'adoption',
 'adult',
 'adultery',
 'adventure',
 'adventures',
 'advertising',
 'advice',
 'affair',
 'afghanistan',
 'africa',
 'african',
 'aftercreditsstinger',
 'afterlife',
 'age',
 'aged',
 'agency',
 'agent',
 'agents',
 'aging',
 'ago',
 'agree',
 'agrees',
 'ahead',
 'aid',
 'aided',
 'air',
 'airplane',
 'airport',
 'al',
 'alaska',
 'albert',
 'alcohol',
 'alcoholic',
 'alcoholism',
 'alconentertainment',
 'alecbaldwin',
 'alex',
 'alice',
 

In [395]:
from sklearn.metrics.pairwise import cosine_similarity

In [396]:
similarity = cosine_similarity(vector)

In [397]:
similarity[0]

array([1.        , 0.09583148, 0.06277648, ..., 0.0270666 , 0.02988072,
       0.        ])

In [398]:
sorted(list(enumerate(similarity[0])), reverse=True, key= lambda x: x[1])[1:6]

[(1216, 0.346274924279844),
 (507, 0.2958682036329227),
 (539, 0.29424494316824984),
 (3730, 0.28128433856309726),
 (61, 0.2788866755113585)]

In [399]:
def recommend(movie):
    movie_index= newMovies[newMovies['title']== movie].index[0]
    distances= similarity[movie_index]
    movie_list= sorted(list(enumerate(distances)), reverse=True, key= lambda x:x[1])[1:6]
    
    for i in movie_list:
        print(newMovies.iloc[i[0]].title)

In [400]:
recommend('Avatar')

Aliens vs Predator: Requiem
Independence Day
Titan A.E.
Falcon Rising
Jupiter Ascending


In [402]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Batman
Batman & Robin
Batman


In [401]:
similarity.shape

(4805, 4805)