In [12]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the movie dataset from data.world (requires network access)
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

# Keep only the columns used to build the recommender.  The explicit .copy()
# makes `df` an independent frame, so adding columns to it later cannot
# trigger pandas' SettingWithCopyWarning on a column subset.
df = df[['Title','Genre','Director','Actors','Plot']].copy()

In [13]:
# Fetch the NLTK stopwords corpus: the keyword-extraction step instantiates
# Rake(), which relies on NLTK's English stopwords by default.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Preview the first two rows of the selected columns
df.head(2)

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...


In [17]:
# Extract the key words of every plot with RAKE and store them in a new column
def _plot_keywords(plot):
    """Return the list of key words RAKE extracts from one plot string."""
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all punctuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # the dictionary has key words as keys and their degree scores as values;
    # only the words themselves are kept
    return list(r.get_word_degrees().keys())

# Build the whole column in one assignment.  Writing to the `row` yielded by
# df.iterrows() is not guaranteed to reach the DataFrame (the row may be a
# copy), which would silently leave Key_words empty.
df['Key_words'] = df['Plot'].apply(_plot_keywords)

# dropping the Plot column (reassignment instead of inplace=True)
df = df.drop(columns = ['Plot'])

In [23]:
# Lower-case the director name and strip the spaces so the full name
# collapses into one word (e.g. "frankdarabont")
df['Director'] = df['Director'].str.lower().str.replace(' ', '')

In [29]:
# Lower-case the actor list, glue each name into one word by removing the
# spaces, then turn the comma separators into spaces
df['Actors'] = (
    df['Actors']
    .str.lower()
    .str.replace(' ', '')
    .str.replace(',', ' ')
)

In [37]:
# Lower-case the genres and remove the commas; the space that followed each
# comma is kept, so the genres stay separate words
df['Genre'] = df['Genre'].str.lower().str.replace(',', '')

In [45]:
# Preview the cleaned columns.
# NOTE(review): the recorded output of this cell already shows a
# `bag_of_words` column, which is only created in a later cell -- the saved
# output appears to come from out-of-order execution; restart the kernel and
# run all cells to refresh it.
df.head(2)

Unnamed: 0,Title,Genre,Director,Actors,Key_words,bag_of_words
0,The Shawshank Redemption,crime drama,frankdarabont,timrobbins morganfreeman bobgunton williamsadler,"[two, imprisoned, men, bond, number, years, fi...",crime drama frankdarabont timrobbins morganfre...
1,The Godfather,crime drama,francisfordcoppola,marlonbrando alpacino jamescaan richards.caste...,"[aging, patriarch, organized, crime, dynasty, ...",crime drama francisfordcoppola marlonbrando al...


In [65]:
def list_to_str(word_list):
    """Concatenate the items of ``word_list``, each followed by one space.

    The result therefore ends with a trailing space whenever ``word_list``
    is non-empty; an empty input gives an empty string.
    """
    return ''.join(word + ' ' for word in word_list)

In [68]:
# Flatten each list of key words into a single space-separated string.
# Not idempotent: run this cell only once per fresh Key_words column.
df['Key_words'] = df['Key_words'].map(list_to_str)

In [73]:
# Join genre, director, actors and key words into one space-separated text
# per movie
df['bag_of_words'] = df['Genre'].str.cat(
    [df['Director'], df['Actors'], df['Key_words']],
    sep=' ',
)

In [78]:
# Only the title and its combined text are needed from here on
df = df.loc[:, ['Title', 'bag_of_words']]

In [81]:
# Preview the final two-column frame (title + combined text)
df.head(2)

Unnamed: 0,Title,bag_of_words
0,The Shawshank Redemption,crime drama frankdarabont timrobbins morganfre...
1,The Godfather,crime drama francisfordcoppola marlonbrando al...


In [90]:
# Turn every bag_of_words string into a row of token counts
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])

# Pairwise cosine similarity between all movies; when called with a single
# matrix, cosine_similarity compares its rows against themselves
cosine_sim = cosine_similarity(count_matrix)

In [99]:
# Series of titles, used by recommendations() to locate a movie's row
indices = pd.Series(df['Title'])

In [100]:
# Display the title lookup Series
indices

0      The Shawshank Redemption
1                 The Godfather
2        The Godfather: Part II
3               The Dark Knight
4                  12 Angry Men
                 ...           
245            The Lost Weekend
246               Short Term 12
247             His Girl Friday
248          The Straight Story
249         Slumdog Millionaire
Name: Title, Length: 250, dtype: object

In [106]:


# defining the function that takes in a movie title
# as input and returns the top recommended movies (10 by default)
def recommendations(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row of ``df``.  Defaults to the matrix computed above.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most similar first.  The queried movie
        itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` does not match any movie.
    """
    # getting the row position (not the index label) of the first movie that
    # matches the title, because cosine_sim is addressed by position
    matches = np.flatnonzero((indices == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no movie titled {title!r} in the dataset")
    pos = matches[0]

    # similarity of the queried movie to every movie, labelled by row position
    score_series = pd.Series(cosine_sim[pos])

    # remove the queried movie explicitly instead of assuming it sorts first:
    # with tied scores it is not guaranteed to be at the top
    score_series = score_series.drop(pos).sort_values(ascending = False)

    # row positions of the most similar movies
    top_positions = list(score_series.index[:top_n])

    # look the titles up once, by position
    return df['Title'].iloc[top_positions].tolist()

In [107]:
# Example: recommendations for 'Fargo'
recommendations('Fargo')

['No Country for Old Men',
 'The Departed',
 'Rope',
 'The Big Lebowski',
 'Reservoir Dogs',
 'The Godfather',
 'The Godfather: Part II',
 'On the Waterfront',
 'Goodfellas',
 'Arsenic and Old Lace']