In [15]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [16]:
df.shape

(250, 5)

In [17]:
# rake extracts keywords from "plot" column - NLP on plot

# initializing the new column
df['Key_words'] = ""

for index, row in df.iterrows():
    plot = row['Plot']
    
    # instantiating Rake, by default it uses english stopwords from NLTK
    # and discards all puntuation characters as well
    r = Rake()

    # extracting the words by passing the text
    r.extract_keywords_from_text(plot)

    # getting the dictionary whith key words as keys and their scores as values
    key_words_dict_scores = r.get_word_degrees()
    
    # assigning the key words to the new column for the corresponding movie
    row['Key_words'] = list(key_words_dict_scores.keys())

# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)

In [18]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","[two, imprisoned, men, bond, number, years, fi..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...","[aging, patriarch, organized, crime, dynasty, ..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...","[early, life, career, vito, corleone, 1920s, n..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","[menace, known, joker, emerges, mysterious, pa..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","[jury, holdout, attempts, prevent, miscarriage..."


In [19]:
df['bag_of_words'] = df['Genre'].str.lower() + " " + df['Director'].str.lower() + " " + df['Actors'].str.lower()

In [20]:
df['Key_words'] = df['Key_words'].str.join(" ")

In [21]:
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words,bag_of_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",two imprisoned men bond number years finding s...,"crime, drama frank darabont tim robbins, morga..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",aging patriarch organized crime dynasty transf...,"crime, drama francis ford coppola marlon brand..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",early life career vito corleone 1920s new york...,"crime, drama francis ford coppola al pacino, r..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",menace known joker emerges mysterious past wre...,"action, crime, drama christopher nolan christi..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",jury holdout attempts prevent miscarriage just...,"crime, drama sidney lumet martin balsam, john ..."


In [22]:
df['bag_of_words'] = df['bag_of_words'] + df['Key_words']

In [24]:
from string import punctuation
punct = punctuation

In [26]:
temp_string = ["this is a test string!", "this is another713127y381&YD&Y"]

In [32]:
def del_punct(text):
    text="".join([char for char in text if char not in punct])
    return text

In [33]:
del_punct(temp_string)

'this is a test string!this is another713127y381&YD&Y'

In [38]:
clean_BoW = list()
for text in df['bag_of_words']:
    text = del_punct(text)
    clean_BoW.append(text)

In [39]:
df['clean_BoW'] = clean_BoW

In [40]:
df = df[['Title','clean_BoW']]

In [44]:
df = df.set_index('Title')

In [45]:
# merge names into one - don't want to be repeated, and lower everything to avoid duplicates
# bag of words - everyword important for detection of similarity

# instantiating and generating the count matrix
count = CountVectorizer()
count_matrix = count.fit_transform(df['clean_BoW'])

# generating the cosine similarity matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [48]:
count_matrix.shape

(250, 3458)

In [49]:
# creating a Series for the movie titles so they are associated to an ordered numerical
# list I will use in the function to match the indexes
indices = pd.Series(df.index)

#  defining the function that takes in movie title 
# as input and returns the top 10 recommended movies # based on cosine similarity set earlier
def recommendations(title, cosine_sim = cosine_sim):
    
    # initializing the empty list of recommended movies
    recommended_movies = []
    
    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies # drops the first will be the same movie
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # populating the list with the titles of the best 10 matching movies
    for i in top_10_indexes:
        recommended_movies.append(list(df.index)[i])
        
    return recommended_movies

In [52]:
recommendations('Reservoir Dogs')

['Pulp Fiction',
 'Heat',
 'Fargo',
 'The Departed',
 'A Clockwork Orange',
 'Kill Bill: Vol. 1',
 'The Green Mile',
 'Sin City',
 'The Usual Suspects',
 'Rope']