In [1]:
#import needed modules and packages
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# **The Movies Dataset**

In [2]:
# Load the movie metadata from movie_overviews.csv (the path is relative to the
# notebook's working directory); the displayed frame has id, title, overview
# and tagline columns.
movies_df = pd.read_csv("movie_overviews.csv")
movies_df

Unnamed: 0,id,title,overview,tagline
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
...,...,...,...,...
9094,159550,The Last Brickmaker in America,A man must cope with the loss of his wife and ...,
9095,392572,Rustom,"Rustom Pavri, an honourable officer of the Ind...",Decorated Officer. Devoted Family Man. Defendi...
9096,402672,Mohenjo Daro,"Village lad Sarman is drawn to big, bad Mohenj...",
9097,315011,Shin Godzilla,From the mind behind Evangelion comes a hit la...,A god incarnate. A city doomed.


# **Data Cleaning**

The title column will be cleaned by removing punctuation marks from the text where necessary. The cleaned titles will then be transformed into TF-IDF vectors and used to calculate the cosine similarity between a typed query and the movie titles.

In [3]:
#define a function to clean title

def clean_title(title):
    """Return `title` with everything except ASCII letters, digits and spaces removed.

    Case and spacing are left untouched, so a punctuation mark that sat
    between two spaces leaves both spaces behind.
    """
    non_title_chars = re.compile('[^a-zA-Z0-9 ]')
    return non_title_chars.sub('', title)

In [4]:
# Keep the raw title and store its punctuation-free form in a separate column
movies_df['cl_title'] = movies_df['title'].map(clean_title)

movies_df.head()

Unnamed: 0,id,title,overview,tagline,cl_title
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,Toy Story
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,Jumanji
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,Waiting to Exhale
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II


In [5]:
#transform the cleaned title column to arrays
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2) builds the vocabulary from single words and two-word phrases
# of the cleaned titles. fit_transform returns one TF-IDF row per title, in the
# same order as the rows of movies_df.
tfidf = TfidfVectorizer(ngram_range=(1,2))
transformed_cl_title = tfidf.fit_transform(movies_df['cl_title'])

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

#creating a function that checks for similarity in title name
def search(title, top_n=5):
    """Return the movies whose titles are most similar to the typed `title`.

    The query is cleaned with `clean_title` (the same cleaning applied to the
    indexed titles), vectorised with the fitted `tfidf`, and compared to every
    indexed title by cosine similarity.

    Parameters
    ----------
    title : str
        Free-text query typed by the user.
    top_n : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``overview``, ordered from the most to the least
        similar match, with a fresh 0..n-1 index. Row 0 is the best match.
    """
    # Clean the query like the indexed titles so both share one vocabulary
    # (e.g. "Spider-Man" must become "SpiderMan" to hit the indexed token).
    query_vec = tfidf.transform([clean_title(title)])
    similarity = cosine_similarity(query_vec, transformed_cl_title).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    top_n = min(top_n, similarity.size)
    if top_n <= 0:
        return movies_df.iloc[0:0].loc[:, ['title', 'overview']].reset_index(drop=True)

    # argpartition only guarantees that the last `top_n` positions hold the
    # largest scores, in arbitrary order, so rank those candidates explicitly.
    candidates = np.argpartition(similarity, -top_n)[-top_n:]
    ranked = candidates[np.argsort(similarity[candidates])[::-1]]

    results = movies_df.iloc[ranked]
    return results.loc[:, ['title', 'overview']].reset_index(drop=True)

## **Creating a Search Engine**

Using these cosine similarities, I will create a search engine that returns the most similar movie titles when a name is typed. I will also create a widget to display the results easily.

In [7]:
#movie search engine
import ipywidgets as widgets
from IPython.display import display

movie_input = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the search results each time the text box value changes.

    `data` is the change notification passed by the widget; its 'new' entry
    holds the current text. The output area is always cleared, and a search is
    only run once more than five characters have been typed.
    """
    with movie_list:
        movie_list.clear_output()
        typed = data['new']
        if len(typed) <= 5:
            return
        display(search(typed))


# Call on_type whenever the text box's `value` changes
movie_input.observe(on_type, names='value')

display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

### RECOMMENDATION ENGINE

**Data preparation**

The overview column will be used for creating the necessary features for the recommendation engine. 
First, I will clean this column by removing punctuation marks and stop words that carry little meaning in the text. I will also apply the WordNetLemmatizer to the remaining words.

Next, I will use TfidfVectorizer to extract features from the text.

In [8]:
#Recommendation system for movies

#define a function to clean overview

def clean_col(col):
    """Normalise the text in ``movies_df[col]`` in place and return ``movies_df``.

    Each value is lower-cased, stripped of punctuation, tokenised with
    ``nltk.word_tokenize``, filtered against the English stop-word list and
    lemmatised with ``WordNetLemmatizer``; the surviving words are joined with
    single spaces. Missing values become an empty string rather than the
    literal word "nan".

    Parameters
    ----------
    col : str
        Name of the text column of the global ``movies_df`` to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``movies_df`` object, with ``col`` overwritten by the cleaned text.
    """
    lemmatizer = WordNetLemmatizer()
    # Build the stop-word set once; it is identical for every row.
    stop_words = set(stopwords.words('english'))

    def _clean_text(text):
        # Clean a single cell value and return the normalised string.
        if pd.isna(text):
            return ''
        text = re.sub(r'[^\w\s]', '', str(text).lower())
        tokens = nltk.word_tokenize(text)
        return ' '.join(
            lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
        )

    # Column-wise apply replaces the row-by-row iterrows/.loc writes.
    movies_df[col] = movies_df[col].apply(_clean_text)
    return movies_df

In [9]:
# Runs the cleaning over the 'overview' column; clean_col writes the cleaned
# text back into movies_df['overview'], so the original overviews are replaced.
clean_col('overview')

Unnamed: 0,id,title,overview,tagline,cl_title
0,862,Toy Story,led woody andys toy live happily room andys b...,,Toy Story
1,8844,Jumanji,sibling judy peter discover enchanted board g...,Roll the dice and unleash the excitement!,Jumanji
2,15602,Grumpier Old Men,family wedding reignites ancient feud nextdoo...,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men
3,31357,Waiting to Exhale,cheated mistreated stepped woman holding brea...,Friends are the people who let you be yourself...,Waiting to Exhale
4,11862,Father of the Bride Part II,george bank recovered daughter wedding receiv...,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II
...,...,...,...,...,...
9094,159550,The Last Brickmaker in America,man must cope loss wife obsolescence job find...,,The Last Brickmaker in America
9095,392572,Rustom,rustom pavri honourable officer indian navy s...,Decorated Officer. Devoted Family Man. Defendi...,Rustom
9096,402672,Mohenjo Daro,village lad sarman drawn big bad mohenjo daro...,,Mohenjo Daro
9097,315011,Shin Godzilla,mind behind evangelion come hit larger life m...,A god incarnate. A city doomed.,Shin Godzilla


In [10]:
#transforming the cleaned overview column to arrays

from sklearn.feature_extraction.text import TfidfVectorizer
# ngram_range=(2,2) builds features from two-word phrases only. min_df=2 drops
# phrases found in fewer than two overviews; max_df=0.7 drops phrases found in
# more than 70% of them.
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7, ngram_range=(2, 2))
vectors = vectorizer.fit_transform(movies_df['overview'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary as a NumPy array.
print(vectorizer.get_feature_names_out())





In [11]:
#create a dataframe using the feature_names and arrays
# One row per movie and one column per phrase kept by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement.
tfidf_df = pd.DataFrame(vectors.toarray(), columns=vectorizer.get_feature_names_out())
tfidf_df.index = movies_df['title']
tfidf_df.head()

Unnamed: 0_level_0,00 agent,10 million,10 month,10 year,100 million,10year old,10yearold boy,10yearold daughter,10yearold girl,11 year,...,zac efron,zach confronts,zak gibbs,zak quickly,zenon kar,zola paris,zombie film,zombie outbreak,zombie survivor,zooey deschanel
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Recommendation System**

Here, the cosine similarities of the overview column will be calculated using cosine_similarity from sklearn.

From the calculated cosine similarities I will create a recommendation engine that recommends the movies most similar to the movie whose name is typed. I will also create a widget to display the recommendations easily.

In [12]:
#calculating cosine similarities of columns
from sklearn.metrics.pairwise import cosine_similarity

# Compare the sparse TF-IDF matrix directly: tfidf_df is just its dense copy,
# so the scores are the same while the computation skips all the zero entries.
cs_array = cosine_similarity(vectors)

# Label both axes with the movie titles so a score can be looked up by name
cs_df = pd.DataFrame(cs_array, index=tfidf_df.index, columns=tfidf_df.index)

cs_df.head()

title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II,Heat,Sabrina,Tom and Huck,Sudden Death,GoldenEye,...,Author: The JT LeRoy Story,Hell or High Water,Kingsglaive: Final Fantasy XV,Body,Sharknado 4: The 4th Awakens,The Last Brickmaker in America,Rustom,Mohenjo Daro,Shin Godzilla,The Beatles: Eight Days a Week - The Touring Years
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Jumanji,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Grumpier Old Men,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Waiting to Exhale,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Father of the Bride Part II,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
#define a function that sorts the calculated similarity
def sort_similarity(name, top_n=5):
    """Return the movies whose overviews are most similar to the movie `name`.

    Parameters
    ----------
    name : str
        A title present in the index of ``cs_df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by title, highest first. The queried title
        itself (and any other movie sharing exactly that title) is excluded.

    Raises
    ------
    KeyError
        If `name` is not a title in ``cs_df``.
    """
    scores = cs_df.loc[name, :]
    # Titles are not guaranteed to be unique: when several movies share `name`,
    # .loc returns one row per movie. Use the first of them.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[0]
    # A movie is always maximally similar to itself; do not recommend it back.
    scores = scores.drop(labels=name)
    # nlargest already returns the values in descending order.
    return scores.nlargest(top_n)

In [14]:
#recommendation system

movie_input_name = widgets.Text(value='Toy Story', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


# NOTE: this rebinds the notebook-level name `on_type`, which the search cell
# also defines. The search box is unaffected because it was registered with its
# own function object before this definition ran.
def on_type(data):
    """Refresh the recommendations each time the text box value changes.

    `data` is the change notification passed by the widget; its 'new' entry
    holds the current text. The output area is always cleared. Once more than
    five characters have been typed, the first row returned by `search` is used
    as the reference movie and its `sort_similarity` result is displayed.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        typed = data['new']
        if len(typed) <= 5:
            return
        matched_title = search(typed).iloc[0]['title']
        display(sort_similarity(matched_title))


# Call on_type whenever the text box's `value` changes
movie_input_name.observe(on_type, names='value')

display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()