In [1]:
import pandas as pd
import numpy as np

import requests

import html5lib
from bs4 import BeautifulSoup

import tensorflow
from tensorflow import keras
from keras.saving import load_model
import pickle

import nltk
from nltk.tokenize import sent_tokenize
import regex as re
import contractions
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the saved Keras model without compiling it; this notebook only calls
# `predict` on it.
model = load_model('model.keras',compile = False)

# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file, so
# 'tokenizer.pkl' must come from a trusted source.
# The context manager closes the file handle as soon as the load finishes
# (the previous bare `open(...)` left it open).
with open('tokenizer.pkl','rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [3]:
# Load the movie metadata table; the lookup helpers in this notebook filter it by 'title'.
metadata = pd.read_csv('clean_metadata.csv')
# NOTE(review): the 'Unnamed: 0' column in the rendered output looks like an index
# that was written into the CSV -- confirm whether it is needed.
# Preview the first rows.
metadata.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,...,revenue,runtime,status,title,vote_average,vote_count,keywords,director,film_cast,film_characters
0,False,30000000,"Animation,Comedy,Family",862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,...,373554033.0,81.0,Released,Toy Story,7.7,5415.0,jealousy toy boy friendship friends rivalry bo...,John Lasseter,"Tom Hanks,Tim Allen,Don Rickles,Jim Varney,Wal...","Woody (voice),Buzz Lightyear (voice),Mr. Potat..."
1,False,65000000,"Adventure,Fantasy,Family",8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures,Teitler Film,Interscope Commu...",...,262797249.0,104.0,Released,Jumanji,6.9,2413.0,board game disappearance new home recluse gian...,Joe Johnston,"Robin Williams,Jonathan Hyde,Kirsten Dunst,Bra...","Alan Parrish,Samuel Alan Parrish / Van Pelt,Ju..."
2,False,0,"Romance,Comedy",15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros.,Lancaster Gate",...,0.0,101.0,Released,Grumpier Old Men,6.5,92.0,fishing best friend duringcreditsstinger old men,Howard Deutch,"Walter Matthau,Jack Lemmon,Ann-Margret,Sophia ...","Max Goldman,John Gustafson,Ariel Gustafson,Mar..."
3,False,16000000,"Comedy,Drama,Romance",31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,...,81452156.0,127.0,Released,Waiting to Exhale,6.1,34.0,based on novel interracial relationship single...,Forest Whitaker,"Whitney Houston,Angela Bassett,Loretta Devine,...","Robin Stokes,Marvin King,Kenneth Dawkins,John ..."
4,False,0,Comedy,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,"Sandollar Productions,Touchstone Pictures",...,76578911.0,106.0,Released,Father of the Bride Part II,5.7,173.0,baby midlife crisis confidence aging daughter ...,Charles Shyer,"Steve Martin,Diane Keaton,Martin Short,Kimberl...","George Banks,Nina Banks,Franck Eggelhoffer,Ann..."


In [4]:
# Load the table whose 'tags' column is vectorised for the recommender and whose
# 'title' column is used to look movies up.
tag_data = pd.read_csv('final_clean.csv')
# NOTE(review): the 'Unnamed: 0' column in the rendered output looks like an index
# that was written into the CSV -- confirm whether it is needed.
# Preview the first rows.
tag_data.head()

Unnamed: 0,title,overview,keywords,genres,director,film_cast,tags
0,Toy Story,"led by woody, andy's toy live happily in his r...",jealousy toy boy friendship friend rivalry boy...,animation comedy family,johnlasseter,tomhanks timallen donrickles jimvarney wallace...,"led by woody, andy toy live happily in his roo..."
1,Jumanji,when sibling judy and peter discover an enchan...,board game disappearance new home recluse gian...,adventure fantasy family,joejohnston,robinwilliams jonathanhyde kirstendunst bradle...,when sibling judy and peter discover an enchan...
2,Grumpier Old Men,a family wedding reignites the ancient feud be...,fishing best friend duringcreditsstinger old men,romance comedy,howarddeutch,waltermatthau jacklemmon ann-margret sophialor...,a family wedding reignites the ancient feud be...
3,Waiting to Exhale,"cheated on, mistreated and stepped on, the wom...",based on novel interracial relationship single...,comedy drama romance,forestwhitaker,whitneyhouston angelabassett lorettadevine lel...,"cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,just when george bank ha recovered from his da...,baby midlife crisis confidence aging daughter ...,comedy,charlesshyer,stevemartin dianekeaton martinshort kimberlywi...,just when george bank ha recovered from his da...


In [5]:
def convert_to_tokens():
    """Vectorise ``tag_data['tags']`` and min-max rescale the counts.

    Returns:
        The ``CountVectorizer`` output (English stop words removed), rescaled
        as ``(counts - min) / (max - min)`` using the extremes of the whole
        count matrix.
    """
    vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
    counts = vectorizer.fit_transform(tag_data['tags'])
    lowest = np.min(counts)
    highest = np.max(counts)
    # Min-max rescale with the global minimum and maximum count.
    return (counts - lowest) / (highest - lowest)


# Computed once at load time; `recommend` indexes into this matrix by row.
scaled_transformed = convert_to_tokens()

In [6]:
def recommend(movie_title):
    """Return the five titles whose tag vectors are most similar to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``tag_data['title']``. If the
            title occurs more than once, the first occurrence is used.

    Returns:
        A DataFrame with a single ``'Title'`` column holding up to five titles,
        ordered from most to least cosine-similar. The queried row itself is
        excluded.

    Raises:
        IndexError: If ``movie_title`` is not present in ``tag_data``.
    """
    titles = tag_data['title'].to_numpy()
    # Work with row positions (not index labels) because `scaled_transformed`
    # is addressed by position.
    matches = np.flatnonzero((tag_data['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found in tag_data: {movie_title!r}")
    user_position = matches[0]

    # One vectorised call compares the queried row against every row at once,
    # instead of invoking cosine_similarity once per movie in a Python loop.
    similarities = cosine_similarity(scaled_transformed[user_position], scaled_transformed)[0]

    # Drop the queried movie itself before ranking.
    keep = np.arange(len(titles)) != user_position
    ranked = pd.DataFrame({'Title': titles[keep], 'Similarity': similarities[keep]})
    return ranked.sort_values(by = 'Similarity',ascending = False)[['Title']].head(5)

In [8]:
#FUNCTION TO GET MOVIE DETAILS

def get_movie_details(movie_title):
    """Look up display details for ``movie_title`` in ``metadata``.

    Missing values are shown as ``'Unknown'``. Comma-separated genre and cast
    strings are re-joined with ``', '``.

    Returns:
        A tuple ``(overview, genre, director, film_cast)`` taken from the first
        row whose ``'title'`` equals ``movie_title``.
    """
    matching_rows = metadata[metadata['title'] == movie_title].fillna('Unknown')

    def first_value(column):
        # Value of `column` in the first matching row.
        return matching_rows[column].values[0]

    overview_text = first_value('overview')
    genre_text = ', '.join(first_value('genres').split(','))
    director_name = first_value('director')
    cast_text = ', '.join(first_value('film_cast').split(','))
    return overview_text, genre_text, director_name, cast_text

In [9]:
#FUNCTION TO GET MOVIE POSTER

def get_poster(movie_title, timeout=10):
    """Scrape the poster image URL for ``movie_title`` from movieposterdb.com.

    The movie's ``'imdb_id'`` is read from ``metadata`` and used as the search
    query; the result is the ``data-src`` attribute of the first ``<img>``
    inside the ``div`` with class ``'row mgrid'``.

    Args:
        movie_title: Exact title to look up in ``metadata['title']``.
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled server would block forever.

    Returns:
        The poster URL string, or ``None`` when the page contains no result
        grid, no image, or the image has no ``data-src`` attribute.

    Raises:
        IndexError: If ``movie_title`` is not present in ``metadata``.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    movie_id = metadata[metadata['title'] == movie_title]['imdb_id'].values[0]
    url = 'https://www.movieposterdb.com/search?q=%09'+str(movie_id)+'&imdb=0'
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text,'html5lib')

    # `find` returns None when the element is absent; guard each step so a
    # missing result yields None instead of an AttributeError/TypeError.
    result_grid = soup.find('div',attrs={'class':'row mgrid'})
    if result_grid is None:
        return None
    poster = result_grid.find('img')
    if poster is None:
        return None
    return poster.get('data-src')

In [10]:
#FUNCTION TO GET ROTTEN TOMATOES URL

def get_url(movie_title, timeout=10):
    """Find the Rotten Tomatoes page URL for ``movie_title``.

    The search query is the lower-cased title (with ``': '`` replaced by a
    space) followed by the director's name when one is available in
    ``metadata``.

    Args:
        movie_title: Exact title to look up in ``metadata['title']``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``href`` of the matching search result, or ``0`` when nothing
        matches (callers in this notebook compare the result against ``0``).
        If the search returns exactly one result it is accepted as-is;
        otherwise the result text must equal the title, either as stored or
        with ``': '`` replaced by a space.

    Raises:
        IndexError: If ``movie_title`` is not present in ``metadata``.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    search_url = 'https://www.rottentomatoes.com/search?search='

    data = metadata[metadata['title'] == movie_title]
    og_title = data['title'].values[0]
    title = og_title.replace(': ',' ')

    query = title
    if not data['director'].isna().values[0]:
        query = title+' '+data['director'].values[0]

    # Percent-encode the whole query: spaces become %20 as before, and
    # characters such as '&', '#' or '?' no longer corrupt the query string.
    r = requests.get(search_url+quote(query.lower(), safe=''), timeout=timeout)
    soup = BeautifulSoup(r.text,'html5lib')

    table = soup.find('ul',attrs={'slot':'list'})
    if table is None:
        # No result list on the page: treat it as "not found".
        return 0

    # Collect the result links once instead of re-querying on every iteration.
    links = table.find_all('a',attrs = {'data-qa':'info-name'})
    if len(links) == 1:
        return links[0]['href']

    accepted_names = {og_title.strip(), title.strip()}
    for link in links:
        if link.text.strip() in accepted_names:
            return link['href']

    return 0

In [11]:
#FUNCTIONS FOR PREPROCESSING THE REVIEWS

def lower_text(review):
    """Return ``review`` converted to lower case."""
    lowered = review.lower()
    return lowered

def initial_clean(review):
    """Apply the first pass of punctuation and markup cleanup to a review.

    Steps, in order:
      1. delete ``( ) " , ; : *`` and turn ``-`` and ``/`` into spaces;
      2. replace the literal ``i.e.`` with a space;
      3. replace anything that looks like an HTML tag with a space;
      4. replace runs of two or more dots with a space;
      5. collapse repeated ``?`` to one ``?`` and repeated ``!`` to one ``!``.

    Args:
        review: Review text.

    Returns:
        The cleaned text.
    """
    # One translate pass replaces the chain of single-character `replace`
    # calls. NOTE(review): the original chain also contained
    # `replace('', '')`, which is a no-op; it may have held a character that
    # was lost -- confirm.
    char_rules = str.maketrans('-/', '  ', '()",;:*')
    new_review = review.translate(char_rules)
    new_review = new_review.replace('i.e.',' ')
    # Raw strings avoid invalid-escape warnings from '\.', '\?' and '\!'.
    new_review = re.sub(r'<[^>]*>',' ',new_review)
    new_review = re.sub(r'\.{2,}',' ',new_review)
    new_review = re.sub(r'\?{2,}','?',new_review)
    new_review = re.sub(r'!{2,}','!',new_review)
    return new_review

def fix_contractions(review):
    """Expand contractions in ``review`` one word at a time.

    The text is split into sentences with ``sent_tokenize``, each sentence is
    split on single spaces, every piece is passed through
    ``contractions.fix``, and the results are joined with single spaces.
    """
    expanded_words = [
        contractions.fix(token)
        for sentence in sent_tokenize(review)
        for token in sentence.split(' ')
    ]
    return " ".join(expanded_words)

def final_clean(review):
    """Strip possessives, punctuation and numbers from a review.

    Steps, in order:
      1. remove every ``'s``;
      2. remove every character listed in ``string.punctuation``;
      3. remove number tokens with an ordinal/plural suffix
         (``1st``, ``2nd``, ``3rd``, ``4th``, ``90s``);
      4. remove any remaining digit runs.

    Args:
        review: Review text.

    Returns:
        The cleaned text. Removed tokens are not replaced, so surrounding
        spaces are left in place.
    """
    new_review = review.replace("'s",'')
    # A single translate pass deletes all punctuation characters at once.
    new_review = new_review.translate(str.maketrans('', '', string.punctuation))
    new_review = re.sub(r'\b\d+(st|nd|rd|th|s)\b','',new_review)
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    new_review = re.sub(r'\d+','',new_review)
    return new_review


def remove_extra_space(review):
    """Collapse runs of spaces so words are separated by exactly one space.

    The text is split on single spaces, each piece is stripped of surrounding
    whitespace, and pieces that are empty after stripping are dropped. This
    includes whitespace-only pieces such as ``"\\n"``, which previously left a
    double space in the result.

    Args:
        review: Review text.

    Returns:
        The words joined by single spaces, with no leading or trailing space.
    """
    stripped_words = (word.strip() for word in review.split(' '))
    return " ".join(word for word in stripped_words if word)

# English stop words stored as dict keys (every value is 1) so that
# `remove_stopwords` can test membership with a hash lookup.
stopwords_dict = dict.fromkeys(stopwords.words('english'), 1)

def remove_stopwords(review):
    """Drop every word of ``review`` that is a key of ``stopwords_dict``.

    The text is split on single spaces. Each kept word is followed by one
    space, so a non-empty result ends with a trailing space.
    """
    kept_words = [token + " " for token in review.split(' ') if token not in stopwords_dict]
    return "".join(kept_words)

def lemmatize_review(review):
    """Lemmatize every space-separated word of ``review`` with WordNet.

    Returns:
        The lemmatized words separated by single spaces, with surrounding
        whitespace stripped.
    """
    lemmatizer = WordNetLemmatizer()
    pieces = [lemmatizer.lemmatize(token) + " " for token in review.split(' ')]
    return "".join(pieces).strip()

In [12]:
#FUNCTION TO GET CRITICS REVIEWS

def get_critics_review(movie_title, timeout=10):
    """Scrape up to five top-critic reviews for ``movie_title``.

    The movie page is located with ``get_url``; reviews are read from the
    ``p`` elements with class ``'review-text'`` inside the ``div`` with class
    ``'review_table'`` on the ``/reviews?type=top_critics`` page.

    Args:
        movie_title: Exact title, as accepted by ``get_url``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with one ``'Reviews'`` column holding at most five
        non-empty review texts. It is empty when the movie page cannot be
        found or the page has no review table.
    """
    start_url = get_url(movie_title)
    if start_url == 0:
        return pd.DataFrame([],columns = ['Reviews'])

    r = requests.get(start_url+"/reviews?type=top_critics", timeout=timeout)
    soup = BeautifulSoup(r.text,'html5lib')

    review_table = soup.find('div',attrs = {'class':'review_table'})
    if review_table is None:
        # No review table on the page: report "no reviews" instead of
        # failing with an AttributeError on None.
        return pd.DataFrame([],columns = ['Reviews'])

    texts = (tag.text.strip() for tag in review_table.find_all('p',attrs = {'class':'review-text'}))
    data = [text for text in texts if text]

    return pd.DataFrame(data[:5],columns = ['Reviews'])

In [13]:
#FUNCTION TO GET AUDIENCE REVIEWS

def get_audience_review(movie_title, timeout=10):
    """Scrape up to ten audience reviews for ``movie_title``.

    The movie page is located with ``get_url``; reviews are read from the
    ``p`` elements with ``slot="content"`` inside the ``div`` with class
    ``'review_table'`` on the ``/reviews?type=user`` page.

    Args:
        movie_title: Exact title, as accepted by ``get_url``.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A DataFrame with one ``'Reviews'`` column holding at most ten
        non-empty review texts. It is empty when the movie page cannot be
        found or the page has no review table.
    """
    start_url = get_url(movie_title)
    if start_url == 0:
        return pd.DataFrame([],columns = ['Reviews'])

    r = requests.get(start_url+"/reviews?type=user", timeout=timeout)
    soup = BeautifulSoup(r.text,'html5lib')

    review_table = soup.find('div',attrs = {'class':'review_table'})
    if review_table is None:
        # No review table on the page: report "no reviews" instead of
        # failing with an AttributeError on None.
        return pd.DataFrame([],columns = ['Reviews'])

    texts = (tag.text.strip() for tag in review_table.find_all('p',attrs = {'slot':'content'}))
    data = [text for text in texts if text]

    return pd.DataFrame(data[:10],columns = ['Reviews'])

In [14]:
#FUNCTION TO PREDICT SENTIMENT OF REVIEWS

def sentiment_analysis(reviews):
    """Label every review in ``reviews`` as ``'Positive'`` or ``'Negative'``.

    The ``'Reviews'`` column is run through the notebook's cleaning functions,
    converted to padded token sequences of length 200 with ``tokenizer``, and
    scored with ``model``. Scores of 0.5 or more are labelled ``'Positive'``.

    Args:
        reviews: DataFrame with a ``'Reviews'`` text column.

    Returns:
        The same DataFrame object, with a ``'Sentiment'`` column added in
        place. An empty frame is returned unchanged, without that column.
    """
    if reviews.shape[0] == 0:
        return reviews

    # Cleaning stages, applied in this exact order.
    preprocessing_steps = (
        lower_text,
        initial_clean,
        fix_contractions,
        final_clean,
        remove_extra_space,
        remove_stopwords,
        lemmatize_review,
    )
    cleaned_reviews = reviews['Reviews']
    for step in preprocessing_steps:
        cleaned_reviews = cleaned_reviews.apply(step)

    sequences = tokenizer.texts_to_sequences(cleaned_reviews)
    review_tokens = pad_sequences(sequences,maxlen=200)

    def to_label(score):
        # Scores at or above 0.5 count as positive.
        return 'Positive' if score >=0.5 else 'Negative'

    # NOTE(review): assumes the model yields one score per review -- confirm.
    reviews['Sentiment'] = model.predict(review_tokens,verbose = 0)
    reviews['Sentiment'] = reviews['Sentiment'].apply(to_label)

    return reviews