#### CORPUS FOR plot summary

In [211]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string
spacy_nlp = spacy.load('en_core_web_sm')
import re
import pymongo
import pandas as pd

#### preprocessing

In [214]:
#create list of punctuations and stopwords
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#function for data cleaning and processing
#This can be further enhanced by adding / removing reg-exps as desired.

def spacy_tokenizer(sentence):
    try:
        #remove distracting single quotes
        sentence = re.sub('\'','',sentence)

        #remove digits adnd words containing digits
        sentence = re.sub('\w*\d\w*','',sentence)

        #replace extra spaces with single space
        sentence = re.sub(' +',' ',sentence)

        #remove unwanted lines starting from special charcters
        sentence = re.sub(r'\n: \'\'.*','',sentence)
        sentence = re.sub(r'\n!.*','',sentence)
        sentence = re.sub(r'^:\'\'.*','',sentence)
        
        #remove non-breaking new line characters
        sentence = re.sub(r'\n',' ',sentence)
        
        #remove punctunations
        sentence = re.sub(r'[^\w\s]',' ',sentence)
        
        #creating token object
        tokens = spacy_nlp(sentence)
        
        #lower, strip and lemmatize
        tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
        
        #remove stopwords, and exclude words less than 2 characters
        tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
        
        #return tokens
        return tokens
    except Exception as e:
        print(e)
        return []

In [221]:
myclient = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = myclient["contelligenz"]
mycol = mydb["movies"]
df_1 = pd.DataFrame(list(mycol.find()))
syn = df_1['synopsis']
print ('Cleaning and Tokenizing...')
%time movie_plot = syn.map(lambda x: spacy_tokenizer(x))

#### dictionary, corpus

In [217]:

from gensim import corpora

#creating term dictionary

dictionary = corpora.Dictionary(movie_plot)

#filter out terms which occurs in less than 4 documents and more than 20% of the documents.
#NOTE: Since we have smaller dataset, we will keep this commented for now.

#dictionary.filter_extremes(no_below=4, no_above=0.2)

#list of few which which can be further removed
stoplist = set('hello and if this can would should could tell ask stop come go')
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)

In [218]:

corpus = [dictionary.doc2bow(desc) for desc in movie_plot]

word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in corpus]

# print(word_frequencies)

#### model training and saving

In [219]:
import gensim
%time movie_tfidf_model = gensim.models.TfidfModel(corpus, id2word=dictionary)
%time movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[corpus], id2word=dictionary, num_topics=300)
%time gensim.corpora.MmCorpus.serialize('movie_tfidf_model_mm', movie_tfidf_model[corpus])
%time gensim.corpora.MmCorpus.serialize('movie_lsi_model_mm',movie_lsi_model[movie_tfidf_model[corpus]])

CPU times: user 20.1 ms, sys: 1.33 ms, total: 21.5 ms
Wall time: 20.5 ms
CPU times: user 5.3 s, sys: 289 ms, total: 5.59 s
Wall time: 1.43 s
CPU times: user 782 ms, sys: 1.05 s, total: 1.84 s
Wall time: 290 ms
CPU times: user 559 ms, sys: 76.1 ms, total: 635 ms
Wall time: 657 ms


#### loading model

In [220]:

#Load the indexed corpus
movie_tfidf_corpus = gensim.corpora.MmCorpus('movie_tfidf_model_mm')
movie_lsi_corpus = gensim.corpora.MmCorpus('movie_lsi_model_mm')

print(movie_tfidf_corpus)
print(movie_lsi_corpus)

MmCorpus(1028 documents, 12291 features, 67309 non-zero entries)
MmCorpus(1028 documents, 300 features, 270300 non-zero entries)


#### search

In [223]:

from gensim.similarities import MatrixSimilarity

%time movie_index = MatrixSimilarity(movie_lsi_corpus, num_features = movie_lsi_corpus.num_terms)
from operator import itemgetter

def search_similar_movies(search_term):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = movie_tfidf_model[query_bow]
    query_lsi = movie_lsi_model[query_tfidf]

    movie_index.num_best = 5

    movies_list = movie_index[query_lsi]

    movies_list.sort(key=itemgetter(1), reverse=True)
    movie_names = []

    for j, movie in enumerate(movies_list):

        movie_names.append (
            {
                'Relevance': round((movie[1] * 100),2),
                'Movie Title': df_1['title'][movie[0]],
                'Movie Plot': df_1['synopsis'][movie[0]]
            }

        )
        if j == (movie_index.num_best-1):
            break

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot'])



CPU times: user 166 ms, sys: 4.22 ms, total: 171 ms
Wall time: 169 ms


In [227]:
search_similar_movies('alien')

Unnamed: 0,Relevance,Movie Title,Movie Plot
0,97.72,PK,"\nAn alien (Aamir Khan), with a body similar t..."
1,13.8,Koi... Mil Gaya,\nCanada based Scientist Sanjay Mehra (Rakesh ...
2,10.52,Roja,"\nIn Srinagar, a Kashmiri terrorist, Wasim Kha..."
3,5.47,Kaho Naa... Pyaar Hai,\nRohit (Hrithik Roshan) and his younger broth...
4,4.41,Zinda,"\nThe film is about a rich, successful busines..."


In [224]:
search_similar_movies('violence protest march')


Unnamed: 0,Relevance,Movie Title,Movie Plot
0,58.82,Black Friday,"\nOn 9 March 1993 a small-time thug, Gul Moham..."
1,56.97,Talvar,\nThe film starts off with an introduction of ...
2,46.77,Gunday,\nThis movie is a disgrace for Bangladeshi lib...
3,27.87,Jolly LLB,\nJolly LLB is an upcoming Bollywood comedy dr...
4,19.7,Shahid,\nThe film opens with murder of Shahid Azmi (R...


In [225]:
search_similar_movies('love affair hate')

Unnamed: 0,Relevance,Movie Title,Movie Plot
0,54.22,Ishaqzaade,\nRising from the ashes of hooliganism and sma...
1,43.76,Thoda Pyaar Thoda Magic,"\nRanbeer Talwar (Saif Ali Khan), one of the c..."
2,34.58,Hum Tum,\nLife isn't always like the movies. Love isn'...
3,24.23,Rab Ne Bana Di Jodi,\nHave you ever stopped to think if the most o...
4,23.32,Aashayein,\nThe story revolves around Rahul Singh (John ...
