In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import gensim
import operator
import re
import string


In [18]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [19]:
from spacy.lang.en.stop_words import STOP_WORDS
spacy_nlp = spacy.load('en_core_web_sm')
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [20]:
def spacy_tokenizer(sentence):
 
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)
    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    
    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    #return tokens
    return tokens


In [21]:
df['tokenized_wiki']= df ['wiki_plot'].map(lambda x: spacy_tokenizer(x))

In [26]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,tokenized_wiki
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t...","[day, daughter, wedding, vito, corleone, hear,..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker...","[banker, andy, dufresne, convict, murder, wife..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...,"[germans, polish, jews, kraków, ghetto, world,..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat...","[brief, scene, age, overweight, italian, ameri..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1...","[early, december, american, expatriate, rick, ..."


In [27]:
movie_plot= df['tokenized_wiki']
movie_plot[:3]

0    [day, daughter, wedding, vito, corleone, hear,...
1    [banker, andy, dufresne, convict, murder, wife...
2    [germans, polish, jews, kraków, ghetto, world,...
Name: tokenized_wiki, dtype: object

In [28]:
from gensim import corpora
dictionary = corpora.Dictionary(movie_plot)


In [30]:
vals=[[val,key] for key,val in dictionary.items() if key<=50]


In [31]:
bow= [dictionary.doc2bow(words) for words in movie_plot]



In [32]:
word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in bow[0:1]]
word_frequencies


[[('abrasive', 2),
  ('abroad', 2),
  ('abuse', 4),
  ('accept', 6),
  ('accuse', 2),
  ('act', 2),
  ('adams', 3),
  ('add', 2),
  ('address', 2),
  ('agree', 2),
  ('ambush', 2),
  ('angeles', 2),
  ('answer', 2),
  ('anthony', 3),
  ('apollonia', 2),
  ('arrange', 2),
  ('ask', 2),
  ('assassin', 4),
  ('assassination', 4),
  ('associate', 2),
  ('attack', 4),
  ('attempt', 4),
  ('authority', 2),
  ('aware', 2),
  ('baron', 2),
  ('barzini', 7),
  ('battle', 2),
  ('bed', 2),
  ('betrayal', 2),
  ('bodyguard', 2),
  ('bomb', 2),
  ('booth', 2),
  ('brasi', 3),
  ('brasis', 2),
  ('break', 2),
  ('bronx', 2),
  ('brother', 6),
  ('business', 6),
  ('buy', 2),
  ('capos', 3),
  ('captain', 2),
  ('car', 2),
  ('career', 2),
  ('carlo', 7),
  ('casino', 2),
  ('christening', 4),
  ('christmas', 2),
  ('clampdown', 2),
  ('clemenza', 3),
  ('collapse', 2),
  ('come', 4),
  ('command', 2),
  ('confess', 2),
  ('confront', 2),
  ('connection', 2),
  ('connie', 4),
  ('connies', 2),
  ('c

In [33]:
a=np.array(movie_plot)[0]
a.count('accept')

6

In [34]:
movie_tfidf_model = gensim.models.TfidfModel(bow, id2word=dictionary)
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[bow], id2word=dictionary, num_topics=300)

In [35]:
tfidif_mode=movie_tfidf_model[bow]
lsi_model=movie_lsi_model[movie_tfidf_model[bow]]




In [36]:

print(tfidif_mode)
print(lsi_model)
# print(len(dictionary),movie_lsi_model.num_terms)


<gensim.interfaces.TransformedCorpus object at 0x178b1beb0>
<gensim.interfaces.TransformedCorpus object at 0x178b1b9a0>


In [37]:
from gensim.similarities import MatrixSimilarity
sim_index = MatrixSimilarity(lsi_model, num_features=len(dictionary))

In [38]:
from operator import itemgetter

def search_similar_movies(search_term):

    query_bow = dictionary.doc2bow(spacy_tokenizer(search_term))
    query_tfidf = movie_tfidf_model[query_bow]
    query_lsi = movie_lsi_model[query_tfidf]

    sim_index.num_best = 5

    movies_list = sim_index[query_lsi]

    movies_list.sort(key=itemgetter(1), reverse=True)
    movie_names = []

    for j, movie in enumerate(movies_list):

        movie_names.append (
            {
                'Relevance': round((movie[1] * 100),2),
                'Movie Title': df['title'][movie[0]],
                'Movie Plot': df['wiki_plot'][movie[0]]
            }

        )
        if j == (sim_index.num_best-1):
            break

    return pd.DataFrame(movie_names, columns=['Relevance','Movie Title','Movie Plot'])

In [39]:
(sim_index.num_best,sim_index.num_features)



(None, 8420)

In [40]:
search_similar_movies('violence protest march ')

Unnamed: 0,Relevance,Movie Title,Movie Plot
0,73.92,Gandhi,The screenplay of Gandhi is available as a pub...
1,52.1,A Clockwork Orange,"In futuristic London, Alex DeLarge is the lead..."
2,31.72,Amadeus,The story begins in 1823 as the elderly Antoni...
3,21.7,The Best Years of Our Lives,"After World War II, Fred Derry (Dana Andrews),..."
4,13.31,All Quiet on the Western Front,\n\n\n\n\nThis section's plot summary may be t...


In [44]:
a=search_similar_movies('love and romance')[[]]
a.to_dict('records')

[{'Relevance': 83.71,
  'Movie Title': 'Network',
  'Movie Plot': 'Howard Beale, the longtime anchor of the Union Broadcasting System\'s UBS Evening News, learns from the news division president, Max Schumacher, that he has just two more weeks on the air because of declining ratings. The two old friends get roaring drunk and lament the state of their industry. The following night, Beale announces on live television that he will commit suicide on next Tuesday\'s broadcast. UBS fires him after this incident, but Schumacher intervenes so that Beale can have a dignified farewell. Beale promises he will apologize for his outburst, but once on the air, he launches back into a rant claiming that life is "bullshit". Beale\'s outburst causes the newscast\'s ratings to spike, and much to Schumacher\'s dismay, the upper echelons of UBS decide to exploit Beale\'s antics rather than pull him off the air. In one impassioned diatribe, Beale galvanizes the nation, persuading his viewers to shout out o

In [None]:
a.to