In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import gensim
import operator
import re
import string


In [89]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1..."


In [90]:
from spacy.lang.en.stop_words import STOP_WORDS
spacy_nlp = spacy.load('en_core_web_sm')
punctuations = string.punctuation
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [91]:
def spacy_tokenizer(sentence):
 
    #remove distracting single quotes
    sentence = re.sub('\'','',sentence)

    #remove digits adnd words containing digits
    sentence = re.sub('\w*\d\w*','',sentence)

    #replace extra spaces with single space
    sentence = re.sub(' +',' ',sentence)

    #remove unwanted lines starting from special charcters
    sentence = re.sub(r'\n: \'\'.*','',sentence)
    sentence = re.sub(r'\n!.*','',sentence)
    sentence = re.sub(r'^:\'\'.*','',sentence)
    
    #remove non-breaking new line characters
    sentence = re.sub(r'\n',' ',sentence)
    
    #remove punctunations
    sentence = re.sub(r'[^\w\s]',' ',sentence)
    
    #creating token object
    tokens = spacy_nlp(sentence)
    
    #lower, strip and lemmatize
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    
    #remove stopwords, and exclude words less than 2 characters
    tokens = [word for word in tokens if word not in stop_words and word not in punctuations and len(word) > 2]
    
    #return tokens
    return tokens


In [92]:
df['tokenized_wiki']= df ['wiki_plot'].map(lambda x: spacy_tokenizer(x))

In [93]:
df.head()

Unnamed: 0,rank,title,genre,wiki_plot,imdb_plot,tokenized_wiki
0,0,The Godfather,"[u' Crime', u' Drama']","On the day of his only daughter's wedding, Vit...","In late summer 1945, guests are gathered for t...","[day, daughter, wedding, vito, corleone, hear,..."
1,1,The Shawshank Redemption,"[u' Crime', u' Drama']","In 1947, banker Andy Dufresne is convicted of ...","In 1947, Andy Dufresne (Tim Robbins), a banker...","[banker, andy, dufresne, convict, murder, wife..."
2,2,Schindler's List,"[u' Biography', u' Drama', u' History']","In 1939, the Germans move Polish Jews into the...",The relocation of Polish Jews from surrounding...,"[germans, polish, jews, kraków, ghetto, world,..."
3,3,Raging Bull,"[u' Biography', u' Drama', u' Sport']","In a brief scene in 1964, an aging, overweight...","The film opens in 1964, where an older and fat...","[brief, scene, age, overweight, italian, ameri..."
4,4,Casablanca,"[u' Drama', u' Romance', u' War']",It is early December 1941. American expatriate...,"In the early years of World War II, December 1...","[early, december, american, expatriate, rick, ..."


In [94]:
movie_plot= df['tokenized_wiki']
movie_plot[:3]

0    [day, daughter, wedding, vito, corleone, hear,...
1    [banker, andy, dufresne, convict, murder, wife...
2    [germans, polish, jews, kraków, ghetto, world,...
Name: tokenized_wiki, dtype: object

In [95]:
from gensim import corpora
dictionary = corpora.Dictionary(movie_plot)


In [96]:
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)


In [116]:
vals=[[val,key] for key,val in dictionary.items() if key<=50]


In [112]:
bow= [dictionary.doc2bow(words) for words in movie_plot]



In [135]:
word_frequencies = [[(dictionary[id], frequency) for id, frequency in line] for line in bow[0:1]]
word_frequencies


[[('abrasive', 2),
  ('abroad', 2),
  ('abuse', 4),
  ('accept', 6),
  ('accuse', 2),
  ('act', 2),
  ('adams', 3),
  ('add', 2),
  ('address', 2),
  ('agree', 2),
  ('ambush', 2),
  ('angeles', 2),
  ('answer', 2),
  ('anthony', 3),
  ('apollonia', 2),
  ('arrange', 2),
  ('ask', 2),
  ('assassin', 4),
  ('assassination', 4),
  ('associate', 2),
  ('attack', 4),
  ('attempt', 4),
  ('authority', 2),
  ('aware', 2),
  ('baron', 2),
  ('barzini', 7),
  ('battle', 2),
  ('bed', 2),
  ('betrayal', 2),
  ('bodyguard', 2),
  ('bomb', 2),
  ('booth', 2),
  ('brasi', 3),
  ('brasis', 2),
  ('break', 2),
  ('bronx', 2),
  ('brother', 6),
  ('business', 6),
  ('buy', 2),
  ('capos', 3),
  ('captain', 2),
  ('car', 2),
  ('career', 2),
  ('carlo', 7),
  ('casino', 2),
  ('christening', 4),
  ('christmas', 2),
  ('clampdown', 2),
  ('clemenza', 3),
  ('collapse', 2),
  ('come', 4),
  ('command', 2),
  ('confess', 2),
  ('confront', 2),
  ('connection', 2),
  ('connie', 4),
  ('connies', 2),
  ('c

In [132]:
a=np.array(movie_plot)[0]
a.count('accept')

6

In [142]:
movie_tfidf_model = gensim.models.TfidfModel(bow, id2word=dictionary)
movie_lsi_model = gensim.models.LsiModel(movie_tfidf_model[bow], id2word=dictionary, num_topics=300)

In [148]:

print(movie_lsi_model)
print(movie_tfidf_model)
# movie_lsi_model.num_terms

LsiModel<num_terms=8420, num_topics=300, decay=1.0, chunksize=20000>
TfidfModel<num_docs=100, num_nnz=26018>


In [152]:
from gensim.similarities import MatrixSimilarity
sim_index = MatrixSimilarity(movie_lsi_model, num_features=movie_lsi_model.num_terms)

TypeError: object of type 'LsiModel' has no len()