## Example of document clustering using movie plots

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3

### 1. Read input data with a selection of movies

In [6]:
os.chdir(r'C:\Users\CAS85405\Desktop\Movie_plots')
input_data = pd.read_csv('Sample.csv',index_col=0)
input_data

Unnamed: 0,Title,Plot,Tag
1,Paddington,An explorer named Montgomery Clyde (Tim Downie...,comedy
2,Mr. & Mrs. Smith,"DAVID and ANN SMITH have a volatile marriage, ...",comedy
3,How to Steal a Million,"Paris, 1966...Monsieur Charles Bonnet (Hugh Gr...",comedy
4,Scary Movie,Teenager Drew Decker is home alone when she re...,comedy
5,Monte Carlo,The film begins with 18-year old Grace Ann Ben...,comedy
6,Father of the Bride,George Banks (Steve Martin) is the owner of an...,comedy
7,Parenthood,Gil Buckman (Steve Martin) is a 35 year-old fa...,comedy
8,Treasure Island,Young Jim Hawkins Jackie Cooper) and his mothe...,action
9,Mutiny on the Bounty,"One night in Portsmouth, England in 1787, a pr...",action
10,Long John Silver,The movie is set some time after the events of...,action


### 2. Load stopwords to be ignored 

In [3]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')
print (stopwords[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


### 3. Tokenize and stem

#### <span style="color:red">Tokenize:</span><br>
> split the plot text into a list of its respective words (or tokens)
#### <span style="color:red">Stem:</span><br>
> map different forms of the same word to a common "stem" (example: connection, connective, connected -> **connect**)

In [4]:
#Define a tokenizer and stemmer which returns the set of stems in the text that it is passed

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [10]:
#Test
from sklearn.feature_extraction.text import TfidfVectorizer
movie_title = []
movie_words = []

for index, row in input_data.iterrows():
    
    #Grab movie title and plot
    movie_title.append(row['Title']) 
    movie_words.append(row['Plot'])
    
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(movie_words)
print(X.shape)

(20, 4322)


In [14]:
#List of features used in the matrix
print(vectorizer.get_feature_names()[100:110])

['aghast', 'ago', 'agree', 'agreed', 'agrees', 'ahmed', 'air', 'airport', 'akimbo', 'al']


In [5]:
#Apply functions to the movie plots

movie_title = []
movie_words = []

for index, row in input.iterrows():
    
    #Grab movie title and plot
    movie_title.append(row['Title']) 
    plot = row['Plot'].split() #From string to list of words: ['I', 'don't', 'know', 'what', 'I', 'am', 'doing']
    
    #Tokenize and stem each plot text
    stemmed = []
    tokenized = []
    
    for word in plot:
        
        #Tokenize and stem
        stemmed_words = tokenize_and_stem(word)
        stemmed.extend(stemmed_words)
        
        #Tokenize only
        tokenized_words = tokenize_only(word)
        tokenized.extend(tokenized_words)
        
        #Store single movie data in a dataframe
        vocabulary = pd.DataFrame({'words': tokenized}, index = stemmed)
        vocabulary.index = vocabulary.index.rename('stems')
        
    #Use movie words to create a flat catalogue of all tokenized words
    movie_words.extend(tokenized) #List of lists


In [7]:
#movie_words is a list of lists with the tokenized words for each movie plot
movie_words[:100]

['an',
 'explorer',
 'named',
 'montgomery',
 'clyde',
 'tim',
 'downie',
 'documents',
 'his',
 'trip',
 'to',
 'darkest',
 'peru',
 'he',
 'comes',
 'across',
 'a',
 'rather',
 'intelligent',
 'species',
 'of',
 'bears',
 'although',
 'he',
 'tries',
 'to',
 'take',
 'one',
 'in',
 'he',
 'decides',
 'to',
 'bond',
 'with',
 'them',
 'and',
 'learns',
 'that',
 'they',
 'can',
 'talk',
 'and',
 'have',
 'a',
 'great',
 'appetite',
 'for',
 'marmalade',
 'clyde',
 'says',
 'he',
 "'s",
 'learned',
 'a',
 'lot',
 'from',
 'the',
 'bears',
 'and',
 'he',
 'wonders',
 'if',
 'the',
 'bears',
 'have',
 'learned',
 'anything',
 'from',
 'him.years',
 'later',
 'a',
 'young',
 'bear',
 'voice',
 'of',
 'ben',
 'whishaw',
 'awakens',
 'to',
 'find',
 'that',
 'the',
 'marmalade',
 'fruits',
 'have',
 'been',
 'produced',
 'he',
 'excitedly',
 'runs',
 'to',
 'his',
 'aunt',
 'lucy',
 'voice',
 'of',
 'imelda',
 'staunton',
 'and',
 'uncle']

In [8]:
#What do tokenize and stem do? Take a look at the vocabulary for a given movie

vocabulary

Unnamed: 0_level_0,words
stems,Unnamed: 1_level_1
u.,u.s
armi,army
corpor,corporal
victor,victor
vic,vic
...,...
origin,original
game,game
in,in
more,more


### 4. Document similarity: the <span style="color:red">term frequency</span>-<span style="color:blue">inverse document frequency</span> (<span style="color:red">tf</span>-<span style="color:blue">idf</span>)

Procedure:

> 1. Count word occurrences by document (movie plot) <br>
> 2. Transform this into a document-term matrix (dtm) <br>
> 3. Apply the term frequency-inverse weighing: higher weight to words that are frequent within a plot but not frequent within all plots (the words has more meaning for a particular movie than it has for all movies)

<img src="dtm.png" align='left'>

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

input_data = movie_words

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, max_features=200000,
                                 min_df=2, stop_words='english', strip_accents = 'ascii', norm='l2',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(input_data) #fit the vectorizer to movie_words

print("Matrix shape:")
print(tfidf_matrix.shape)

  'stop_words.' % sorted(inconsistent))


Matrix shape:
(20552, 1526)


In [12]:
v
terms = tfidf_vectorizer.get_feature_names()
terms

["'d",
 "'m",
 "'s",
 '8-ball',
 'abandon',
 'abl',
 'aboard',
 'abov',
 'abus',
 'accept',
 'accid',
 'accident',
 'accompani',
 'accus',
 'acknowledg',
 'acquaint',
 'acquir',
 'act',
 'actor',
 'actual',
 'add',
 'addit',
 'adjoin',
 'admir',
 'admit',
 'advanc',
 'adventur',
 'advic',
 'advis',
 'affair',
 'afford',
 'afternoon',
 'agent',
 'ago',
 'agre',
 'ahm',
 'air',
 'airport',
 'alarm',
 'alexand',
 'alicia',
 'alik',
 'aliv',
 'alli',
 'allow',
 'alon',
 'alreadi',
 'alway',
 'ambiti',
 'ambush',
 'american',
 'amus',
 'angel',
 'anger',
 'angri',
 'angrili',
 'ani',
 'ann',
 'anna',
 'anni',
 'announc',
 'anoth',
 'answer',
 'anthoni',
 'antiqu',
 'anxieti',
 'anymor',
 'anyon',
 'anyth',
 'apart',
 'apolog',
 'appar',
 'appear',
 'appoint',
 'approach',
 'archiv',
 'argu',
 'argument',
 'arm',
 'armando',
 'armi',
 'arrang',
 'arrest',
 'arriv',
 'arrog',
 'art',
 'artist',
 'artwork',
 'aschenhausen',
 'ask',
 'assassin',
 'assault',
 'assembl',
 'assign',
 'assist',
 'a

Compute distance using Cosine Similarity
> Cosine Similarity is measured using the tf-idf matrix to generate a measure of similarity between each movie plot and the other move plots in the input data

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [16]:
dist.shape

(20552, 20552)

### 5. K-means clustering