In [1]:
import sys
import re, numpy as np, pandas as pd
from IPython.display import display

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
import string

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
#Define stopwords
punctuation = "".join([symbol for symbol in string.punctuation if symbol not in ["'", '"']])
punctuation += '–'
punctuation += '...'

stopwords_list = stopwords.words('english')
stopwords_list += list(punctuation)

In [3]:
#Checking my list of stopwords
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Getting DF

In [4]:
#Importing dataframes
df = pd.read_pickle('script_TM')

In [5]:
df.shape

(1086, 2)

In [10]:
df.head(25)

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN (V.O.),"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN (V.O.),I'm inside. Anything to report? We listen ...
4,TRINITY.,
5,CYPHER (V.O.),Let's see. Target left work at
6,5:01 PM.,
7,SCREEN,Trace program: running. The entire screen ...
8,CYPHER (V.O.),He caught the northbound Howard line. Go...
9,TRINITY (V.O.),"All right, you're relieved. Use the usua..."


# Cleaning DF

In [7]:
df.Speaker.value_counts()

     NEO               175
     MORPHEUS          134
     TRINITY           120
     AGENT SMITH        73
     TANK               60
                      ... 
 INT. MAIN DECK          1
 INT.  CAR               1
   #312-555-0690         1
 ON COMPUTER SCREEN      1
 EXT.  EL TRAIN          1
Name: Speaker, Length: 156, dtype: int64

In [8]:
df.Speaker.nunique()

156

In [9]:
df.Speaker.unique()

array([' COMPUTER SCREEN', '     MAN (V.O.)', '     SCREEN',
       '     WOMAN (V.O.)', ' TRINITY.', '     CYPHER (V.O.)',
       '   5:01 PM.', '     TRINITY (V.O.)', '     TRINITY',
       '   #312-555-0690', '     CYPHER (V.O.) ', '     RADIO (V.O.)',
       ' INT.  CHASE HOTEL - NIGHT', '     BIG COP',
       ' EXT.  CHASE HOTEL - NIGHT', '     AGENT SMITH',
       '     LIEUTENANT', '     AGENT SMITH ', ' INT.  CHASE HOTEL',
       ' FIRES --', ' EXT.  CHASE HOTEL', '     MORPHEUS (V.O.)',
       ' INT.  HALL', ' EXT.  FIRE E5CAPE', ' EXT.  ROOF', '     COP',
       ' EXT.  STREET', '     AGENT JONES', '   FOS4:  ALL HAIL SEGA!!!',
       " INT.  NEO'S APARTMENT", '     NEO', '     VOICE (O.S.)',
       '     ANTHONY', '     DUJOUR', ' INT.  APARTMENT',
       '       CUT TO:', ' 9:15 A.M.', ' EXT.  SKYSCRAPER',
       ' INT.  CORTECHS OFFICE', '     RHINEHEART',
       " INT.  NEO'S CUBICLE", '     TALL EMPLOYEE', '     FEDEX',
       ' INT.  INTERROGATION ROOM - CLOSE ON CAMERA

* I need to remove all the V.O to keep only the names 

In [15]:
#Use split to take the V.O at the end of the in the Speaker name
df.Speaker = df.Speaker.apply(lambda x: x.split()[0])

In [25]:
print(df.shape)
df.head(25)

(1086, 2)


Unnamed: 0,Speaker,Text
0,COMPUTER,So close it has no boundaries. A blinking cur...
1,MAN,"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN,I'm inside. Anything to report? We listen ...
4,TRINITY.,
5,CYPHER,Let's see. Target left work at
6,5:01,
7,SCREEN,Trace program: running. The entire screen ...
8,CYPHER,He caught the northbound Howard line. Go...
9,TRINITY,"All right, you're relieved. Use the usua..."


In [27]:
list(df.Speaker.unique())

['COMPUTER',
 'MAN',
 'SCREEN',
 'WOMAN',
 'TRINITY.',
 'CYPHER',
 '5:01',
 'TRINITY',
 '#312-555-0690',
 'RADIO',
 'INT.',
 'BIG',
 'EXT.',
 'AGENT',
 'LIEUTENANT',
 'FIRES',
 'MORPHEUS',
 'COP',
 'FOS4:',
 'NEO',
 'VOICE',
 'ANTHONY',
 'DUJOUR',
 'CUT',
 '9:15',
 'RHINEHEART',
 'TALL',
 'FEDEX',
 'A."',
 'APOC',
 'GIZMO',
 'FADE',
 "NEO'S",
 'ANGLE',
 'DOZER',
 'TANK',
 '--',
 'MOUSE',
 'CABLE',
 'SWITCH',
 'CYPHER,',
 'R.S.I.',
 'E.M.P?',
 'MOJO',
 'REX',
 'PRIESTESS',
 'SPOON',
 'ORACLE',
 'COPS',
 'BA-BOOM!',
 'PILOT',
 'SERGEANT',
 'INTO',
 'FIRE.',
 'HELICOPTER',
 'OLD',
 'CLICK.',
 '305...',
 'BOOM.',
 'ON',
 'BOY',
 'MOMMY',
 'THE']

# Tokenize

In [10]:
def sent_to_words(sentences):
    for sent in sentences:
#         sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\t', ' ', sent)  # remove newline chars
        sent = re.sub('\n', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent) 

In [11]:
# Convert to list
data = df.Text.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['so', 'close', 'it', 'has', 'no', 'boundaries', 'blinking', 'cursor', 'pulses', 'in', 'the', 'electric', 'darkness', 'like', 'heart', 'coursing', 'with', 'phosphorous', 'light', 'burning', 'beneath', 'the', 'derma', 'of', 'black', 'neon', 'glass', 'phone', 'begins', 'to', 'ring', 'we', 'hear', 'it', 'as', 'though', 'we', 'were', 'making', 'the', 'call', 'the', 'cursor', 'continues', 'to', 'throb', 'relentlessly', 'patient', 'until']]


# Build Bigram, Trigram Models and Lemmatize

In [12]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en  # run in terminal once

In [13]:
def process_words(texts, stop_words=stopwords_list, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove Stopwords, Form Bigrams, Trigrams and Lemmatization"""
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    nlp = spacy.load('en', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

In [14]:
data_ready = process_words(data_words)  # processed the entire movie

In [15]:
data_ready[:1]

[['close',
  'boundary',
  'blink',
  'cursor',
  'pulse',
  'electric',
  'darkness',
  'heart',
  'course',
  'phosphorous',
  'light',
  'burn',
  'black',
  'neon',
  'glass',
  'phone',
  'begin',
  'ring',
  'hear',
  'make',
  'call',
  'continue',
  'relentlessly',
  'patient']]

# Build the Topic Modeling

In [55]:
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=6, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

In [56]:
#check the topic
for topic in lda_model.print_topics():
    print(topic)
    print("--------")
# print(lda_model.print_topics())

(0, '0.059*"phone" + 0.048*"open" + 0.026*"ring" + 0.026*"door" + 0.024*"car" + 0.021*"find" + 0.020*"agent_jone" + 0.016*"drop" + 0.016*"helicopter" + 0.016*"take"')
--------
(1, '0.063*"go" + 0.056*"tank" + 0.055*"know" + 0.042*"man" + 0.027*"start" + 0.027*"smile" + 0.024*"dead" + 0.023*"believe" + 0.020*"boy" + 0.019*"life"')
--------
(2, '0.042*"see" + 0.030*"get" + 0.030*"hold" + 0.027*"stand" + 0.026*"begin" + 0.025*"hole" + 0.025*"look" + 0.023*"eye" + 0.022*"let" + 0.019*"try"')
--------
(3, '0.039*"agent" + 0.033*"fly" + 0.029*"stare" + 0.024*"back" + 0.022*"turn" + 0.019*"elevator" + 0.019*"machine" + 0.019*"metal" + 0.018*"still" + 0.016*"blow"')
--------
(4, '0.037*"hear" + 0.025*"black" + 0.023*"close" + 0.020*"make" + 0.019*"pull" + 0.018*"come" + 0.017*"ear" + 0.017*"beat" + 0.016*"old" + 0.015*"attack"')
--------
(5, '0.061*"trinity" + 0.026*"fall" + 0.026*"body" + 0.026*"gun" + 0.024*"scream" + 0.019*"fire" + 0.019*"bullet" + 0.017*"air" + 0.016*"right" + 0.015*"shoot

# Dominant topic and its percentage contribution in each text/action

In [32]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [33]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

In [34]:
# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4.0,0.641,"hear, black, close, make, pull, come, ear, beat, old, attack","[close, boundary, blink, cursor, pulse, electric, darkness, heart, course, phosphorous, light, b..."
1,1,3.0,0.8755,"agent, fly, stare, back, turn, elevator, machine, metal, still, blow","[slash, screen, information, flash, faster, read]"
2,2,4.0,0.7805,"hear, black, close, make, pull, come, ear, beat, old, attack","[call, receive, log]"
3,3,0.0,0.4028,"phone, open, ring, door, car, find, agent_jone, drop, helicopter, take","[report, listen, phone, conversation, third, line, woman]"
4,4,0.0,0.1667,"phone, open, ring, door, car, find, agent_jone, drop, helicopter, take",[]
5,5,2.0,0.8611,"see, get, hold, stand, begin, hole, look, eye, let, try","[let, see, target, leave, work]"
6,6,0.0,0.1667,"phone, open, ring, door, car, find, agent_jone, drop, helicopter, take",[]
7,7,3.0,0.671,"agent, fly, stare, back, turn, elevator, machine, metal, still, blow","[program, run, entire, screen, fill, race, column, number, shimmer, green, rush, digit, phone, n..."
8,8,2.0,0.6566,"see, get, hold, stand, begin, hole, look, eye, let, try","[catch, line, get, stop, purchase, pack, return, area, code, identify, first, number, suddenly, ..."
9,9,5.0,0.8539,"trinity, fall, body, gun, scream, fire, bullet, air, right, shoot","[right, relieved, use, usual, exit]"


In [35]:
#Let's have a look
df_dominant_topic.shape

(1086, 5)

In [36]:
df_dominant_topic.Dominant_Topic.value_counts(normalize=True)

0.0    0.270718
1.0    0.203499
2.0    0.153775
5.0    0.132597
4.0    0.120626
3.0    0.118785
Name: Dominant_Topic, dtype: float64

In [37]:
df.shape

(1086, 2)

# Most representative action for each topic

In [57]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0.0,0.8957,"phone, open, ring, door, car, find, agent_jone, drop, helicopter, take","[explode, open, heavily, armed, rnen, rush, room]"
1,1.0,0.9053,"go, tank, know, man, start, smile, dead, believe, boy, life","[explain, go, seem, strange, bring, warn, lot, danger]"
2,2.0,0.8611,"see, get, hold, stand, begin, hole, look, eye, let, try","[let, see, target, leave, work]"
3,3.0,0.9479,"agent, fly, stare, back, turn, elevator, machine, metal, still, blow","[company, top, software, company, world, single, employee, understand, part, whole, thus, employ..."
4,4.0,0.8741,"hear, black, close, make, pull, come, ear, beat, old, attack","[deal, chew, steak, loudly, smack, tooth]"
5,5.0,0.9214,"trinity, fall, body, gun, scream, fire, bullet, air, right, shoot","[child, separate, possible, impossible, young, mind, easy, free, mind, difficult]"


# Frequency Distribution of Word Counts in movie

In [67]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# cols = [color for name, color in mcolors.XKCD_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS - mcolors.TABLEAU_COLORS'

cols = ['blue','#fd8d49','green','#9e003a']
cloud = WordCloud(stopwords=stopwords_list,
                  background_color='white',
                  width=2500,
                  height=1800,
                  max_words=30,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False,num_words=30)

fig, axes = plt.subplots(2, 2, figsize=(10,10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    print(i,topics[i][1])
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')


plt.subplots_adjust(wspace=-3, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.savefig('topic_wordcloud.png',dpi=180)
plt.show()

0 {'phone': 0.05897323, 'open': 0.047538627, 'ring': 0.026129488, 'door': 0.025514964, 'car': 0.024185978, 'find': 0.02070577, 'agent_jone': 0.020233853, 'drop': 0.016317, 'helicopter': 0.016171647, 'take': 0.01612897, 'rush': 0.013947997, 'hand': 0.013075812, 'jump': 0.01297167, 'sit': 0.01260833, 'hang': 0.011493117, 'explode': 0.011279634, 'almost': 0.011145089, 'line': 0.010375493, 'shake': 0.009991352, 'monitor': 0.009990843, 'room': 0.009905918, 'slow': 0.009530389, 'cypher': 0.008977997, 'head': 0.00889582, 'key': 0.008863169, 'finger': 0.008726907, 'rope': 0.008410365, 'grab': 0.007723027, 'roof': 0.007323642, 'truck': 0.0069503183}
1 {'go': 0.062824585, 'tank': 0.055669684, 'know': 0.055035334, 'man': 0.041758213, 'start': 0.027326284, 'smile': 0.02680533, 'dead': 0.024341457, 'believe': 0.023339152, 'boy': 0.019751703, 'life': 0.018976077, 'think': 0.014551956, 'little': 0.014251259, 'white': 0.012450625, 'want': 0.011761688, 'seem': 0.011715347, 'kiss': 0.011613785, 'thing':

IndexError: list index out of range