In [1]:
import sys
import re, numpy as np, pandas as pd

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
import string

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
#Define stopwords
# Single-character punctuation tokens. The single and double quote characters
# are filtered out of string.punctuation, and the en dash is added by hand.
punctuation = "".join([symbol for symbol in string.punctuation if symbol not in ["'", '"']])
punctuation += '–'

# Multi-character tokens must be added whole: list('...') would split the
# ellipsis into three separate '.' entries, which are already in `punctuation`.
multi_char_punctuation = ['...']

stopwords_list = stopwords.words('english')
stopwords_list += list(punctuation)
stopwords_list += multi_char_punctuation

In [3]:
#Checking my list of stopwords
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Getting DF

In [4]:
#Importing dataframes
# Load the pickled script dataframe from the file 'script_TM' in the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('script_TM')

In [5]:
df.shape

(1086, 2)

In [6]:
df.head(25)

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN (V.O.),"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN (V.O.),I'm inside. Anything to report? We listen ...
4,TRINITY.,
5,CYPHER (V.O.),Let's see. Target left work at
6,5:01 PM.,
7,SCREEN,Trace program: running. The entire screen ...
8,CYPHER (V.O.),He caught the northbound Howard line. Go...
9,TRINITY (V.O.),"All right, you're relieved. Use the usua..."


# Cleaning DF

In [7]:
df.Speaker.value_counts()

     NEO                     175
     MORPHEUS                134
     TRINITY                 120
     AGENT SMITH              73
     TANK                     60
                            ... 
 BA-BOOM!                      1
 INT.  TRAIN                   1
 INT.  BAR (MATRIX) - DAY      1
 EXT.  EL TRAIN                1
 INT.  OVERFLOW PIT            1
Name: Speaker, Length: 156, dtype: int64

In [8]:
df.Speaker.nunique()

156

In [9]:
df.Speaker.unique()

array([' COMPUTER SCREEN', '     MAN (V.O.)', '     SCREEN',
       '     WOMAN (V.O.)', ' TRINITY.', '     CYPHER (V.O.)',
       '   5:01 PM.', '     TRINITY (V.O.)', '     TRINITY',
       '   #312-555-0690', '     CYPHER (V.O.) ', '     RADIO (V.O.)',
       ' INT.  CHASE HOTEL - NIGHT', '     BIG COP',
       ' EXT.  CHASE HOTEL - NIGHT', '     AGENT SMITH',
       '     LIEUTENANT', '     AGENT SMITH ', ' INT.  CHASE HOTEL',
       ' FIRES --', ' EXT.  CHASE HOTEL', '     MORPHEUS (V.O.)',
       ' INT.  HALL', ' EXT.  FIRE E5CAPE', ' EXT.  ROOF', '     COP',
       ' EXT.  STREET', '     AGENT JONES', '   FOS4:  ALL HAIL SEGA!!!',
       " INT.  NEO'S APARTMENT", '     NEO', '     VOICE (O.S.)',
       '     ANTHONY', '     DUJOUR', ' INT.  APARTMENT',
       '       CUT TO:', ' 9:15 A.M.', ' EXT.  SKYSCRAPER',
       ' INT.  CORTECHS OFFICE', '     RHINEHEART',
       " INT.  NEO'S CUBICLE", '     TALL EMPLOYEE', '     FEDEX',
       ' INT.  INTERROGATION ROOM - CLOSE ON CAMERA

* I need to remove the "(V.O.)" suffix from the speaker labels so that only the names remain.

In [10]:
#create a function that takes a text and removes the "(V.O.)" marker at the end

def remove_vo(text):
    """Strip a trailing voice-over marker from a speaker label.

    Parameters
    ----------
    text : str
        Raw speaker label, e.g. ``'     MAN (V.O.)'``.

    Returns
    -------
    str
        When the last whitespace-separated word is ``'(V.O.)'`` or
        ``'(V.O.).'`` and at least one word precedes it, the preceding words
        joined by single spaces. Otherwise ``text`` is returned unchanged
        (including any surrounding whitespace).
    """
    words = text.split()
    # Look at the last word only, so multi-word names such as
    # 'AGENT SMITH (V.O.)' are handled as well as 'NAME (V.O.)'.
    if len(words) >= 2 and words[-1] in ('(V.O.)', '(V.O.).'):
        return " ".join(words[:-1])
    return text

In [11]:
# Run remove_vo over every speaker label to drop the trailing "(V.O.)" marker
df['Speaker'] = df['Speaker'].apply(remove_vo)

In [12]:
print(df.shape)
df.tail(25)

(1086, 2)


Unnamed: 0,Speaker,Text
1061,INT. HOTEL HALL (MATRIX) - DAY,The BLOW ECHOES deep in his mind. His eyes sn...
1062,INT. HOVERCRAFT,Trinity screams as the monitors jump back to ...
1063,TRINITY,Now get up!
1064,INT. HALL (MATRIX) - DAY,"Neo struggles, holding his chest, sitting up...."
1065,AGENT SMITH,Nooooo! Agent Smith pounds on the elevator ...
1066,INT. HOVERCRAFT,A sentinel BLOWS a hole in Morpheus' right ar...
1067,INT. ROOM 303 (MATRIX) - DAY,Neo dives for the RINGING PHONE.
1068,INT. HOVERCRAFT,"Machines split open the craft, pouring in, la..."
1069,TRINITY,Now! He turns the key.
1070,INT. OVERFLOW PIT,A blinding shock of white lights flood the ch...


In [13]:
df.head(25)

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN,"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN,I'm inside. Anything to report? We listen ...
4,TRINITY.,
5,CYPHER,Let's see. Target left work at
6,5:01 PM.,
7,SCREEN,Trace program: running. The entire screen ...
8,CYPHER,He caught the northbound Howard line. Go...
9,TRINITY,"All right, you're relieved. Use the usua..."


In [14]:
# Alphabetical list of the distinct speaker labels
sorted(df['Speaker'].unique())

['          FADE OUT.',
 '         FADE TO BLACK.',
 '       CUT TO:',
 '     AGENT BROWN',
 '     AGENT BROWTJ',
 '     AGENT JONES',
 '     AGENT SMITH',
 '     AGENT SMITH ',
 '     ANTHONY',
 '     APOC',
 '     BIG COP',
 '     BOY',
 '     CABLE',
 '     COP',
 '     COPS',
 '     CYPHER',
 '     CYPHER,',
 '     DOZER',
 '     DUJOUR',
 '     FEDEX',
 '     GIZMO',
 '     LIEUTENANT',
 '     MAN (O.S.)',
 '     MOJO',
 '     MOMMY',
 '     MORPHEUS',
 '     MOUSE',
 '     NEO',
 '     OLD MAN',
 '     ORACLE',
 '     ORACLE (WOMAN)',
 '     PILOT',
 '     PRIESTESS',
 '     REX',
 '     RHINEHEART',
 '     SCREEN',
 '     SERGEANT',
 '     SPOON BOY',
 '     SWITCH',
 '     TALL EMPLOYEE',
 '     TANK',
 '     THE END',
 '     TRINITY',
 '     VOICE (O.S.)',
 '     WOMAN (O.S.)',
 '    MORPHEUS',
 '   #312-555-0690',
 '   5:01 PM.',
 '   E.M.P?',
 '   FOS4:  ALL HAIL SEGA!!!',
 '   R.S.I.',
 ' --',
 ' 305...  304...',
 ' 9:15 A.M.',
 ' A."',
 ' ANGLE ON NEO',
 ' BA-BOOM!',
 ' BO

In [15]:
# Number of distinct speaker labels (missing values, if any, count as one label)
df['Speaker'].nunique(dropna=False)

154

In [16]:
# Rows whose label is exactly NEO preceded by five spaces
int(df['Speaker'].eq('     NEO').sum())

175

I need to remove the extra whitespace from the speaker names.

In [17]:
#create a function to normalise the whitespace in a name
def remove_space(text):
    """Return ``text`` with leading/trailing whitespace removed and every
    internal run of whitespace collapsed to a single space."""
    words = text.split()
    return " ".join(words)

In [18]:
# Normalise the whitespace of every speaker label with remove_space
df['Speaker'] = df['Speaker'].apply(remove_space)

In [19]:
# Number of distinct speaker labels (missing values, if any, count as one label)
df['Speaker'].nunique(dropna=False)

146

In [20]:
df.Speaker.value_counts()

NEO                                       180
MORPHEUS                                  166
TRINITY                                   132
AGENT SMITH                                74
TANK                                       69
                                         ... 
INT. WASTE LINE                             1
ORACLE (WOMAN)                              1
INT. POWER PLANT - CLOSE ON MAN'S BODY      1
HELICOPTER BEGIN TO DIE.                    1
EXT. LOWER WACKER                           1
Name: Speaker, Length: 146, dtype: int64

In [21]:
df.loc[df['Speaker'] == 'NEO'].head()

Unnamed: 0,Speaker,Text
66,NEO,Fuckin' idiots don't know shit. He finishes...
69,NEO,What the hell...
72,NEO,"What do you want, Anthony?"
74,NEO,You got the money this time? He holds up tw...
82,NEO,I don't know. I have to work tomorrow.


I need to delete all the rows that have no text, i.e. where `len(text) == 0`.

In [22]:
#work with the cells without text

In [23]:
#Check if it is None or blank
# df.Text.iloc[4] == None #False
df.Text.iloc[4] == '' #True

True

In [24]:
# Count the rows whose Text is the empty string
int(df['Text'].eq('').sum())

20

In [25]:
df.head()

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN,"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN,I'm inside. Anything to report? We listen ...
4,TRINITY.,


In [26]:
# Keep only the rows whose Text is not the empty string, then renumber the index from 0
has_text = df['Text'].ne('')
df_cleaned = df.loc[has_text].reset_index(drop=True)

In [27]:
df_cleaned.head()

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN,"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN,I'm inside. Anything to report? We listen ...
4,CYPHER,Let's see. Target left work at


In [28]:
df_cleaned.shape

(1066, 2)

In [29]:
df_cleaned.Speaker.value_counts()

NEO                                                  180
MORPHEUS                                             166
TRINITY                                              132
AGENT SMITH                                           74
TANK                                                  69
                                                    ... 
EXT. ROOF                                              1
INT. CHASE HOTEL - NIGHT                               1
INT. INTERROGATION ROOM - CLOSE ON CAMERA MONITOR      1
INT. ROOM 303 (MATRIX) - DAY                           1
EXT. LOWER WACKER                                      1
Name: Speaker, Length: 132, dtype: int64

In [30]:
df_cleaned.head()

Unnamed: 0,Speaker,Text
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...
1,MAN,"Hello? Data now slashes across the screen, ..."
2,SCREEN,Call trans opt: received. 2-19-96 13:24...
3,WOMAN,I'm inside. Anything to report? We listen ...
4,CYPHER,Let's see. Target left work at


In [31]:
# Tag every row with the film title
df_cleaned.loc[:, 'Movie'] = 'The Matrix'

In [32]:
df_cleaned.head()

Unnamed: 0,Speaker,Text,Movie
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...,The Matrix
1,MAN,"Hello? Data now slashes across the screen, ...",The Matrix
2,SCREEN,Call trans opt: received. 2-19-96 13:24...,The Matrix
3,WOMAN,I'm inside. Anything to report? We listen ...,The Matrix
4,CYPHER,Let's see. Target left work at,The Matrix


In [34]:
#Saving as pickle
# DataFrame.to_pickle mirrors the pd.read_pickle call used to load the raw
# frame, so no mid-notebook `import pickle` or manual file handle is needed.
df_cleaned.to_pickle('script_cleaned_TM.pkl')