In [1]:
import sys
import re, numpy as np, pandas as pd

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
import string

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
#Define stopwords
punctuation = "".join([symbol for symbol in string.punctuation if symbol not in ["'", '"']])
punctuation += '–'
punctuation += '...'

stopwords_list = stopwords.words('english')
stopwords_list += list(punctuation)

In [3]:
#Checking my list of stopwords
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Getting DF

In [24]:
#Importing dataframes
df = pd.read_pickle('script_TMReloaded.pkl')

In [25]:
df.shape

(778, 2)

In [62]:
df.head(25)

Unnamed: 0,Speaker,Text
0,WOMAN,"""Six o'clock, 300 meters. We can't ..."
1,MAN,"""I know. Can't tow this crate fast enough!..."
2,NIOBE,"""Can't, the core is still good. And they'..."
3,MAN,"""Oooh, our savior. Hed better be..."
4,NIOBE,"""Shut up and make the exit. Ho..."
5,NIOBE,"""Woo-hoooh!""The r..."
6,MAN,"""They're still on us!""..."
7,INT. NEBACHANEZZER COCKPIT,"The NEB is slung between the two HOVERCRAFT, w..."
8,MORPHEUS,"""Niobe, theyre closing."""
9,NIOBE,"""How many?"""


# Cleaning DF

In [27]:
df.Speaker.value_counts()

                               MORPHEUS      93
                                 NEO         92
                                RAZOR        51
                               TRINITY       49
                               GREGORY       46
                                             ..
INT. BATHROOM HOUSE (MATRIX) - SUNSET         1
CU: TANKS                                     1
                             PILOT (V.O.)     1
EXT. DESERT HIGHWAY (NIGHT)                   1
                               SERGEANT       1
Name: Speaker, Length: 171, dtype: int64

In [28]:
df.Speaker.nunique()

171

In [29]:
df.Speaker.unique()

array(['                             WOMAN (V.O.)',
       '                              MAN (V.O.)',
       '                             NIOBE (V.O.)',
       'INT. NEBACHANEZZER COCKPIT',
       '                               MORPHEUS',
       '                                NIOBE', 'INT. SEPHORA COCKPIT',
       'INT. FREEDOM COCKPIT', '                                 MAN',
       'EXT. FREEDOM HULL', '                                CIRCA',
       'INT. NEB COCKPIT', '                         DEFENDER ONE (V.O.)',
       'EXT. THE SURFACE', '                                 TANK',
       '                               TRINITY',
       '                                 NEO',
       '                                                              FADE OUT',
       'FADE INTO:',
       '                                                               CUT TO:',
       'INT. MAIN DECK, NEB', 'EXT. ZION TARMAC',
       '                                 "M."',
       '                  

* I need to remove all the V.O to keep only the names 

In [30]:
#create a function that take a text and remove the "V.O" at the end

def remove_vo(text):
    
    #check the length of the split and evaluate the second part to VO
    if len(text.split()) == 2:
        if text.split()[1] in ['(V.O.)','(V.O.).','(V.O. )']:
            return text.split()[0]
        else: 
            return text
    else:
        return text

In [31]:
#Use split to take the V.O at the end of the in the Speaker name
df.Speaker = df.Speaker.apply(lambda x: remove_vo(x))

In [32]:
print(df.shape)
df.tail(25)

(778, 2)


Unnamed: 0,Speaker,Text
753,NEO,"""Morpheus! Holy shit! You wont believe..."
754,MORPHEUS,"""I know."""
755,NEO,"""I couldnt do it. Morpheus, there..."
756,INT. MAIN DECK - NIGHT,MORPHEUS has the headset on. He stands ...
757,MORPHEUS,"""Yes, there is a problem. The Matr..."
758,EXT. BURNT-OUT CATHEDRAL (MATRIX) - NIGHT,NEOS relief fades into conf...
759,NEO,"""I know. Im sorry."""
760,MORPHEUS (O.S.),"""And the problem is..."""
761,NEO,"""Uh, its impossible. If the Matrix is destro..."
762,NEO,"""Morpheus? Did you know th..."


I need to remove (O.S) in the name of the speaker

In [41]:
#create a function that take a text and remove the "O.S" at the end

def remove_os(text):
    
    #check the length of the split and evaluate the second part to VO
    if len(text.split()) == 2:
        if text.split()[1] in ['(O.S.)','(O.S.).', ' (O.S.)']:
            return text.split()[0]
        else: 
            return text
    else:
        return text

In [42]:
#Use split to take the O.S at the end of the in the Speaker name
df.Speaker = df.Speaker.apply(lambda x: remove_os(x))

In [43]:
#Checking the result
print(df.shape)
df.tail(25)

(778, 2)


Unnamed: 0,Speaker,Text
753,NEO,"""Morpheus! Holy shit! You wont believe..."
754,MORPHEUS,"""I know."""
755,NEO,"""I couldnt do it. Morpheus, there..."
756,INT. MAIN DECK - NIGHT,MORPHEUS has the headset on. He stands ...
757,MORPHEUS,"""Yes, there is a problem. The Matr..."
758,EXT. BURNT-OUT CATHEDRAL (MATRIX) - NIGHT,NEOS relief fades into conf...
759,NEO,"""I know. Im sorry."""
760,MORPHEUS,"""And the problem is..."""
761,NEO,"""Uh, its impossible. If the Matrix is destro..."
762,NEO,"""Morpheus? Did you know th..."


In [44]:
df.head(25)

Unnamed: 0,Speaker,Text
0,WOMAN,"""Six o'clock, 300 meters. We can't ..."
1,MAN,"""I know. Can't tow this crate fast enough!..."
2,NIOBE,"""Can't, the core is still good. And they'..."
3,MAN,"""Oooh, our savior. Hed better be..."
4,NIOBE,"""Shut up and make the exit. Ho..."
5,NIOBE,"""Woo-hoooh!""The r..."
6,MAN,"""They're still on us!""..."
7,INT. NEBACHANEZZER COCKPIT,"The NEB is slung between the two HOVERCRAFT, w..."
8,MORPHEUS,"""Niobe, theyre closing."""
9,NIOBE,"""How many?"""


In [63]:
sorted(list(df.Speaker.unique()))

['"HEY!"',
 '"M."',
 '"THIS IS THE POLICE, STOP YOUR VEHICLES AND SURRENDER!"',
 '?"',
 'AGENT BROWN',
 'AGENT JONES',
 'AGENT SMITH',
 'AGENT SMITH.',
 'ANGLE ON: THE DOOR',
 'BROWN',
 'CHANDRA',
 'CHANDRA.',
 'CHOI',
 'CHOPPER LOUDSPEAKER',
 'CIRCA',
 'COP',
 'CU: AGENT BROWN',
 'CU: AGENT SMITH',
 'CU: AGM-65 HELLFIRE GROUND-TO-AIR-MISSILE',
 'CU: CHOI',
 'CU: GREGOR',
 'CU: GREGORY',
 'CU: GREGORY AND CHANDRA',
 'CU: GUTTED CRUISER',
 'CU: NEO',
 'CU: PHONE',
 'CU: PILOT',
 'CU: SMITH',
 'CU: SOLDIER',
 'CU: SOLDIERS',
 'CU: TANK GUNNER',
 'CU: TANKS',
 'CU: TRINITY',
 'CUT TO:',
 'CUT TO: NEO',
 'DEFENDER ONE (V.O.)',
 'DOWNLOADED',
 'DUJOUR',
 'EXT. ABANDONED WAREHOUSE',
 'EXT. ALLEY (MATRIX) - DAY',
 'EXT. ALLEY (MATRIX) DAY',
 'EXT. ALLEY - DAY',
 'EXT. BEACH',
 'EXT. BOOTH (MATRIX) - SUNSET',
 'EXT. BUILDING ROOFTOP',
 'EXT. BURNT-OUT CATHEDRAL (MATRIX) - NIGHT',
 'EXT. CAMPFIRE',
 'EXT. CITY STREET',
 'EXT. CRUISER',
 'EXT. DESERT',
 'EXT. DESERT (DAY)',
 'EXT. DESERT HIGHWAY

In [46]:
len(df.Speaker.unique())

169

In [64]:
len(df.loc[df['Speaker'] == '     NEO'])

0

I need to delete extra space in speaker name

In [48]:
#create a function to remove space before the name
def remove_space(text):
    return " ".join(text.split())

In [49]:
#Use the function to remove extra space
df.Speaker = df.Speaker.apply(lambda x: remove_space(x))

In [50]:
len(df.Speaker.unique())

160

In [51]:
df.Speaker.value_counts()

MORPHEUS                       104
NEO                             92
RAZOR                           53
TRINITY                         49
GREGORY                         46
                              ... 
EXT. ROAD                        1
EXT. ZION STREET                 1
GREG                             1
EXT. DESERT HIGHWAY (NIGHT)      1
TARGET: MATRIX MAINFRAME         1
Name: Speaker, Length: 160, dtype: int64

In [52]:
df.loc[df['Speaker'] == 'NEO'].head()

Unnamed: 0,Speaker,Text
31,NEO,"""What's that?"""
38,NEO,"""Who is this guy?"""
41,NEO,"""What happened to him?""..."
42,NEO,"""Was he like me?"" ..."
44,NEO,"""No. I've never been a big sl..."


In [53]:
len(df.loc[df['Speaker'] == 'NEO'])

92

I need to delete all the cells without text --> len(text) == 0

In [None]:
#work with the cells without text

In [54]:
#Check if it is None or blank
# df.Text.iloc[4] == None #False
df.Text.iloc[4] == '' #True

False

In [65]:
#How many of these empty cells do I have?
len(df[df.Text == ''])

37

In [66]:
df.head()

Unnamed: 0,Speaker,Text
0,WOMAN,"""Six o'clock, 300 meters. We can't ..."
1,MAN,"""I know. Can't tow this crate fast enough!..."
2,NIOBE,"""Can't, the core is still good. And they'..."
3,MAN,"""Oooh, our savior. Hed better be..."
4,NIOBE,"""Shut up and make the exit. Ho..."


In [67]:
df_cleaned = df[df.Text != ''].reset_index(drop=True)

In [68]:
df_cleaned.head()

Unnamed: 0,Speaker,Text
0,WOMAN,"""Six o'clock, 300 meters. We can't ..."
1,MAN,"""I know. Can't tow this crate fast enough!..."
2,NIOBE,"""Can't, the core is still good. And they'..."
3,MAN,"""Oooh, our savior. Hed better be..."
4,NIOBE,"""Shut up and make the exit. Ho..."


In [69]:
df_cleaned.shape

(741, 2)

In [70]:
df_cleaned.Speaker.value_counts()

MORPHEUS                    104
NEO                          92
RAZOR                        53
TRINITY                      49
GREGORY                      46
                           ... 
HOMELESS                      1
INT. COP CAR                  1
EXT. ROAD                     1
EXT. ZION STREET              1
TARGET: MATRIX MAINFRAME      1
Name: Speaker, Length: 146, dtype: int64

In [61]:
df_cleaned.Speaker.unique()

array(['WOMAN', 'MAN', 'NIOBE', 'INT. NEBACHANEZZER COCKPIT', 'MORPHEUS',
       'INT. SEPHORA COCKPIT', 'INT. FREEDOM COCKPIT',
       'EXT. FREEDOM HULL', 'CIRCA', 'INT. NEB COCKPIT',
       'DEFENDER ONE (V.O.)', 'EXT. THE SURFACE', 'TANK', 'TRINITY',
       'NEO', 'FADE OUT', 'FADE INTO:', 'INT. MAIN DECK, NEB',
       'EXT. ZION TARMAC', 'CHOI', 'DUJOUR', 'EXT. ZION STREET',
       'WAITRESS', 'EXT. MAIN STREET, ZION', 'INT. DEAD DUCK',
       'EXT. TUNNEL', 'INT. MAIN DECK', 'RAZOR',
       'EXT. MOUNTAIN TOP, LATE DAY (CONSTRUCT)', 'INT. MAIN BRIDGE',
       'EXT. GLADE (CONSTRUCT)', 'INT. OFFICE (MATRIX)', 'AGENT JONES',
       'AGENT BROWN', 'JONES', 'BROWN', 'MAN\x92S VOICE (O.S.)',
       'ANGLE ON: THE DOOR', 'SMITH', 'EXT. DESERT HIGHWAY (NIGHT)',
       'CHANDRA', 'GREG', 'INT. CRUISER', 'COP', 'EXT. HIGHWAY',
       'GREGORY', 'INT. COP CAR', 'EXT. ROAD', 'EXT. CRUISER',
       'EXT. BEACH', 'EXT. DUNES', 'EXT. CAMPFIRE',
       '"THIS IS THE POLICE, STOP YOUR VEHICLES A

In [None]:
# I need to remove the CU: at the beginning of some names

In [100]:
#create a function that take a text and remove the "C.U" at the beginning

def remove_cu(text):
    
    #check the length of the split and evaluate the second part to VO
    if len(text.split())>=2:
        if text.split()[0] in ['CU:']:
            return " ".join(text.split()[1:])
        else: 
            return text
    else:
        return text

In [102]:
#testing the function
remove_cu('CU: GREGORY AND CHANDRA')

'GREGORY AND CHANDRA'

In [103]:
#Use the function to take the CU and other things in the Speaker name
df_cleaned.Speaker = df_cleaned.Speaker.apply(lambda x: remove_cu(x))

In [104]:
len(df_cleaned.Speaker.unique())

139

In [105]:
df_cleaned.Speaker.unique()

array(['WOMAN', 'MAN', 'NIOBE', 'INT. NEBACHANEZZER COCKPIT', 'MORPHEUS',
       'INT. SEPHORA COCKPIT', 'INT. FREEDOM COCKPIT',
       'EXT. FREEDOM HULL', 'CIRCA', 'INT. NEB COCKPIT',
       'DEFENDER ONE (V.O.)', 'EXT. THE SURFACE', 'TANK', 'TRINITY',
       'NEO', 'FADE OUT', 'FADE INTO:', 'INT. MAIN DECK, NEB',
       'EXT. ZION TARMAC', 'CHOI', 'DUJOUR', 'EXT. ZION STREET',
       'WAITRESS', 'EXT. MAIN STREET, ZION', 'INT. DEAD DUCK',
       'EXT. TUNNEL', 'INT. MAIN DECK', 'RAZOR',
       'EXT. MOUNTAIN TOP, LATE DAY (CONSTRUCT)', 'INT. MAIN BRIDGE',
       'EXT. GLADE (CONSTRUCT)', 'INT. OFFICE (MATRIX)', 'AGENT JONES',
       'AGENT BROWN', 'JONES', 'BROWN', 'MAN\x92S VOICE (O.S.)',
       'ANGLE ON: THE DOOR', 'SMITH', 'EXT. DESERT HIGHWAY (NIGHT)',
       'CHANDRA', 'GREG', 'INT. CRUISER', 'COP', 'EXT. HIGHWAY',
       'GREGORY', 'INT. COP CAR', 'EXT. ROAD', 'EXT. CRUISER',
       'EXT. BEACH', 'EXT. DUNES', 'EXT. CAMPFIRE',
       '"THIS IS THE POLICE, STOP YOUR VEHICLES A

In [107]:
df_cleaned.head(25)

Unnamed: 0,Speaker,Text
0,WOMAN,"""Six o'clock, 300 meters. We can't ..."
1,MAN,"""I know. Can't tow this crate fast enough!..."
2,NIOBE,"""Can't, the core is still good. And they'..."
3,MAN,"""Oooh, our savior. Hed better be..."
4,NIOBE,"""Shut up and make the exit. Ho..."
5,NIOBE,"""Woo-hoooh!""The r..."
6,MAN,"""They're still on us!""..."
7,INT. NEBACHANEZZER COCKPIT,"The NEB is slung between the two HOVERCRAFT, w..."
8,MORPHEUS,"""Niobe, theyre closing."""
9,NIOBE,"""How many?"""


In [108]:
df_cleaned.tail(25)

Unnamed: 0,Speaker,Text
716,NEO,"""Morpheus! Holy shit! You wont believe..."
717,MORPHEUS,"""I know."""
718,NEO,"""I couldnt do it. Morpheus, there..."
719,INT. MAIN DECK - NIGHT,MORPHEUS has the headset on. He stands ...
720,MORPHEUS,"""Yes, there is a problem. The Matr..."
721,EXT. BURNT-OUT CATHEDRAL (MATRIX) - NIGHT,NEOS relief fades into conf...
722,NEO,"""I know. Im sorry."""
723,MORPHEUS,"""And the problem is..."""
724,NEO,"""Uh, its impossible. If the Matrix is destro..."
725,NEO,"""Morpheus? Did you know th..."


In [123]:
df_cleaned['Movie'] = 'The Matrix Reloaded'

In [119]:
df_cleaned  = df_cleaned.drop(columns = 'Movie#')

In [124]:
df_cleaned.head()

Unnamed: 0,Speaker,Text,Movie
0,WOMAN,"""Six o'clock, 300 meters. We can't ...",The Matrix Reloaded
1,MAN,"""I know. Can't tow this crate fast enough!...",The Matrix Reloaded
2,NIOBE,"""Can't, the core is still good. And they'...",The Matrix Reloaded
3,MAN,"""Oooh, our savior. Hed better be...",The Matrix Reloaded
4,NIOBE,"""Shut up and make the exit. Ho...",The Matrix Reloaded


In [125]:
#Saving as pickle
import pickle 
with open('script_cleaned_TMR.pkl', 'wb') as f:
            pickle.dump(df_cleaned, f)