In [1]:
import sys
import re, numpy as np, pandas as pd

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
import string

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [2]:
# Define stopwords
# Single-character punctuation symbols treated as stop tokens: every ASCII
# punctuation mark except the single and double quote, plus the en dash.
punctuation = "".join(symbol for symbol in string.punctuation if symbol not in ("'", '"'))
punctuation += '–'

# Multi-character tokens have to be added whole: appending '...' to the string
# above and calling list() on it would only yield three more '.' entries.
multi_char_tokens = ['...']

stopwords_list = stopwords.words('english')
stopwords_list += list(punctuation) + multi_char_tokens

In [3]:
#Checking my list of stopwords
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

# Getting DF

In [4]:
# Load the two per-movie script DataFrames from local pickle files.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects, so it should
# only be pointed at files produced by this project.
df1 = pd.read_pickle('script_cleaned_TM.pkl')
df2 = pd.read_pickle('script_cleaned_TMR.pkl')

In [8]:
#Checking the data

In [5]:
df1.shape

(1066, 3)

In [6]:
df2.shape

(741, 3)

In [7]:
df1.head(25)

Unnamed: 0,Speaker,Text,Movie
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...,The Matrix
1,MAN,"Hello? Data now slashes across the screen, ...",The Matrix
2,SCREEN,Call trans opt: received. 2-19-96 13:24...,The Matrix
3,WOMAN,I'm inside. Anything to report? We listen ...,The Matrix
4,CYPHER,Let's see. Target left work at,The Matrix
5,SCREEN,Trace program: running. The entire screen ...,The Matrix
6,CYPHER,He caught the northbound Howard line. Go...,The Matrix
7,TRINITY,"All right, you're relieved. Use the usua...",The Matrix
8,CYPHER,Do you know when we're going to make cont...,The Matrix
9,TRINITY,Soon. Only two thin digits left.,The Matrix


In [9]:
df2.head(25)

Unnamed: 0,Speaker,Text,Movie
0,WOMAN,"""Six o'clock, 300 meters. We can't ...",The Matrix Reloaded
1,MAN,"""I know. Can't tow this crate fast enough!...",The Matrix Reloaded
2,NIOBE,"""Can't, the core is still good. And they'...",The Matrix Reloaded
3,MAN,"""Oooh, our savior. Hed better be...",The Matrix Reloaded
4,NIOBE,"""Shut up and make the exit. Ho...",The Matrix Reloaded
5,NIOBE,"""Woo-hoooh!""The r...",The Matrix Reloaded
6,MAN,"""They're still on us!""...",The Matrix Reloaded
7,INT. NEBACHANEZZER COCKPIT,"The NEB is slung between the two HOVERCRAFT, w...",The Matrix Reloaded
8,MORPHEUS,"""Niobe, theyre closing.""",The Matrix Reloaded
9,NIOBE,"""How many?""",The Matrix Reloaded


# Merging dataframes into 1

In [12]:
# Stack both scripts into a single frame; ignore_index=True renumbers the rows from 0.
df = pd.concat([df1, df2], ignore_index=True)

In [14]:
#Checking final dataframes

In [15]:
df.shape

(1807, 3)

In [16]:
df.head(25)

Unnamed: 0,Speaker,Text,Movie
0,COMPUTER SCREEN,So close it has no boundaries. A blinking cur...,The Matrix
1,MAN,"Hello? Data now slashes across the screen, ...",The Matrix
2,SCREEN,Call trans opt: received. 2-19-96 13:24...,The Matrix
3,WOMAN,I'm inside. Anything to report? We listen ...,The Matrix
4,CYPHER,Let's see. Target left work at,The Matrix
5,SCREEN,Trace program: running. The entire screen ...,The Matrix
6,CYPHER,He caught the northbound Howard line. Go...,The Matrix
7,TRINITY,"All right, you're relieved. Use the usua...",The Matrix
8,CYPHER,Do you know when we're going to make cont...,The Matrix
9,TRINITY,Soon. Only two thin digits left.,The Matrix


In [17]:
df.Movie.value_counts()

The Matrix             1066
The Matrix Reloaded     741
Name: Movie, dtype: int64

# Cleaning DF

In [None]:
df.Speaker.value_counts()

In [None]:
df.Speaker.nunique()

In [None]:
df.Speaker.unique()

* I need to remove every "(V.O.)" suffix so that only the speaker names remain.

In [None]:
#create a function that take a text and remove the "V.O" at the end

def remove_vo(text):
    """Strip a trailing voice-over marker from a speaker name.

    Parameters
    ----------
    text : str
        Speaker name, possibly ending in '(V.O.)' or '(V.O.).'.

    Returns
    -------
    str
        The name without the marker, its words joined by single spaces.
        When no marker is present the input is returned unchanged
        (including any surrounding whitespace).
    """
    vo_markers = ('(V.O.)', '(V.O.).')
    tokens = text.split()

    # At least one name token must precede the marker; names made of several
    # words (e.g. "AGENT SMITH (V.O.)") are handled as well as single-word ones.
    if len(tokens) >= 2 and tokens[-1] in vo_markers:
        return " ".join(tokens[:-1])
    return text

In [None]:
# Strip the trailing "(V.O.)" marker from every Speaker name
df.Speaker = df.Speaker.apply(remove_vo)

In [None]:
print(df.shape)
df.tail(25)

In [None]:
df.head(25)

In [None]:
sorted(list(df.Speaker.unique()))

In [None]:
len(df.Speaker.unique())

In [None]:
len(df.loc[df['Speaker'] == '     NEO'])

I need to strip the extra whitespace from the speaker names.

In [None]:
#create a function to remove space before the name
def remove_space(text):
    """Return ``text`` with each run of whitespace reduced to one space and both ends trimmed."""
    words = text.split()
    return " ".join(words)

In [None]:
# Normalise the whitespace of every speaker name
df.Speaker = df.Speaker.apply(remove_space)

In [None]:
len(df.Speaker.unique())

In [None]:
df.Speaker.value_counts()

In [None]:
df.loc[df['Speaker'] == 'NEO'].head()

I need to drop all the rows that have no text, i.e. where `len(text) == 0`.

In [None]:
#work with the cells without text

In [None]:
#Check if it is None or blank
# df.Text.iloc[4] == None #False
df.Text.iloc[4] == '' #True

In [None]:
#How many of these empty cells do I have?
len(df[df.Text == ''])

In [None]:
df.head()

In [None]:
# Keep only the rows whose Text is not the empty string, then renumber the index from 0
df_cleaned = df.loc[df['Text'] != ''].reset_index(drop=True)

In [None]:
df_cleaned.head()

In [None]:
df_cleaned.shape

In [None]:
df_cleaned.Speaker.value_counts()

In [None]:
df_cleaned.head()

In [None]:
# df_cleaned contains rows from both scripts and every row already carries its
# own Movie label from the source frames. Assigning the constant 'The Matrix'
# here would relabel all "The Matrix Reloaded" rows, so the column is only checked.
assert df_cleaned['Movie'].notna().all(), "every row must keep its Movie label"

In [None]:
df_cleaned.head()

In [None]:
# Save the merged, cleaned frame under its own file name. Writing it to
# 'script_cleaned_TM.pkl' would overwrite the file that df1 is loaded from,
# so re-running the notebook would start from already-merged data.
import pickle
with open('script_cleaned_merged.pkl', 'wb') as f:
    pickle.dump(df_cleaned, f)

In [None]:
#transform into words -- done
#removing words stops -- not sure
#counts words -- done
#counts words per actor -- not yet
#graphs
#topic modeling

# Tokenize

In [None]:
# Convert sentences to lowercase word tokens with the stopwords removed
def sent_to_words(sentences):
    """Tokenize ``sentences`` and drop every token found in ``stopwords_list``.

    Parameters
    ----------
    sentences : iterable of str
        Raw text fragments. Callers in this notebook pass a one-element list
        per DataFrame row.

    Returns
    -------
    list of str
        One flat list with the surviving tokens of all sentences, in order.
        An empty iterable yields an empty list.
    """
    stop_words = set(stopwords_list)  # set lookup instead of scanning the list per token
    words = []
    for sent in sentences:
        sent = re.sub('\t', ' ', sent)  # replace tab chars with a space
        sent = re.sub('\n', ' ', sent)  # replace newline chars with a space
        sent = re.sub("\'", "", sent)  # remove single quotes
        tokens = gensim.utils.simple_preprocess(str(sent), deacc=True)
        # removing stopwords
        words.extend(token for token in tokens if token not in stop_words)
    return words

In [None]:
# Tokenize each row's text on its own (sent_to_words expects a list of sentences)
df_cleaned['Words'] = df_cleaned['Text'].map(lambda text: sent_to_words([text]))

In [None]:
df_cleaned.head()

In [None]:
# Plain-Python views of the corpus; the modelling cells further down read both names.
#   data       - the raw text of every row
#   data_words - one token list per row
data = df_cleaned.Text.values.tolist()
data_words = [sent_to_words([text]) for text in data]
print(data_words[:1])

In [None]:
df_cleaned.Words[0]

In [None]:
# Number of tokens kept for each row
df_cleaned['No_Words'] = df_cleaned['Words'].apply(len)

In [None]:
df_cleaned.head()

In [None]:
# Total number of words per speaker. Only No_Words is aggregated: summing the
# whole frame would also try to "add up" the Text, Movie and Words columns.
df_cleaned.groupby('Speaker')[['No_Words']].sum()

In [None]:
len(data)

# Build Bigram, Trigram Models and Lemmatize

In [None]:
# Train the phrase detectors; a higher threshold produces fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
# The trigram detector is trained on documents that already had bigrams merged.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)

# Wrap each trained detector in a Phraser, which is what gets applied to documents.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(detector) for detector in (bigram, trigram)
)

# !python3 -m spacy download en  # run in terminal once

In [None]:
def process_words(texts, stop_words=stopwords_list, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams and trigrams, and lemmatize.

    Relies on the module-level ``bigram_mod`` and ``trigram_mod`` phrase models.

    Parameters
    ----------
    texts : iterable
        Documents; each one is passed through ``str()`` and ``simple_preprocess``,
        so token lists as well as plain strings are accepted.
    stop_words : iterable of str
        Tokens to discard, both before phrase merging and after lemmatization.
    allowed_postags : iterable of str
        Coarse spaCy part-of-speech tags (``token.pos_``) whose lemmas are kept.

    Returns
    -------
    list of list of str
        One list of processed tokens per input document.
    """
    # Sets give constant-time membership tests; the arguments themselves are not modified.
    stop_set = set(stop_words)
    keep_pos = set(allowed_postags)

    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # spaCy 3 dropped shortcut names such as 'en'; try the full package name
    # first and fall back to the legacy shortcut for older installations.
    try:
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    except OSError:
        nlp = spacy.load('en', disable=['parser', 'ner'])

    # nlp.pipe processes the documents as a stream instead of one call per document.
    parsed_docs = nlp.pipe(" ".join(sent) for sent in texts)
    texts_out = [[token.lemma_ for token in doc if token.pos_ in keep_pos] for doc in parsed_docs]

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts_out]
    return texts_out

In [None]:
# Run stopword removal, phrase merging and lemmatization over every document in data_words
data_ready = process_words(data_words)

In [None]:
data_ready[:1]

# Build the Topic Modeling

In [None]:
# Dictionary built from the processed documents
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency (one bag-of-words per document)
corpus = list(map(id2word.doc2bow, data_ready))

# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=6,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='symmetric',
    iterations=100,
    per_word_topics=True,
)

In [None]:
# Print every topic returned by the model, each followed by a separator line
for topic in lda_model.print_topics():
    print(topic, "--------", sep="\n")

# Dominant topic and its percentage contribution in each text/action

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Note: the ``corpus`` and ``texts`` defaults are bound to the module-level
    ``corpus`` and ``data`` objects when this definition is executed, so both
    names must already exist at that point.

    Parameters
    ----------
    ldamodel : trained LDA model
        Must support ``ldamodel[corpus]``, ``show_topic`` and ``per_word_topics``.
    corpus : list
        Bag-of-words documents to score.
    texts : sequence
        One entry per document; appended as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' and 'Topic_Keywords',
        followed by one unnamed column holding ``texts``.
    """
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic
    records = []

    # Get main topic in each document
    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled the topic distribution is the first item of the result
        row = row_list[0] if ldamodel.per_word_topics else row_list

        if not row:
            # Keep a placeholder so the rows stay aligned with ``texts``
            records.append((float('nan'), float('nan'), ''))
            continue

        # Dominant topic = highest proportion; on ties the first one listed wins
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = ldamodel.show_topic(topic_num)
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame in one go; DataFrame.append no longer exists in pandas 2.x
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [None]:
# One row per document: dominant topic, its contribution, its keywords and the processed tokens
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

In [None]:
# Format: turn the row index into an explicit Document_No column and
# give all five columns readable names
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)

In [None]:
#Let's have a look
df_dominant_topic.shape

In [None]:
df_dominant_topic.Dominant_Topic.value_counts(normalize=True)

In [None]:
df_cleaned.shape

# Most representative action for each topic

In [None]:
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

# For every dominant topic keep the single row with the highest contribution.
# The pieces are collected first and concatenated once, instead of growing a
# DataFrame inside the loop.
sent_topics_sorteddf_mallet = pd.concat(
    [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
     for _, grp in sent_topics_outdf_grpd],
    axis=0,
).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

# Show
sent_topics_sorteddf_mallet.head(10)

# Word clouds of the top keywords in each topic

In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

# One colour per topic; the list is cycled if the model has more topics than colours.
cols = ['blue', '#fd8d49', 'green', '#9e003a', 'purple', '#8c564b']


def _fixed_color(color):
    """Return a WordCloud color_func that paints every word in ``color``."""
    return lambda *args, **kwargs: color


# num_topics=-1 asks for every topic of the model, not only the default first ten.
topics = lda_model.show_topics(num_topics=-1, formatted=False, num_words=30)

# Size the grid from the number of topics so that no topic is silently left out.
n_cols = 2
n_rows = max(1, -(-len(topics) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 5 * n_rows),
                         sharex=True, sharey=True, squeeze=False)

for i, ax in enumerate(axes.flatten()):
    ax.axis('off')
    if i >= len(topics):
        continue  # spare grid cell when the topic count is odd

    topic_id, word_weights = topics[i]
    cloud = WordCloud(stopwords=set(stopwords_list),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=30,
                      colormap='tab10',
                      color_func=_fixed_color(cols[i % len(cols)]),
                      prefer_horizontal=1.0)
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    ax.imshow(cloud)
    ax.set_title('Topic ' + str(topic_id), fontdict=dict(size=16))

fig.tight_layout()
fig.savefig('topic_wordcloud.png', dpi=180)
plt.show()