# Installing dependencies

In [None]:
%pip install scipy==1.10.1
%pip install scikit-learn gensim nltk pysrt bs4 contractions

In [None]:
import string
import pysrt
from bs4 import BeautifulSoup
import contractions
import nltk

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download("stopwords")
nltk.download("punkt")

from gensim.models import Word2Vec
from sklearn.cluster import KMeans

# Preprocessing Text
We first need to preprocess the text, since it will be taken from a subtitle file. We use the `pysrt` package to extract the text from the subtitle file, but then we also need to clear out the HTML tags (`<i>...</i>`), expand contractions (i'll => i will), and remove stopwords (words such as i, he, she, am, etc. that aren't useful to our analysis).
We tokenize the text with the `nltk` library, and create an array of sentences, where each sentence is an array of words.
                                                                                                                                                                                                                                                                    

In [28]:
# Preprocess Text

def separate_hypenated_word(sentence):
    """Split every hyphenated token of a sentence into its component words.

    Args:
        sentence: Iterable of word tokens (strings).

    Returns:
        A new list in which each token containing ``'-'`` has been replaced
        by its non-empty hyphen-separated fragments, in order. Tokens without
        a hyphen are copied through unchanged.
    """
    pieces = []
    for token in sentence:
        if '-' not in token:
            pieces.append(token)
            continue
        # Leading, trailing or doubled hyphens produce empty fragments; drop them.
        pieces.extend(fragment for fragment in token.split('-') if fragment)
    return pieces

def lemmatize(sentence, lemmatizer):
    """Lemmatize each token of a sentence.

    Args:
        sentence: Iterable of word tokens (strings).
        lemmatizer: Object exposing a ``lemmatize(word)`` method, e.g. an
            ``nltk.stem.WordNetLemmatizer`` instance.

    Returns:
        A list with the lemma of every token, in the original order. The
        lemma ``'cdos'`` is rewritten to ``'cdo'``.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in sentence)
    # NOTE(review): presumably the lemmatizer leaves the plural 'cdos'
    # untouched, hence the manual singularisation - confirm.
    return ['cdo' if lemma == 'cdos' else lemma for lemma in lemmas]
            
def tokenize_subtitle(subtitle_path):
    """Turn an SRT subtitle file into a list of cleaned, tokenized sentences.

    The subtitle text is joined into one string, stripped of HTML markup
    (``<font>`` elements are dropped together with their contents), has its
    contractions expanded, and is split into sentences. Each sentence is then
    word-tokenized, lower-cased, split on hyphens, filtered against stop
    words and punctuation, and lemmatized.

    Args:
        subtitle_path: Path to a UTF-8 encoded ``.srt`` file.

    Returns:
        A list of sentences, each a list of word tokens. Sentences that end
        up with fewer than two tokens are omitted.
    """
    # Load every subtitle entry and merge the text into a single string
    subtitles = pysrt.open(subtitle_path, encoding='utf-8')
    raw_text = ' '.join(entry.text for entry in subtitles)

    # Strip markup: <font> elements go entirely, other tags keep their text
    markup = BeautifulSoup(raw_text, 'html.parser')
    for font_element in markup.find_all('font'):
        font_element.decompose()
    expanded_text = contractions.fix(markup.get_text())

    # Sentence boundaries first, word-level processing afterwards
    raw_sentences = sent_tokenize(expanded_text)

    # Everything that should be discarded: English stop words, single
    # punctuation characters and a few tokenizer artefacts
    ignored = set(stopwords.words('english'))
    ignored.update(string.punctuation)
    ignored.update(['...', '``', '\'\'', '\'s', 'us'])
    lemmatizer = WordNetLemmatizer()

    result = []
    for raw_sentence in raw_sentences:
        lowered = [
            token.lower()
            for token in word_tokenize(raw_sentence)
            if token and not token.isdigit()
        ]
        kept = [
            token
            for token in separate_hypenated_word(lowered)
            if token not in ignored
        ]
        lemmas = lemmatize(kept, lemmatizer)

        # Single-word (or empty) sentences are skipped
        if len(lemmas) > 1:
            result.append(lemmas)

    return result

# Path to the subtitle file (a relative path)
subtitle_path = "big_short.srt"

# Tokenize the subtitle file and remove stopwords, then print one cleaned
# sentence (a list of word tokens) per line for a quick visual check
tokens = tokenize_subtitle(subtitle_path)
for sentence in tokens:
    print(sentence)

['hiya', 'frank']
['wife', 'kid']
['know', 'considering', 'treasury', 'bond', 'utility', 'stock']
['late', "'70s", 'banking', 'job', 'went', 'make', 'large', 'sum', 'money']
['fucking', 'snooze']
['filled', 'loser']
['like', 'selling', 'insurance', 'accounting']
['banking', 'boring', 'bond', 'department', 'bank', 'straight', 'comatose']
['know', 'bond']
['give', 'snot', 'nosed', 'kid', 'turn']
['maybe', 'make', 'hundred', 'buck']
['lewis', 'ranieri', 'came', 'scene', 'salomon', 'brother']
['might', 'know', 'changed', 'life', 'michael', 'jordan', 'ipod', 'youtube', 'put', 'together']
['right', 'gentleman']
['let', 'get', 'money']
['let', 'make', 'money']
['see', 'lewis', 'know', 'yet', 'already', 'changed', 'banking', 'forever', 'one', 'simple', 'idea']
['mortgage', 'backed', 'security']
['private', 'label', 'mb']
['got', 'average', 'person', 'mortgage']
['fixed', 'rate', 'year']
['boring', 'safe', 'small', 'payoff', 'right']
['thousand', 'bundled', 'together', 'suddenly', 'yield', 'go'

# What is an embedding?
To be able to run computations over language, it is often convenient to create an embedding - a mapping - from words to a mathematical entity, here an n-dimensional vector. Words that are used in similar contexts end up closer together in the vector space, so for example `king` is closer to `man` than to `cat`. The direction and magnitude of the vectors also encode information, which allows us to add the vector `queen - king` to the vector for `man` to get (approximately) `woman`.

# Generating word embeddings
To generate word embeddings, we use the `gensim` library's `Word2Vec` class, which by default trains a continuous-bag-of-words (CBOW) model. The model tries to predict a word from the words surrounding it within a context window; words that end up with similar weights in the model are the words that are used in similar contexts (the words preceding or following them are similar). These weights give us the required embeddings.


In [43]:
# Fit Word2Vec on the tokenized sentences; words seen fewer than
# 5 times are left out of the vocabulary (min_count=5)
model = Word2Vec(tokens, vector_size=1000, window=100,  min_count=5, workers=1)

# Keyed vectors: lookup table from word to its embedding
word_vectors = model.wv

stop_words = set(stopwords.words('english') + list(string.punctuation) + ['...', '``', '\'\'', '\'s'])

# Vocabulary words that survive the stop-word filter, and their vectors
# in the same order (index i of one list matches index i of the other)
words_list = [word for word in word_vectors.key_to_index if word not in stop_words]
word_vector_list = [word_vectors[word] for word in words_list]


# Sanity check of the embedding: nearest neighbours of two key terms
print(word_vectors.most_similar('cdo'))
print(word_vectors.most_similar('mortgage'))


[('mortgage', 0.3596998453140259), ('wall', 0.3561512231826782), ('bond', 0.3501337170600891), ('swap', 0.3445431590080261), ('one', 0.30935052037239075), ('know', 0.3079161047935486), ('whole', 0.28944915533065796), ('housing', 0.2874480187892914), ('year', 0.28718188405036926), ('people', 0.28689122200012207)]
[('bond', 0.41095292568206787), ('one', 0.4016731083393097), ('wall', 0.39693984389305115), ('housing', 0.3893378973007202), ('swap', 0.3700407147407532), ('bank', 0.3670002520084381), ('cdo', 0.3596998453140259), ('know', 0.358483225107193), ('subprime', 0.35628193616867065), ('pay', 0.3538276255130768)]


### Interpretation of the results
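Since the movie `The Big Short` is about the 2008 housing crisis, we expect a lot of talk regarding CDOs and mortgages. The output reflects this: the nearest neighbours of `cdo` and `mortgage` include other finance terms such as `bond`, `swap`, `housing`, and `subprime`.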

# Clustering
We use the K-means algorithm to cluster the embeddings. We create 10 clusters, and then run some qualitative tests to see how words are clustered together.

In [44]:
# Cluster the word vectors with K-means (10 clusters).
# random_state is fixed so the results are replicable.
kmeans = KMeans(n_clusters=10, random_state=90).fit(word_vector_list)
clusters = kmeans.labels_.tolist()

# Group the words by cluster label; words_list and clusters are aligned
# index-by-index, so they can be walked in lockstep
clustered_words = {cluster: [] for cluster in set(clusters)}
for word, label in zip(words_list, clusters):
    clustered_words[label].append(word)

# Show the members of every cluster
for cluster, words in clustered_words.items():
    print(f"Cluster {cluster}: {words}")

Cluster 0: ['know', 'bond', 'mortgage', 'going', 'get', 'like', 'one', 'go', 'bank', 'people', 'cdo', 'swap', 'year', 'shit', 'okay', 'good', 'housing', 'wrong', 'big', 'short', 'subprime', 'thing', 'pay', 'morgan', 'bear', 'two', 'whole', 'world', 'rating', 'wall', 'something', 'aaa', 'loss', 'agency']
Cluster 1: ['time', 'hey', 'always', 'baum', 'michael', 'house', 'hi', 'lewis', 'paying', 'anybody', 'underlying', 'found', 'together', 'asking', 'poor', 'nothing', 'yet', 'seem']
Cluster 2: ['want', 'money', 'got', 'look', 'loan', 'burry', 'fuck', 'said', 'made', 'job', 'home', 'still', 'financial', 'kathy', 'backed', 'selling', 'collapse', 'fine', 'started', 'return', 'hold', 'explain', 'lost', 'almost', 'solid']
Cluster 3: ['right', 'would', 'let', 'ben', 'bet', 'way', 'tell', 'fund', 'come', 'number', 'credit', 'actually', 'put', 'street', 'man', 'jared', 'later', 'charlie', 'mike', 'wanted', 'dog', 'help', 'idea', 'sound', 'sold', 'might', 'trying', 'wife', 'four', 'told', 'ago', '

In [45]:
# Qualitative check of the clustering: report which cluster each of these
# finance-related probe words was assigned to.
words_to_check = [
    "cdo",
    "bond",
    "subprime",
    "mortgage",
    "swap",
    "money",
    "loan",
    "stock",
    "market",
]
for cluster, words in clustered_words.items():
    for word in words_to_check:
        # Probe words missing from this cluster are simply skipped
        if word not in words:
            continue
        print(f"{word} is in Cluster {cluster}")
# Result: Most words are clustered together.


cdo is in Cluster 0
bond is in Cluster 0
subprime is in Cluster 0
mortgage is in Cluster 0
swap is in Cluster 0
money is in Cluster 2
loan is in Cluster 2
stock is in Cluster 4
market is in Cluster 4


### Interpretation of the results
The results are illustrative of the financial instruments named throughout the movie.
- `stock` and `market` are in the same cluster (cluster 4), since `stock market` is used collectively a lot
- `swap` landed in cluster 0 while `credit` landed in cluster 3, so they were not grouped together here, even though the swaps bought up by the protagonists were `credit default swaps`
- `subprime`, `bond`, and `mortgage` are clubbed together (cluster 0), since the bonds were of subprime mortgages