# Homework #2

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
import json
import re

import numpy as np
import pandas as pd

# Problem 1: Word phrases

### In this problem we will look at methods to identify valid n-grams such as 'New York' or 'Barack Obama' while eliminating statistical flukes such as `in the` or `i write`.

### Preprocessing such as this can drastically improved embeddings since words can ngrams will often have a different meaning than the sum of its parts
### `V('united')` + `V('states')` != `V('united states')`
### `V('real')` + `V('estate')` != `V('real estate')`

In [12]:
wiki_df = pd.read_csv('./data/kdwd_r1k_articles.csv')

### get consecutive unigrams for the 'intro_text' column of our dataset

In [14]:
# list of lists of unigrams
unigram_pattern = r'[a-z0-9]+'
corpus = [re.findall(unigram_pattern, doc.lower()) for doc in wiki_df['intro_text'].tolist()]

### The package `gensim` has a convenient wrapper to obtain statistically significant ngrams/Phrase automatically

### we need to first `pip install gensim`
### `gensim` is a useful library for anything related to word representations and embeddings. It will come up a few more times. https://radimrehurek.com/gensim/index.html

In [15]:
import sys
!{sys.executable} -m pip install gensim
from gensim.models.phrases import Phrases



### Write some code to parse our corpus and use valid ngrams using `Phrases`

In [16]:
phrases = Phrases(corpus, min_count=1, threshold=1)

In [17]:
vocab_count_dict = {k.decode('utf8'): v for k, v in phrases.vocab.items()}

n_grams = pd.Series(data=vocab_count_dict).sort_values(ascending=False)

In [18]:
print(n_grams.shape[0], 'n-grams found')
n_grams.head(10)

60689 n-grams found


the            4873
and            4173
in             3706
of             2422
company        1884
is             1686
a              1336
to             1072
the_company     999
s               961
dtype: int64

### How do the results look? Can you improve the results by excluding common terms using the `common_terms` kwarg of `Phrases`?

In [19]:
from sklearn.feature_extraction import stop_words

phrases = Phrases(corpus, min_count=1, threshold=1, common_terms=stop_words.ENGLISH_STOP_WORDS)



In [20]:
vocab_count_dict = {k.decode('utf8'): v for k, v in phrases.vocab.items()}
n_grams = pd.Series(data=vocab_count_dict).sort_values(ascending=False)

In [21]:
print(n_grams.shape[0], 'n-grams found')
n_grams.head(10)

55985 n-grams found


company          1884
s                 961
american          498
largest           453
states            451
united            443
services          418
united_states     393
corporation       391
products          360
dtype: int64

## This was convenient, but it's also a black box where many of the knobs for tuning are actually broken in the newest version. Let's try to create our own solution for finding n-grams.

### To do this, let's start by counting unigrams and bigrams within our corpus

### Tip: use Counter for easy counting. It behaves similar to a dictionary with some added functionality around counting. such as `my_counter[unknown_key]` returning `0` for all unknown keys

In [23]:
from collections import Counter

unigram_count = Counter()
bigram_count = Counter()
for doc in corpus:
    # your code here
    n = len(doc)
    for i in range(n-1):
        unigram_count[doc[i]] += 1
        bigram_count[doc[i]+' '+doc[i+1]] += 1
    unigram_count[doc[n-1]] += 1

### Now we need to come up with a score for each bigram that helps us decide on its importance and the fact of whether it is truly a bigram or two independent unigrams.

In [25]:
# your code here
tf_idf = dict()
n = wiki_df.shape[0]
for token in bigram_count.keys():
    max_tf = 0
    df = 0
    
    for document in wiki_df['intro_text'].tolist():
        
        document = document.lower()
        tf = document.count(token)/(len(document.split())-1)
        if tf > 0: df += 1
        max_tf = max(tf, max_tf)
    
    tf_idf[token] = max_tf*(np.log((1+n)/(1+df))+1)

bigram_df = pd.DataFrame(tf_idf.items(), columns=['bigram','tf-idf'])

In [26]:
bigram_df.head(10)

Unnamed: 0,bigram,tf-idf
0,apple inc,0.034528
1,inc is,0.121065
2,is an,0.147476
3,an american,0.158072
4,american multinational,0.282122
5,multinational technology,0.077946
6,technology company,0.122653
7,company headquartered,0.192896
8,headquartered in,0.145858
9,in cupertino,0.01027


### Find ways to sort and filter your output to bigrams that make sense, such as `wells fargo`, `apple inc` or `puerto rico`

In [27]:
# your code here
filtered_bigram_df = bigram_df[bigram_df['tf-idf']>0.01].sort_values(by=['tf-idf'], ascending=False)

In [28]:
filtered_bigram_df.head(10)

Unnamed: 0,bigram,tf-idf
19817,marathon oil,0.747124
50072,reinsurance company,0.69428
50069,everest re,0.69428
50071,a reinsurance,0.69428
29985,in hamilton,0.653733
49739,b2b it,0.631164
49738,provides b2b,0.631164
49911,alexandria real,0.631164
49912,estate equities,0.631164
49913,equities is,0.631164


# Problem 2: Word vectors via Pointwise Mutual Information (PMI)

### In this problem we will investigate another way of creating word representation from word co-occurrences. For this we will create a word-word matrix that counts the number of times that two words appear close to each other.

## More formally:

### The pointwise mutual information (PMI) for a (word, context) pair in a corpus is defined as the probability of their co-occurrence divided by the probabilities of them appearing individually, 
## $$
{\rm pmi}(w, c) = \log \frac{p(w, c)}{p(w) p(c)}
$$

## $$
p(w, c) = \frac{
f_{i,j}
}{
\sum_{i=1}^N \sum_{j=1}^N f_{i,j}
}, \quad 
p(w) = \frac{
\sum_{j=1}^N f_{i,j}
}{
\sum_{i=1}^N \sum_{j=1}^N f_{i,j}
}, \quad
p(c) = \frac{
\sum_{i=1}^N f_{i,j}
}{
\sum_{i=1}^N \sum_{j=1}^N f_{i,j}
}
$$
### where $f_{i,j}$ is the word-word count matrix. <br />
### In addition we can define the positive pointwise mutual information as, 
## $$
{\rm ppmi}(w, c) = {\rm max}\left[{\rm pmi(w,c)}, 0 \right]
$$

## We will implement this on our wiki featured articles dataset

In [29]:
wiki_feat_df = pd.read_csv('./data/kdwd_featured_articles.csv')
wiki_feat_df.sample(5)

Unnamed: 0,page_id,page_title,page_views,intro_text
4645,23325915,Byzantine civil war of 1341–1347,932,"The Byzantine civil war of 1341–1347, sometime..."
2632,2072335,Lambeosaurus,1232,"'Lambeosaurus' ( ; meaning ""Lambe's lizard"") i..."
1385,339877,Simeon I of Bulgaria,2458,"Tsar Simeon (also Symeon) I the Great (, trans..."
895,161190,Star Trek VI: The Undiscovered Country,8021,'Star Trek VI: The Undiscovered Country' is a ...
5383,41908173,Streatham portrait,443,"The ""Streatham"" portrait is an oil painting on..."


In [30]:
corpus = wiki_feat_df['intro_text'].tolist()

In [31]:
def get_tokens(text):
    token_pattern = r"\b\w\w+\\?'?s?(?:-\w+)?\b"
    return re.findall(token_pattern, text.lower())

In [32]:
from collections import Counter

In [33]:
unigram_counts = Counter()
for doc in corpus:
    # your code here
    tokens = get_tokens(doc)
    for word in tokens:
        unigram_counts[word] += 1
vocab = {token: n for n, token in enumerate(pd.Series(unigram_counts).sort_values().index)}
inv_vocab = {v: k for k, v in vocab.items()}
print('vocabulary size: {}'.format(len(unigram_counts)))

vocabulary size: 72750


### Skip-grams are a generalization of n-grams: https://en.wikipedia.org/wiki/N-gram#Skip-gram
### We will use this term here to find pairs of word within a context window, meaning that all words separated by max N words will be considered a bigram

In [34]:
# use skip-2-grams and context length 2 in each direction
word_window_len = 2
skipgram_counts = Counter()
for doc in corpus:
    tokens = get_tokens(doc)
    for token_idx, token in enumerate(tokens):
        for context_token in tokens[token_idx - word_window_len:token_idx + word_window_len]:
            if token != context_token:
                skipgram_counts[(vocab[token], vocab[context_token])] += 1
print('number of skipgrams:', len(skipgram_counts))
print('most common:')

[((inv_vocab[t1], inv_vocab[t2]), v) for (t1, t2), v in skipgram_counts.most_common(5)]

number of skipgrams: 1981470
most common:


[(('of', 'the'), 42125),
 (('the', 'of'), 23088),
 (('in', 'the'), 18020),
 (('the', 'in'), 17652),
 (('and', 'the'), 11390)]

### Now let's create a sparse matrix that contains word-word co-occurrence counts

In [35]:
from scipy import sparse as ssp

row_indxs = []
col_indxs = []
dat_values = []


# your code here
row_indxs = [t1 for (t1, t2), v in skipgram_counts.items()]
col_indxs = [t2 for (t1, t2), v in skipgram_counts.items()]
dat_values = [v for (t1, t2), v in skipgram_counts.items()]

In [36]:
wwcnt_mat = ssp.csr_matrix((dat_values, (row_indxs, col_indxs)))
wwcnt_mat

<72750x72750 sparse matrix of type '<class 'numpy.intc'>'
	with 1981470 stored elements in Compressed Sparse Row format>

### Next, create the PPMI matrix

In [37]:
# reusable quantities
num_skipgrams = wwcnt_mat.sum()
sum_over_words = np.array(wwcnt_mat.sum(axis=0)).flatten()
sum_over_contexts = np.array(wwcnt_mat.sum(axis=1)).flatten()

ppmi_dat_values = []   # positive pointwise mutial information
row_indxs = []  # for creating sparce matrices
col_indxs = []  # for creating sparce matrices
for (tok_word, tok_context), sg_count in skipgram_counts.items():

    nwc = sg_count
    Pwc = nwc / num_skipgrams
    nw = sum_over_words[tok_word]
    Pw = nw / num_skipgrams
    nc = sum_over_words[tok_context]
    Pc = nc / num_skipgrams
    
    pmi = np.log2(Pwc / (Pw * Pc))   
    ppmi = max(pmi, 0)
    
    row_indxs.append(tok_word)
    col_indxs.append(tok_context)
    ppmi_dat_values.append(ppmi)

ppmi_mat = ssp.csr_matrix((ppmi_dat_values, (row_indxs, col_indxs)))
ppmi_mat

<72750x72750 sparse matrix of type '<class 'numpy.float64'>'
	with 1981470 stored elements in Compressed Sparse Row format>

## Use `ppmi_mat` to investigate the most similar values to a few test terms.

In [38]:
# to speed up calculation we do dimentionality reduction here
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=64, random_state=6006)
trafo_ppmi_mat = svd.fit_transform(ppmi_mat)
sim_mat = cosine_similarity(trafo_ppmi_mat)

MemoryError: Unable to allocate 39.4 GiB for an array with shape (72750, 72750) and data type float64

In [39]:
token = 'eminem'
# print most similar terms
word_idx = vocab[token]
for idx, row in pd.Series(sim_mat[word_idx]).sort_values(ascending=False).head(10).iteritems():
    print('%.3f' % row, inv_vocab[idx])

NameError: name 'sim_mat' is not defined

In [40]:
token = 'quantum'
# print most similar terms
word_idx = vocab[token]
for idx, row in pd.Series(sim_mat[word_idx]).sort_values(ascending=False).head(10).iteritems():
    print('%.3f' % row, inv_vocab[idx])

NameError: name 'sim_mat' is not defined

## In what way do these embeddings differ the TfIdf based ones we covered in class? Can you think of advantages/disadvantages for each approach?

In [None]:
# Your answer here!
print('The difference between these embeddings and TfIdf is that these embeddings find similar terms by cosine similarity, while TfIdf based ones do that by word counting. The advantage of cosine similarity is that it can find out the relationship among different words, which is the disadvantage of TfIdf based ones since they focus on word counting. The disadvantage of these embeddings is that it requires huge amount and types of data/words to calculate the relationship, while TfIdf based ones do not need.')

# Problem 3: Word vectors for different domains


### In this problem we will creat embeddings for the `intro_text` column of the datasets `kdwd_featured_articles.csv` and `kdwd_r1k_articles.csv`
### We can think of these as examples of 'generic' and 'finance specific' word representations

## The goal of this exercise is to compare these two representations and find out which words change meaning the most across these two domains

In [1]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as ssp
from scipy.sparse import csr_matrix
import pandas as pd
import numpy as np

In [2]:
generic_df = pd.read_csv('./data/kdwd_featured_articles.csv')

In [3]:
finance_df = pd.read_csv('./data/kdwd_r1k_articles.csv')

### Create word representation for our 2 corpora using your favorite method

In [4]:
vectorizer = TfidfVectorizer(min_df=3, max_df=1.0)
generic_mat = vectorizer.fit_transform(generic_df['intro_text'].tolist())
generic_vocab = {token: n for n, token in enumerate(pd.Series(vectorizer.vocabulary_).sort_values().index)}

vectorizer = TfidfVectorizer(min_df=3, max_df=1.0)
finance_mat = vectorizer.fit_transform(finance_df['intro_text'].tolist())
finance_vocab = {token: n for n, token in enumerate(pd.Series(vectorizer.vocabulary_).sort_values().index)}

### Since our two corpora use different vocabulary we want to sub-select each representation matrix to be only of vacabulary tokens that occur in both corpora

In [5]:
common_terms = list(set(generic_vocab) & set(finance_vocab))

In [6]:
filtered_generic_mat = []
filtered_finance_mat = []

# your code here
for word in common_terms:
    filtered_generic_mat.append(generic_mat[:,generic_vocab[word]])
    filtered_finance_mat.append(finance_mat[:,finance_vocab[word]])

filtered_generic_mat = ssp.hstack(filtered_generic_mat)
filtered_finance_mat = ssp.hstack(filtered_finance_mat)

### Our documents for each corpus are different so there is no direct way of comparing our two representations, even though they now have the same dimension. To get them on equal footing, let's look at the word-word similarlity matrix for each domain.
### Comparing these two, find terms that seem to have a drastically different meaning within the two domains.

In [7]:
# get the term-term similarity matrix
generic_term_sim_mat = cosine_similarity(filtered_generic_mat.T)
finance_term_sim_mat = cosine_similarity(filtered_finance_mat.T)

In [8]:
term_drift_scores = {}
for n, term in enumerate(common_terms):
    term_sim = cosine_similarity([generic_term_sim_mat[n]],[finance_term_sim_mat[n]])
    term_drift_scores[term] = term_sim

In [9]:
term_drifts = pd.Series(term_drift_scores).sort_values()

In [10]:
term_drifts.head(10)

highway            [[0.24520438464636277]]
staffing            [[0.2871955698477838]]
written             [[0.2912808573896264]]
pharmaceuticals    [[0.29643837015355917]]
formula            [[0.29802407875828485]]
fruit              [[0.29827656395060986]]
king               [[0.29960946318389897]]
preferred          [[0.30167333531360996]]
proved              [[0.3026852090127874]]
won                [[0.30302971798690137]]
dtype: object

In [11]:
term_drifts.tail(10)

for     [[0.8114789418584171]]
its     [[0.8152663255658184]]
with    [[0.8206452301148983]]
and     [[0.8304891735052484]]
in      [[0.8319360747475575]]
as       [[0.833800459880701]]
by      [[0.8361606478193372]]
of      [[0.8423985374468572]]
the     [[0.8526193824575488]]
to      [[0.8538485132645217]]
dtype: object

# Problem 4: Corporate Similarity and Returns
### In this example we'll explore how to use NLP to measure corporate similarity

### In particular we will
 - ### Make word vectors for firms in order to get an NLP measure of similarity
 - ### Measure the quality of this similarity metric by predicting future co-movement of returns. 
 
## Step X: This problem uses a few concepts of basic modeling such as `sklearn.model_selection.train_test_split` and `sklearn.linear_model.LinearRegression`
## Feel free to read some of the sklearn documentation, but otherwise we will cover these concepts next class
 

# $ \\ $
## Step 0: Load the MD&A section from Form-10-K from 2016

In [None]:
with open('parsed_mda.json') as f:
    data = json.load(f)

### Next, take only the first filing for each company

In [None]:
clean = []
seen = set()
for item in data:
    if item['ticker'] in seen:
        continue
    else:
        seen.add(item['ticker'])
        clean.append(item)
data = clean
del clean

### Now load the price data for 2015-2018

In [None]:
prices = pd.read_csv(
    'sp500_prices.csv', 
    index_col=0, 
    parse_dates=True
).loc['2015-01-01':'2018-01-01']

In [None]:
prices.head()

In [None]:
data_tickers = [item['ticker'] for item in data]
assert len(data_tickers) == len(set(data_tickers)), 'non-unique tickers, this will not work'

## Step 1: clean the text
### Much of NLP boils down to doing reasonable processing on text.
### First, we'll try out very minimial processing

In [None]:
def clean_mda_simple(mda):
    return mda.lower()

In [None]:
# add import here
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
word_lower = []
for item in data:
    word_lower.append(clean_mda_simple(item['mda']))
word_vecs = vectorizer.fit_transform(word_lower)
vocab = {token: n for n, token in enumerate(pd.Series(vectorizer.vocabulary_).sort_values().index)}
inv_vocab = {v: k for k, v in vocab.items()}

## Step 2: Pairwise Word similarity
### Calculate the pariwise cosine similarity between word vectors
### Make the cosine similarities into a dataframe indexed/columned on ticker symbols

In [None]:
word_sims = cosine_similarity(word_vecs)

In [None]:
df = pd.DataFrame(word_sims, index = data_tickers, columns = data_tickers)

In [None]:
df.head()

## Step 2a: Why `cosine_similarity` and not another measure?

In [None]:
# ANSWER GOES HERE
print("cosine_similarity produces results with absolute value equal or smaller than 1, which are easy to understand and compare.")

## Step 3: Wrangle the price and word data
### Our goal here is to have a dataframe which is indexed on PAIRS of tickers and has columns
 - ### `returns_correlation`: the correlation of returns for those two tickers from Jan 1 2016 to Jan 1 2017
 - ### `word_similarity`: the cosine similarity of the word vectors for the two companies' MD&A sections
 
## Tips
 - ### NB: use pct_change to calculate returns in pandas
 - ### NB: use the pandas builtin corr function to calculate correlations (we don't need anything fancy)
 - ### NB: the index of the dataframe should have two columns (the tickers)

In [6]:
# one way you might do this is
rets = prices.pct_change()
rets = rets.loc['2016-01-01':'2017-01-01']
rets_cor = rets.corr().stack().to_frame(name = 'returns_correlation') # calculate returns correlations
word_cor = df.stack()#  calcuate the word similarities in the right shape
word_cor.name = 'word_similarity'
all_data = rets_cor.join(word_cor)
all_data = all_data.dropna()
all_data.head()

NameError: name 'prices' is not defined

## Step 3a: 
 - ### What is the contemperaneous correlation of these data?
 - ### Make a scatter plot of the returns correlation and word similarities

In [None]:
all_data.corr()

In [None]:
## This should be about 12%. That's not bad, but we can do better

In [None]:
all_data.plot.scatter(x='returns_correlation', y='word_similarity')

## Part 4: Try to predict the future returns correlations
### Use OLS (`LinearRegression`) to predict `returns_correlation` from `word_similarity`. 
### What is the (contemperaneous) out of sample performance?

In [3]:
data_df = np.array(all_data)
feature_cols = data_df[:,1:]
target_col = data_df[:,0]

NameError: name 'all_data' is not defined

In [None]:
data_df

In [None]:
# add code here
from sklearn import linear_model
from sklearn.model_selection import train_test_split

reg = linear_model.LinearRegression()# add code here
X_train, X_test, y_train, y_test = train_test_split(feature_cols,target_col,test_size=0.4, random_state=0)

reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
#pd.Series(reg.coef_, index=feature_cols)

### This is not amazing. We can do better!

## $ \\ $ 

## Part 5: Repeat, but be careful
### Here we will see if we can clean the data better

### Things to try
 - ### Look at the histograms of word similarities to see if we can "ignore" some ill-behaved data
 - ### Try limiting how greedy the `TFIDFVectorizer` is: `min_df`, `max_df`, `max_features`, etc.
 
### We will examine our data and look for things that look out of place
 - ### We will ultimately want our data to look normally distributed


In [None]:
def clean_mda(mda):
    paras = [p.lower() for p in mda.split('\n') if len(p) > 40]
    cleaned =  ' '.join(paras)
    words = cleaned.split()
    words = [word for word in words if len(word) > 2]
    if len(words) > 10:
        return ' '.join(words)
    else:
        return ''

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(
    min_df = 3,
    max_df = 1.0,
    max_features= None
)
word_vecs = vec.fit_transform((clean_mda(item['mda']) for item in data))

vocab = {token: n for n, token in enumerate(pd.Series(vec.vocabulary_).sort_values().index)}
inv_vocab = {v: k for k, v in vocab.items()}

word_sims =  cosine_similarity(word_vecs)
# Lots of word similarities are all zeros- so we'll ignore
# add code here to remove rows of word_sims where all the elements are zero


In [4]:
# calculate the returns correlation and the cosine similarities as above
word_cor_wo_zero = df_wo_zero.stack()#  calcuate the word similarities in the right shape
word_cor_wo_zero.name = 'word_similarity'
all_data = rets_cor.join(word_cor_wo_zero)
all_data = all_data.dropna()

NameError: name 'df_wo_zero' is not defined

In [5]:
# inspect your data- make some histograms
import matplotlib.pyplot as plt
plt.figure(figsize=(12,7))
all_data.returns_correlation.hist(bins=40)
plt.title('Returns Correlation')
plt.figure(figsize=(12,7))
all_data.word_similarity.hist(bins=40)
plt.title('Word Similarity')

NameError: name 'all_data' is not defined

### Cleaning our data
### It seems lots of things are identically 0 (no word overlap) or identically 1 (the MD&A section for one company perfectly overlaps itself). We will exclude those

In [None]:
# your code here
all_data = all_data[(all_data['word_similarity'] < 1) & (all_data['word_similarity'] > 0)]

In [None]:
# examine histograms again
plt.figure(figsize=(12,7))
all_data.returns_correlation.hist(bins=40)
plt.title('Returns Correlation')
plt.figure(figsize=(12,7))
all_data.word_similarity.hist(bins=40)
plt.title('Word Similarity')

In [None]:
all_data.corr()

### Lastly, there is a bit of a "hump" at low `word_similarity`

In [None]:
# add code here
all_data = all_data[(all_data['word_similarity'] > 0.6)]
all_data.corr()

### The contemperaneous correlation is twice as large!

## Part 6: Now, repeat the exercise of predicting future returns correlation

In [None]:
data_df = np.array(all_data)# Add code here
# add code here
feature_cols = data_df[:,1:]
target_col = data_df[:,0]

In [None]:
reg = linear_model.LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(feature_cols,target_col,test_size=0.4, random_state=0)

reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
#pd.Series(reg.coef_, index=feature_cols)

### This is about 5 times better than before!
## $ \\ $ 
## Part 7: What will happen if we include last year's returns correlation as a feature

In [None]:
rets = prices.pct_change()
last_year_corr =  rets.loc['2015-01-01':'2016-01-01'].corr().stack().to_frame(name = 'last_year_returns_correlation')
data_df = last_year_corr.join(all_data)
data_df = data_df.dropna()
data_df.head()

In [None]:
data_df = np.array(data_df)
feature_cols = data_df[:, [0,2]]
target_col = data_df[:, 1]

X_train, X_test, y_train, y_test = train_test_split(feature_cols, target_col, test_size=0.33, random_state=42)
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
reg.score(X_test, y_test)

In [None]:
reg.coef_

In [None]:
#pd.Series(reg.coef_, index=feature_cols)

## Indeed, we do much better, but the word features still help!