# Homework #2

# $ \\ $

# Problem 1: Obtain structured company data using Regex

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import re
import pandas as pd

In [3]:
wiki_df = pd.read_csv('data/kdwd_r1k_articles.csv')
wiki_df.head(2)

Unnamed: 0,page_id,page_title,page_views,intro_text
0,856,Apple Inc.,190485,Apple Inc. is an American multinational techno...
1,2386,American Airlines,40829,"American Airlines, Inc. (AA) is a major Americ..."


### -) Write a regex to find unusually capitalized terms
Sometimes product names will have unusual capitalization such as iPhone or ThinkPad. Find a list of such terms and investigate if you think some of them are products.

In [4]:
data = [i for i in wiki_df['intro_text']]
all_text = ' '.join(data)
len(all_text)

613174

In [5]:
maybe_products_ptn = r'\b[A-Z]*[a-z]+[A-Z][a-z]+\b' # find the correct regular expression here

# code here!

maybe_products_set = []
maybe_products_set = re.findall(maybe_products_ptn, all_text)
# print(len(maybe_products_set), 'terms found that are potential products')


In [6]:
# verify that we found some of the key terms
assert('iPhone' in maybe_products_set)
assert('ThinkPad' in maybe_products_set)

### -) Parse company acquisition data from plain text

We are looking to identify the following types of patterns.<br />
`'Citrix acquired Sequoia Software Corp'`<br />
`'Moody\'s was acquired by Dun & Bradstreet in 1962.'`<br />
The idea here is to look for patterns around the word 'acquire' with two valid entities on either side, and an option year at the end.<br />
<span style="color:orange">Helpful Reminder:</span> you can create non-capturing capture groups via `(?:capture this|or that)`

In [7]:
# find a way to capture, 'FedEx', 'Coca-Cola', 'Sequoia Software Corp', 'Dun & Bradstreet' and 'Moody\'s'
company_ptn = r'(?:[A-Z][a-z]+\s)?[A-Z][a-z]+(?:\-|\s\&\s|\'[s])?(?:[A-Z][a-z]+)?(?:\s[Corp]+)?'  # write pattern
maybe_companies_set = re.findall(company_ptn, all_text)
# code here!
# print(len(maybe_companies_set), 'terms found that are potential companies')

In [8]:
# verify that we found some of the key terms
assert('FedEx' in maybe_companies_set)
assert('Coca-Cola' in maybe_companies_set)
assert('Sequoia Software Corp' in maybe_companies_set)
assert('Dun & Bradstreet' in maybe_companies_set)
assert('Moody\'s' in maybe_companies_set)

In [9]:
# acquisition pattern
acquisition_ptn = r'(?:acquired|was acquired by)'

# find a way to optionally capture the year such as ' in 1962'
optional_year_ptn = r'(?: in [12][0-9]{3}\b)?'

In [10]:
# let's assemble the patterns together to a full capture pattern
full_acquisition_pattern = (
    company_ptn + r'\s+' + acquisition_ptn + r'\s+' + company_ptn + optional_year_ptn
)

In [11]:
acquisition_strings = []
for _, row in wiki_df.iterrows():
    acquisition_strings.extend(re.findall(full_acquisition_pattern, row['intro_text']))
print(len(acquisition_strings), 'potential acquisitions found.')

19 potential acquisitions found.


In [12]:
# verify that we found some of the key terms
assert('Citrix acquired Sequoia Software Corp' in acquisition_strings)
assert('Moody\'s was acquired by Dun & Bradstreet in 1962' in acquisition_strings)

### -) Question: Are there any false positives in your results? If so, how could you account for them?

There are false positives in the results. Probably we could find a database for company names, and run some matching algo to further filter the companies mentioned in our data

### -) Let's look into speed of regex matching

In [22]:
# write a simple pattern of your choice to search for in our dataset
search_ptn = r'iPhone'

In [23]:
# assemble a list of strings
doc_list = wiki_df['intro_text'].tolist()

In [24]:
%%timeit
for doc in doc_list:
    re.search(search_ptn, doc)
# evaluation loop here

1.1 ms ± 33.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Let's compile the regex and see if this increases the speed using `re.compile`

In [25]:
%%timeit
# evaluation loop here
compiled_ptn = re.compile(r'iPhone')
for doc in doc_list:
    re.search(compiled_ptn, doc)

1.21 ms ± 37.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### A note on speed: basic string operations are always faster than regex
#### show this using `'my_string' in 'other_string'`

In [26]:
%%timeit
'iPhone' in 'This is the new iPhone 13.'

47.8 ns ± 2.13 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [27]:
%%timeit
re.search('iPhone', 'This is the new iPhone 13.')

674 ns ± 42.8 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


# Problem 2: Word phrases

### In this problem we will look at methods to identify valid n-grams such as 'New York' or 'Barack Obama' while eliminating statistical flukes such as `in the` or `i write`.

### Preprocessing such as this can drastically improved embeddings since words can ngrams will often have a different meaning than the sum of its parts
### `V('united')` + `V('states')` != `V('united states')`
### `V('real')` + `V('estate')` != `V('real estate')`

In [50]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [51]:
import os
import json
import re

import numpy as np
import pandas as pd

In [52]:
wiki_df = pd.read_csv('data/kdwd_r1k_articles.csv')

### get consecutive unigrams for the 'intro_text' column of our dataset

In [53]:
# list of lists of unigrams
unigram_pattern = r'[a-z0-9]+'
corpus = [re.findall(unigram_pattern, doc.lower()) for doc in wiki_df['intro_text'].tolist()]

### The package `gensim` has a convenient wrapper to obtain statistically significant ngrams/Phrase automatically

### we need to first `pip install gensim`
### `gensim` is a useful library for anything related to word representations and embeddings. It will come up a few more times. https://radimrehurek.com/gensim/index.html

In [54]:
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

### Write some code to parse our corpus and use valid ngrams using `Phrases`

In [55]:
phrases = Phrases(corpus, min_count=1, threshold=1)

In [56]:
vocab_count_dict = {k: v for k, v in phrases.vocab.items()}

n_grams = pd.Series(vocab_count_dict)
n_grams = n_grams.sort_values(ascending=False)

In [57]:
print(n_grams.shape[0], 'n-grams found')
n_grams.head(10)

60689 n-grams found


the            4873
and            4173
in             3706
of             2422
company        1884
is             1686
a              1336
to             1072
the_company     999
s               961
dtype: int64

### How do the results look? Can you improve the results by excluding common terms using the `connector_words` kwarg of `Phrases`?

In [58]:
phrases = Phrases(corpus, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)

In [59]:
vocab_count_dict = {k: v for k, v in phrases.vocab.items()}

n_grams = pd.Series(vocab_count_dict)
n_grams = n_grams.sort_values(ascending=False)

In [60]:
print(n_grams.shape[0], 'n-grams found')
n_grams.head(10)

60573 n-grams found


company     1884
is          1686
s            961
as           905
it           757
its          746
was          657
american     498
inc          473
largest      453
dtype: int64

## This was convenient, but it's also a black box where many of the knobs for tuning are actually broken in the newest version. Let's try to create our own solution for finding n-grams.

### To do this, let's start by counting unigrams and bigrams within our corpus

### Tip: use Counter for easy counting. It behaves similar to a dictionary with some added functionality around counting. such as `my_counter[unknown_key]` returning `0` for all unknown keys

In [61]:
from collections import Counter

unigram_counter = Counter()
bigram_counter = Counter()
for tokens in corpus:
    # your code here
    for idx in range(len(tokens)-1):
        unigram_counter[tokens[idx]] +=1
        bigram_counter[tokens[idx] + ' ' + tokens[idx+1]] +=1
    unigram_counter[tokens[len(tokens)-1]] +=1

### Now we need to come up with a score for each bigram that helps us decide on its importance and the fact of whether it is truly a bigram or two independent unigrams.

In [62]:
# your code here
n,_ = wiki_df.shape
tokens_tf = {}
tokens_tf_idf = {}

for token in bigram_counter.keys():
    tf = 0
    df = 0
    tokens_tf[token] = []
    for doc in wiki_df['intro_text'].tolist():
        doc = doc.lower()
        tf = doc.count(token)/(len(doc.split())-1)
        if tf > 0:
            df +=1
        tokens_tf[token].append(tf)
    idf = np.log((1+n)/(1+df)) + 1
    tokens_tf_idf[token] = []
    for tf in tokens_tf[token]:
        tokens_tf_idf[token].append(tf*idf)
        
largest_tokens_tf_idf = {}
for token in bigram_counter.keys():
    largest_tokens_tf_idf[token] = max(tokens_tf_idf[token])
bigram_df = pd.DataFrame(largest_tokens_tf_idf.items(), columns=['bigram','tf-idf'])

KeyboardInterrupt: 

In [None]:
bigram_df.head(10)

Unnamed: 0,bigram,tf-idf
0,apple inc,0.034528
1,inc is,0.121065
2,is an,0.147476
3,an american,0.158072
4,american multinational,0.282122
5,multinational technology,0.077946
6,technology company,0.122653
7,company headquartered,0.192896
8,headquartered in,0.145858
9,in cupertino,0.01027


### Find ways to sort and filter your output to bigrams that make sense, such as `wells fargo`, `apple inc` or `puerto rico`

In [None]:
# your code here
filtered_bigram_df = bigram_df[bigram_df['tf-idf']>0.03]
filtered_bigram_df

Unnamed: 0,bigram,tf-idf
0,apple inc,0.034528
1,inc is,0.121065
2,is an,0.147476
3,an american,0.158072
4,american multinational,0.282122
...,...,...
50384,formerly l,0.079802
50385,l 3,0.138913
50386,3 communications,0.079802
50387,and harris,0.079802


In [None]:
filtered_bigram_df.sort_values(by = 'tf-idf').tail(10)

Unnamed: 0,bigram,tf-idf
49911,alexandria real,0.631164
49738,provides b2b,0.631164
49913,equities is,0.631164
49739,b2b it,0.631164
49912,estate equities,0.631164
29985,in hamilton,0.653733
50072,reinsurance company,0.69428
50071,a reinsurance,0.69428
50069,everest re,0.69428
19817,marathon oil,0.747124


# Problem 3: Corporate Similarity and Returns
### In this example we'll explore how to use NLP to measure corporate similarity

### In particular we will
 - ### Make word vectors for firms in order to get an NLP measure of similarity
 - ### Measure the quality of this similarity metric by predicting future co-movement of returns. 
 
## Step X: This problem uses a few concepts of basic modeling such as `sklearn.model_selection.train_test_split` and `sklearn.linear_model.LinearRegression`
## Feel free to read some of the sklearn documentation, but otherwise we will cover these concepts next class
 

# $ \\ $
## Step 0: Load the MD&A section from Form-10-K from 2016

In [None]:
with open('../../data/parsed_mda.json') as f:
    data = json.load(f)

### Next, take only the first filing for each company

In [None]:
clean = []
seen = set()
for item in data:
    if item['ticker'] in seen:
        continue
    else:
        seen.add(item['ticker'])
        clean.append(item)
data = clean
del clean

### Now load the price data for 2015-2018

In [None]:
prices = pd.read_csv(
    '../../data/sp500_prices.csv', 
    index_col=0, 
    parse_dates=True
).loc['2015-01-01':'2018-01-01']

In [None]:
prices.head()

In [None]:
data_tickers = [item['ticker'] for item in data]
assert len(data_tickers) == len(set(data_tickers)), 'non-unique tickers, this will not work'

## Step 1: clean the text
### Much of NLP boils down to doing reasonable processing on text.
### First, we'll try out very minimial processing

In [None]:
def clean_mda_simple(mda):
    return mda.lower()

In [None]:
# add import here

In [None]:
word_vecs = ...

## Step 2: Pairwise Word similarity
### Calculate the pariwise cosine similarity between word vectors
### Make the cosine similarities into a dataframe indexed/columned on ticker symbols

In [None]:
word_sims = ...

## Step 2a: Why `cosine_similarity` and not another measure?

In [None]:
# ANSWER GOES HERE

## Step 3: Wrangle the price and word data
### Our goal here is to have a dataframe which is indexed on PAIRS of tickers and has columns
 - ### `returns_correlation`: the correlation of returns for those two tickers from Jan 1 2016 to Jan 1 2017
 - ### `word_similarity`: the cosine similarity of the word vectors for the two companies' MD&A sections
 
## Tips
 - ### NB: use pct_change to calculate returns in pandas
 - ### NB: use the pandas builtin corr function to calculate correlations (we don't need anything fancy)
 - ### NB: the index of the dataframe should have two columns (the tickers)

In [None]:
# one way you might do this is
rets_cor = ... # calculate returns correlations
word_cor = #  calcuate the word similarities in the right shape

all_data = rets_cor.join(word_cor)
all_data = all_data.dropna()
all_data.head()

## Step 3a: 
 - ### What is the contemperaneous correlation of these data?
 - ### Make a scatter plot of the returns correlation and word similarities

In [None]:
all_data.corr()

In [None]:
## This should be about 12%. That's not bad, but we can do better

In [None]:
all_data.plot.scatter(x='returns_correlation', y='word_similarity')

## Part 4: Try to predict the future returns correlations
### Use OLS (`LinearRegression`) to predict `returns_correlation` from `word_similarity`. 
### What is the (contemperaneous) out of sample performance?

In [None]:
data_df =  ...
feature_cols =  ...
target_col =  ...

In [None]:
# add code here

reg = ...# add code here
reg.score(X_test, y_test)

In [None]:
pd.Series(reg.coef_, index=feature_cols)

### This is not amazing. We can do better!

## $ \\ $ 

## Part 5: Repeat, but be careful
### Here we will see if we can clean the data better

### Things to try
 - ### Look at the histograms of word similarities to see if we can "ignore" some ill-behaved data
 - ### Try limiting how greedy the `TFIDFVectorizer` is: `min_df`, `max_df`, `max_features`, etc.
 
### We will examine our data and look for things that look out of place
 - ### We will ultimately want our data to look normally distributed


In [None]:
def clean_mda(mda):
    paras = [p.lower() for p in mda.split('\n') if len(p) > 40]
    cleaned =  ' '.join(paras)
    words = cleaned.split()
    words = [word for word in words if len(word) > 2]
    if len(words) > 10:
        return ' '.join(words)
    else:
        return ''

In [None]:
vec = TfidfVectorizer(
    min_df=...
    max_df= ...
    max_features= ...
)
word_vecs = vec.fit_transform((clean_mda(item['mda']) for item in data))


word_sims =  ...
# Lots of word similarities are all zeros- so we'll ignore
# add code here to remove rows of word_sims where all the elements are zero

In [None]:
# calculate the returns correlation and the cosine similarities as above
all_data = ...

In [None]:
# inspect your data- make some histograms
plt.figure(figsize=(12,7))
all_data.returns_correlation.hist(bins=40)
plt.title('Returns Correlation')
plt.figure(figsize=(12,7))
all_data.word_similarity.hist(bins=40)
plt.title('Word Similarity')

### Cleaning our data
### It seems lots of things are identically 0 (no word overlap) or identically 1 (the MD&A section for one company perfectly overlaps itself). We will exclude those

In [None]:
# your code here

In [None]:
# examine histograms again
plt.figure(figsize=(12,7))
all_data.returns_correlation.hist(bins=40)
plt.title('Returns Correlation')
plt.figure(figsize=(12,7))
all_data.word_similarity.hist(bins=40)
plt.title('Word Similarity')

In [None]:
all_data.corr()

### Lastly, there is a bit of a "hump" at low `word_similarity`

In [None]:
# add code here
all_data.corr()

### The contemperaneous correlation is twice as large!

## Part 6: Now, repeat the exercise of predicting future returns correlation

In [None]:
data_df =  ...# Add code here
# add code here

In [None]:
reg =  ...
reg.score(X_test, y_test)

In [None]:
pd.Series(reg.coef_, index=feature_cols)

### This is about 5 times better than before!
## $ \\ $ 
## Part 7: What will happen if we include last year's returns correlation as a feature

In [None]:
last_year_corr =  ...
data_df = last_year_corr.join( ...

In [None]:
reg =  ...
reg.score(X_test, y_test)

In [None]:
pd.Series(reg.coef_, index=feature_cols)

## Indeed, we do much better, but the word features still help!