# EDA Notebook
- Table of Contents
    - Part1. Event Type
    - Part2. Text Features
    - Part3. EPS Surpriseness 
    - Part4. Label / Class Distribution

In [None]:
import json
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.preprocessing import MultiLabelBinarizer


import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas / seaborn;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply matplotlib's built-in 'ggplot' style sheet to every figure created below.
plt.style.use('ggplot')

In [None]:
# Read the notebook settings. The 'testing' flag decides whether every path
# below points at the test directory or at the real data directory.
# A context manager is used so the file handle is closed deterministically
# instead of being left open until garbage collection.
with open('../config/notebook.json', 'r') as config_file:
    notebook_config = json.load(config_file)

data_dir = '../test/' if notebook_config['testing'] else '../data/'

- **Reading in the data...**

In [None]:
# Three inputs, all resolved relative to data_dir:
#   phrases  - tab-separated, header-less phrase file; only the first 2107 rows are kept
#   unigrams - CSV of the model's unigrams
#   data     - pickled, feature-encoded frame holding every split
phrases = pd.read_csv(
    data_dir + 'financial_phrases_sample.txt',
    sep='\t',
    header=None,
).head(2107)
unigrams = pd.read_csv(data_dir + 'processed/model_unigrams.csv')

# NOTE(review): unpickling can execute arbitrary code - only load pickles
# that were produced by a trusted pipeline.
data = pd.read_pickle(data_dir + 'processed/feature_encoded_merged_data.pkl')

def select_phrases(phrases, n=2107):
    """Return the first ``n`` entries of a phrase vector.

    Parameters
    ----------
    phrases : sequence
        Any object that supports slicing (list, tuple, numpy array, ...).
    n : int, default 2107
        Number of leading entries to keep. The default preserves the
        original hard-coded cut-off, so existing one-argument calls
        (including ``Series.apply(select_phrases)``) behave as before.

    Returns
    -------
    Same type as ``phrases``
        ``phrases[:n]``; shorter inputs are returned in full.
    """
    return phrases[:n]
# Trim every row's phrase vector with select_phrases and store it as a new column.
data['top_phrases'] = data['phrase_vec'].apply(select_phrases)

# Independent copies of each partition, selected through the 'dataset' column.
# The column is added first so that all three copies carry 'top_phrases'.
train, val, test = (
    data.loc[data['dataset'] == split_name].copy()
    for split_name in ('train', 'val', 'test')
)

data.head(3)

# 1. Event Type
- **Note**: "Event type" is an important field of every 8-K report. Here we treat it as a categorical feature, where each 8-K report can have multiple event types.

In [None]:
# Binarize the multi-valued event field: fit_transform returns one column per
# class found in 'cleaned_event', and mlb.classes_ supplies the column labels.
mlb = MultiLabelBinarizer()
event_matrix = mlb.fit_transform(data['cleaned_event'])

# Reuse the source column's index so rows stay aligned with `data`.
all_events = pd.DataFrame(
    event_matrix,
    index=data['cleaned_event'].index,
    columns=mlb.classes_,
)

In [None]:
# Count reports per event type. explode() yields one row per (report, event)
# pair; the non-null 'symbol' values are then counted within each event.
# The result is ordered from the most to the least frequent event.
events = (
    data.explode('cleaned_event')
        .groupby('cleaned_event')['symbol']
        .count()
        .reset_index()
        .sort_values(by=['symbol'], ascending=False)
        .reset_index(drop=True)
        .rename(columns={'cleaned_event': 'event', 'symbol': 'count'})
)
events.head(10)

# 2. Text Features - Unigrams & Phrases from AutoPhrase

In [None]:
# Stack the per-report vectors of the training split, one row per report.
# Assumes every vector in a column has the same length - TODO confirm.
unigram_matrix = np.array([np.array(vec) for vec in train['unigram_vec'].values])
phrase_matrix = np.array([np.array(vec) for vec in train['top_phrases'].values])

# Column-wise totals over all training reports.
uni_count = unigram_matrix.sum(axis=0)
phrase_count = phrase_matrix.sum(axis=0)

# Totals divided by the number of training reports.
# NOTE(review): this is the share of reports containing a term only if the
# vectors are 0/1 indicators; with raw counts it is a mean count - confirm.
uni_percent = uni_count / len(train)
phrase_percent = phrase_count / len(train)

## 2.1 Top Unigrams

In [None]:
# Show (rows, columns). The row count has to equal the length of the
# per-unigram arrays that are attached as new columns in the following cell.
unigrams.shape

In [None]:
# Attach the training-split statistics; values are matched to rows by position.
unigrams = unigrams.assign(**{"% of 8-K's": uni_percent, "freq": uni_count})

# Most frequent unigrams first.
unigrams_freq = unigrams.sort_values('freq', ascending=False)
unigrams_freq.head(10)

In [None]:
# unigram -> frequency mapping that drives the word sizes in the cloud.
unigram_weights = {
    word: count
    for word, count in zip(unigrams_freq['unigrams'].values, unigrams_freq['freq'].values)
}

cloud = WordCloud(width=800, height=400, background_color='white')
cloud.generate_from_frequencies(unigram_weights)

# Render the cloud as an image on a large figure without axis decorations.
plt.figure(figsize=(20, 15))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

## 2.2 Top Phrases

In [None]:
# Give column 1 of the header-less phrase file a readable name and attach the
# training-split statistics; values are matched to rows by position.
phrases = (
    phrases
    .rename(columns={1: 'phrases'})
    .assign(**{"% of 8-K's": phrase_percent, 'freq': phrase_count})
)

# Keep only the display columns, highest share first, with a fresh 0..n-1 index.
phrases_freq = (
    phrases[['phrases', "% of 8-K's", "freq"]]
    .sort_values(by="% of 8-K's", ascending=False)
    .reset_index(drop=True)
)
phrases_freq.head(10)

In [None]:
# phrase -> frequency mapping that drives the word sizes in the cloud.
phrase_weights = {
    phrase: count
    for phrase, count in zip(phrases_freq['phrases'].values, phrases_freq['freq'].values)
}

cloud = WordCloud(width=800, height=400, background_color='white')
cloud.generate_from_frequencies(phrase_weights)

# Render the cloud as an image on a large figure without axis decorations.
plt.figure(figsize=(20, 15))
plt.imshow(cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

- **Summary**: From the frequency tables and the word clouds above, we can tell that the phrases from AutoPhrase and the Investopedia knowledge base are much more meaningful than the top unigrams.

# 3. EPS Surpriseness

In [None]:
surprises = data['Surprise(%)'].values

# Trim both tails so extreme surprises do not dominate the plot.
# NOTE: the 5th-95th percentile band is the central 90% of the values, despite
# the 'mid_95' in the variable names (kept because later cells rely on them).
lower, upper = np.percentile(surprises, [5, 95])
surprises_mid_95 = surprises[(surprises > lower) & (surprises < upper)]

# sns.distplot is deprecated; a density-scaled histplot with a KDE overlay is
# its documented replacement and matches the histplot calls used below.
sns.histplot(surprises_mid_95, kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Rows whose surprise lies strictly between the previously computed `lower`
# and `upper` bounds, keeping only the surprise value and its class label.
surprise_col = data['Surprise(%)']
inside_band = (surprise_col > lower) & (surprise_col < upper)
surprises_mid_95_w_label = data.loc[inside_band, ['Surprise(%)', 'target']]

In [None]:
# All target classes overlaid in one histogram, coloured by 'target'.
sns.histplot(
    data=surprises_mid_95_w_label,
    x='Surprise(%)',
    hue='target',
)

In [None]:
# DOWN vs. UP: rows labelled STAY are filtered out before plotting.
down_vs_up = surprises_mid_95_w_label.query('target != "STAY"')
sns.histplot(
    data=down_vs_up,
    x='Surprise(%)',
    hue='target',
)

In [None]:
# DOWN vs. STAY: rows labelled UP are filtered out before plotting.
down_vs_stay = surprises_mid_95_w_label.query('target != "UP"')
sns.histplot(
    data=down_vs_stay,
    x='Surprise(%)',
    hue='target',
)

In [None]:
# UP vs. STAY: rows labelled DOWN are filtered out before plotting.
up_vs_stay = surprises_mid_95_w_label.query('target != "DOWN"')
sns.histplot(
    data=up_vs_stay,
    x='Surprise(%)',
    hue='target',
)

- **Summary**: As we can see in the charts above, the EPS Surpriseness is indeed a fairly good indicator for prediction; however, note that a large number of reports also have a surprise of nearly zero.

# 4. Class / Label Distribution

## 4.1 Label distribution for three classes in different subsets

In [None]:
# Share of each target class per subset: the non-null 'symbol' count within
# each target, divided by the subset's total number of rows.
subsets = {"all_data": data, "train": train, "val": val, "test": test}

pd.DataFrame(
    data=[
        subset.groupby(subset['target']).count()['symbol'] / subset.shape[0]
        for subset in subsets.values()
    ],
    index=list(subsets),
)

## 4.2 Average price change for each target in different subsets

In [None]:
# Mean price change of each target class, one row per subset.
# The column is selected *before* aggregating: the frame also holds string and
# list columns (e.g. 'dataset', 'cleaned_event'), and since pandas 2.0
# GroupBy.mean() raises a TypeError on those instead of silently dropping them.
# NOTE(review): 'targe_price_change' looks like a typo of 'target_price_change';
# it is left unchanged because it must match the column name stored in the
# pickled frame - confirm upstream before renaming.
subsets = {"all_data": data, "train": train, "val": val, "test": test}

pd.DataFrame(
    data=[
        subset.groupby(subset['target'])['targe_price_change'].mean()
        for subset in subsets.values()
    ],
    index=list(subsets),
)

- **Summary**: As we can see from the tables above, the label distribution is relatively even and balanced across the three classes in all subsets; the average price changes for the different targets also align with our expectations (i.e. STAY has nearly zero change, UP has a positive change, and DOWN has a negative change).