### Content:
- Train text classifier on custom labels (market semantics)

#### TODO:
- filename_to_id for reuters and bloomberg

In [2]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn import metrics
import spacy
from spacy import displacy

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 164 ms


### Load Data

In [5]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
from src.datasets import NyseFundamentalsDataset
import src.nlp_utils as nlp_utils
import src.text_classification_utils as tc_utils

# All data locations are derived from HOME/DATA_DIR so the data folder can be
# relocated by changing a single constant.
HOME = ".."
DATA_DIR = "data"
REUTERS = os.path.join(HOME, DATA_DIR, "preprocessed", "news_reuters.csv")
BLOOMBERG = os.path.join(HOME, DATA_DIR, "preprocessed", "news_bloomberg.csv")
NEWS = os.path.join(HOME, DATA_DIR, "preprocessed", "news.csv")
NYSE_PRICES = os.path.join(HOME, DATA_DIR, "nyse", "prices-split-adjusted.csv")
NYSE_SECURITIES = os.path.join(HOME, DATA_DIR, "nyse", "securities.csv")

# Stock prices: used later to filter articles by company availability and to
# derive the labels.
stocks_ds = NyseStocksDataset(file_path=NYSE_PRICES)
stocks_ds.load()

# Security master data: used later to select the relevant articles.
securities_ds = NyseSecuritiesDataset(file_path=NYSE_SECURITIES)
securities_ds.load()

# NOTE(review): the annotation below says list of tuples, but the completeness
# check at the end of the notebook indexes this result with ['Ticker symbol']
# like a DataFrame — confirm the actual return type.
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]

HBox(children=(IntProgress(value=0, max=470), HTML(value='')))


time: 9.1 s


In [8]:
# Load the Reuters articles published on or after NEWS_ARTICLE_START.
# Reference table kept from an earlier `skiprows`-based load (presumably CSV row
# offset -> first article date as YY-MM-DD — TODO confirm):
# skiprows: 48000 -> 10-03-22, 47000 -> 10-02-22, 45400 -> 10-01-05
NEWS_ARTICLE_START = '2010-03-22'
news = tc_utils.load_news(REUTERS, start_date=NEWS_ARTICLE_START)

# NOTE(review): called without arguments, so the occurrence data is not derived
# from `news` here — confirm where it comes from (file or module state) and
# that it matches the articles loaded above.
occs_per_article = tc_utils.get_occs_per_article()

Amount of news articles: 58539
Amount after first filter: 58515
time: 7.35 s


### Process Data

In [9]:
# Threshold handed to get_relevant_articles as `min_occ` (for one company).
MIN_OCCURRENCES = 5

# Select the articles with enough occurrences and, in the same pass, keep only
# the tuples whose first element is available in the stock price dataset
# (presumably the company symbol — verify against get_relevant_articles).
rel_article_tuples = [
    article_tuple
    for article_tuple in tc_utils.get_relevant_articles(
        news, occs_per_article, securities_ds, min_occ=MIN_OCCURRENCES
    )
    if stocks_ds.is_company_available(article_tuple[0])
]
print(f'Selected {len(rel_article_tuples)} relevant article tuples')

Selected 2803 relevant article tuples
time: 17.9 s


In [41]:
# Labeling hyperparameters; the values below are the ones recorded under
# "Experiment 1" in the notes further down.
LOOK_BACK = 30  # passed as `look_back` to tc_utils.get_label (number of past days considered)
FORECAST = 0  # passed as `forecast` to tc_utils.get_label (day offset into the future)
EPSILON_DAILY_LABEL = 0.01  # passed as `epsilon` to tc_utils.get_label
EPSILON_OVERALL_LABEL = 0.05  # passed as `epsilon` to tc_utils.categorize_labels

time: 159 ms


In [52]:
# One continuous label per relevant article tuple; each tuple is unpacked into
# the leading positional arguments of get_label.
continuous_labels = np.array([tc_utils.get_label(*x, stocks_ds, look_back=LOOK_BACK,
                                                 forecast=FORECAST, epsilon=EPSILON_DAILY_LABEL)
                              for x in tqdm(rel_article_tuples)])
# Report the number of labels actually produced rather than the input size.
print(f'Generated labels for {len(continuous_labels)} articles')

# Map the continuous values onto the classes reported below (1, -1, 0).
discrete_labels = tc_utils.categorize_labels(continuous_labels, epsilon=EPSILON_OVERALL_LABEL)

# Count class members with a vectorized count instead of the builtin sum(),
# which walks the boolean mask element by element in Python. np.asarray keeps
# the elementwise comparison valid even if the labels come back as a plain list.
discrete_label_array = np.asarray(discrete_labels)
print('Distribution:', ''.join([f'\n- Label "{cls}": {np.count_nonzero(discrete_label_array == cls)} labels'
                                for cls in [1, -1, 0]]))

HBox(children=(IntProgress(value=0, max=2803), HTML(value='')))

Generated labels for 2803 articles
Distribution: 
- Label "1": 829
- Label "-1": 1017
- Label "0": 957
time: 1min 50s


In [53]:
# Split the article tuples and their discrete labels into a train and a test part.
# Note the unpacking order (X_train, y_train, X_test, y_test), which differs
# from sklearn's train_test_split convention.
# NOTE(review): no seed is passed, so the shuffle may differ between runs —
# confirm how split_shuffled seeds its shuffle before comparing experiments.
X_train, y_train, X_test, y_test = tc_utils.split_shuffled(rel_article_tuples, discrete_labels)

time: 480 ms


# From tutorial
https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49

### General Setting
- Use the last {LOOK_BACK} days until the open of the {FORECAST}-th day in the future (for articles published on weekends, go back to Friday)
- Articles from the NYSE start (2010-03-22) to the Reuters end (2012-12-31) [the untouched final test set will be 2013-01-01 to 2013-11-20, with 3901-2803=1098 articles]
- Only use title and real body (with some exceptions because of regex failure)
- Don't remove numbers, links, special characters from vectorizer

### Experiment 1
- LOOK_BACK = 30
- FORECAST = 0
- EPSILON_DAILY_LABEL = 0.01
- EPSILON_OVERALL_LABEL = 0.05
- Label "1": 829 samples
- Label "-1": 1017 samples
- Label "0": 957 samples
- Train: 2242 out of 2803 shuffled samples (Test: 561 samples)
- LinearSVC warns: "ConvergenceWarning: Liblinear failed to converge, increase the number of iterations."

##### Resulting metrics:
- $Accuracy=0.5$
- $MCC=0.25$
- classification_report:

                  precision    recall  f1-score   support

            -1.0       0.56      0.53      0.54       209
             0.0       0.49      0.46      0.48       198
             1.0       0.45      0.51      0.48       154

       micro avg       0.50      0.50      0.50       561
       macro avg       0.50      0.50      0.50       561
    weighted avg       0.51      0.50      0.50       561
   
### Experiment 2:
- Tried calculating the mean of the relative diffs -> Values are very close to zero.
- Therefore stick to the previous method. Calculate daily label and take the mean label.

## TODO:
- Different look_back, forecast, epsilon

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

time: 166 ms


In [None]:
# Bag-of-words pipeline: text cleaning -> unigram counts -> linear SVM.
vectorizer = CountVectorizer(tokenizer=tc_utils.tokenizeText, ngram_range=(1,1))
# Experiment 1 reported "ConvergenceWarning: Liblinear failed to converge" with
# the default iteration budget, so give the solver more iterations. A fixed
# random_state makes the fitted model repeatable across runs.
clf = LinearSVC(max_iter=10000, random_state=42)
pipe = Pipeline([('cleanText', tc_utils.CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# train
print("Training...")
pipe.fit(X_train, y_train)
# test
print("Testing...")
y_pred = pipe.predict(X_test)
# tc_utils.inspect_vectorizer(vectorizer, clf, X_train, y_train, X_test, y_test)

Training...


In [None]:
# Headline scores on the held-out test split, printed as markdown bullets.
accuracy = accuracy_score(y_test, y_pred)
print(f"- Accuracy: {accuracy:.2f}")

mcc = matthews_corrcoef(y_test, y_pred)
print(f"- MCC: {mcc:.2f}")

# Indent every continuation line of the per-class report by four spaces so it
# can be pasted into the markdown notes as a preformatted block.
report_text = metrics.classification_report(y_test, y_pred)
indented_report = report_text.replace('\n', '\n    ')
print(indented_report)

## Inspect completeness of NYSE data

In [None]:
# Cross-check which ticker symbols appear in each of the three NYSE files
# (fundamentals, securities, prices) and list the symbols missing from each.

# NOTE(review): unlike the stocks/securities datasets, no .load() is called
# here — confirm that data() loads the file on its own.
fundamentals_ds = NyseFundamentalsDataset(file_path='../data/nyse/fundamentals.csv')
fund_data = fundamentals_ds.data()

fund_symbols = set(fund_data['Ticker Symbol'].unique())
# NOTE(review): the loading cell annotates get_all_company_names() as
# List[Tuple[symbol, name]], yet it is indexed like a DataFrame here — confirm.
securities_symbols = set(securities_ds.get_all_company_names()['Ticker symbol'].values)
prices = pd.read_csv('../data/nyse/prices-split-adjusted.csv')
prices_symbols = set(prices.symbol.unique())

# Pairwise differences: "<a>_without_<b>" holds the symbols in a but not in b.
fund_companies_without_sec = fund_symbols.difference(securities_symbols)
fund_companies_without_prices = fund_symbols.difference(prices_symbols)
sec_comp_without_fund = securities_symbols.difference(fund_symbols)
sec_comp_without_prices = securities_symbols.difference(prices_symbols)
prices_comp_without_fund = prices_symbols.difference(fund_symbols)
prices_comp_without_sec = prices_symbols.difference(securities_symbols)

print(f'Fund companies: {len(fund_symbols)}, Sec companies: {len(securities_symbols)}, Price companies: {len(prices_symbols)}\n')

# Print every difference as a sorted list, each followed by a blank line.
missing_report = [
    ('fund_companies_without_sec', fund_companies_without_sec),
    ('fund_companies_without_prices', fund_companies_without_prices),
    ('sec_comp_without_fund', sec_comp_without_fund),
    ('sec_comp_without_prices', sec_comp_without_prices),
    ('prices_comp_without_fund', prices_comp_without_fund),
    ('prices_comp_without_sec', prices_comp_without_sec),
]
for report_name, missing_symbols in missing_report:
    print(f'{report_name}:', sorted(missing_symbols), '\n')