### Content:
- Train text classifier on custom labels (market semantics)

#### TODO:
- filename_to_id for reuters and bloomberg

In [5]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn import metrics
import spacy
from spacy import displacy

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 250 ms


### Load Data

In [6]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
import src.nlp_utils as nlp_utils
import src.text_classification_utils as tc_utils

# Project-relative locations of the input data.
HOME = ".."
DATA_DIR = "data"
REUTERS = os.path.join(HOME, DATA_DIR, "preprocessed", "news_reuters.csv")
BLOOMBERG = os.path.join(HOME, DATA_DIR, "preprocessed", "news_bloomberg.csv")
NEWS = os.path.join(HOME, DATA_DIR, "preprocessed", "news.csv")
# The NYSE files are derived from HOME/DATA_DIR as well, so relocating the data
# directory only requires touching the two constants above.
NYSE_PRICES = os.path.join(HOME, DATA_DIR, "nyse", "prices-split-adjusted.csv")
NYSE_SECURITIES = os.path.join(HOME, DATA_DIR, "nyse", "securities.csv")

# Stock prices and security master data
stocks_ds = NyseStocksDataset(file_path=NYSE_PRICES)
stocks_ds.load()
securities_ds = NyseSecuritiesDataset(file_path=NYSE_SECURITIES)
securities_ds.load()
companies = securities_ds.get_all_company_names()  # List[Tuple[symbol, name]]

# Occurrence data per article, consumed by tc_utils.get_relevant_articles below
occs_per_article = tc_utils.get_occs_per_article()

HBox(children=(IntProgress(value=0, max=470), HTML(value='')))


time: 11.7 s


In [3]:
# Label window (in days) around an article: LOOK_BACK days before, FORECAST days after.
LOOK_BACK = 0
FORECAST = 30
# Load the preprocessed Reuters news for the training stock dataset.
# NOTE(review): presumably restricted to dates for which stocks_ds has prices within
# the window above - confirm in tc_utils.load_news_clipped.
news = tc_utils.load_news_clipped(stocks_ds, LOOK_BACK, FORECAST, REUTERS)

time: 6.22 s


##### Define final test run

In [117]:
# Data for the final evaluation: the price dataset is constructed with only_test=True and
# the Reuters news are loaded against it with look_back=0 / forecast=30.
# Both names are read as globals by final_test() below.
stocks_test_ds = NyseStocksDataset(file_path='../data/nyse/prices-split-adjusted.csv', only_test=True, load=True)
news_test = tc_utils.load_news_clipped(stocks_test_ds, look_back=0, forecast=30, file_path=REUTERS)

def final_test(pipe, look_back=0, forecast=30, epsilon_daily_label=0.01, epsilon_overall_label=0.05, min_occurrences=5):
    """Score a fitted pipeline on the test news and test stock prices.

    Reads the notebook globals ``news_test``, ``stocks_test_ds``,
    ``occs_per_article`` and ``securities_ds`` at call time.

    Args:
        pipe: Fitted estimator exposing ``predict``.
        look_back: Forwarded to ``tc_utils.get_discrete_labels``.
        forecast: Forwarded to ``tc_utils.get_discrete_labels``.
        epsilon_daily_label: Forwarded to ``tc_utils.get_discrete_labels``.
        epsilon_overall_label: Forwarded to ``tc_utils.get_discrete_labels``.
        min_occurrences: Forwarded as ``min_occ`` to
            ``tc_utils.get_relevant_articles``.

    Returns:
        Tuple ``(accuracy, mcc, predictions)``.
    """
    candidates = tc_utils.get_relevant_articles(
        news_test, occs_per_article, securities_ds, min_occ=min_occurrences)
    # Only keep tuples whose company (first element) is available in the test prices
    usable = [entry for entry in candidates
              if stocks_test_ds.is_company_available(entry[0])]

    texts = np.array([nlp_utils.get_plain_content(entry[1]) for entry in usable])
    targets = tc_utils.get_discrete_labels(
        usable, stocks_test_ds, look_back=look_back, forecast=forecast,
        epsilon_daily_label=epsilon_daily_label, epsilon_overall_label=epsilon_overall_label)
    per_class = [f'"{cls}": {sum(targets == cls)} samples; ' for cls in [1, -1, 0]]
    print('Distribution:', ''.join(per_class))

    predictions = pipe.predict(texts)

    return (
        accuracy_score(targets, predictions),
        matthews_corrcoef(targets, predictions),
        predictions,
    )

time: 165 ms


### Process Data

##### Select news with enough occurrences

In [7]:
# Minimum number of occurrences of one company for an article to count as relevant
MIN_OCCURRENCES = 5
rel_article_tuples = tc_utils.get_relevant_articles(
    news, occs_per_article, securities_ds, min_occ=MIN_OCCURRENCES)
# Keep only tuples whose company (first element) exists in the training stock prices
rel_article_tuples = [
    entry
    for entry in rel_article_tuples
    if stocks_ds.is_company_available(entry[0])
]
print(f'Selected {len(rel_article_tuples)} relevant article tuples')

Selected 2869 relevant article tuples
time: 20.5 s


##### Generate labels

In [94]:
# Thresholds and window used to derive the discrete labels
EPSILON_DAILY_LABEL = 0.01
EPSILON_OVERALL_LABEL = 0.05
LOOK_BACK = 0
FORECAST = 30
# The result is bound to `discrete_labels` because both the distribution print below
# and the training cell read that name. Previously this cell only assigned `labels`,
# so it depended on `discrete_labels` lingering in the kernel from another cell.
discrete_labels = tc_utils.get_discrete_labels(
    rel_article_tuples, stocks_ds, look_back=LOOK_BACK, forecast=FORECAST,
    epsilon_daily_label=EPSILON_DAILY_LABEL, epsilon_overall_label=EPSILON_OVERALL_LABEL)
labels = discrete_labels  # alias kept for code that still refers to `labels`
print(f'Generated labels for {len(discrete_labels)} articles')
print('Distribution:', ''.join([f'\n- Label "{cls}": {sum(discrete_labels == cls)} labels' for cls in [1, -1, 0]]))

HBox(children=(IntProgress(value=0, max=2869), HTML(value='')))

Generated labels for 2869 articles
Distribution: 
- Label "1": 853 labels
- Label "-1": 1031 labels
- Label "0": 985 labels
time: 2min 8s


##### Train pipeline

In [93]:
# Split articles and labels into a train and a test part
X_train, y_train, X_test, y_test = tc_utils.split_shuffled(
    rel_article_tuples, discrete_labels, split_after_shuffle=False)

# Pipeline stages: text cleaning -> unigram counts (at most 200 features) -> linear SVM
vectorizer = CountVectorizer(
    tokenizer=tc_utils.tokenizeText,
    ngram_range=(1, 1),
    max_features=200,
)
clf = LinearSVC(max_iter=5000)
pipe = Pipeline(steps=[
    ('cleanText', tc_utils.CleanTextTransformer()),
    ('vectorizer', vectorizer),
    ('clf', clf),
])

# Fit on the training part ...
print("Training...")
pipe.fit(X_train, y_train)
# ... and predict the held-back part
print("Testing...")
y_pred = pipe.predict(X_test)
# tc_utils.inspect_vectorizer(vectorizer, clf, X_train, y_train, X_test, y_test)

Training...




Testing...
time: 1min 29s


In [95]:
# Report accuracy and Matthews correlation coefficient for the predictions of the
# training cell (y_test / y_pred), both formatted with two decimals.
print(f"- Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print(f"- MCC: {matthews_corrcoef(y_test, y_pred):.2f}")
# The replace() indents every line of the classification report by six spaces.
# NOTE(review): `metrics` must still be the sklearn.metrics module here; the grid-test
# cell further down rebinds `metrics` to a list, after which this cell fails.
print('     ', metrics.classification_report(y_test, y_pred).replace('\n', '\n      '))

- Accuracy: 0.39
- MCC: 0.08
                    precision    recall  f1-score   support
      
              -1.0       0.38      0.52      0.44       193
               0.0       0.45      0.37      0.41       216
               1.0       0.32      0.27      0.29       165
      
         micro avg       0.39      0.39      0.39       574
         macro avg       0.39      0.38      0.38       574
      weighted avg       0.39      0.39      0.38       574
      
time: 198 ms


# Validate on final test set

In [116]:
# Relevant test-period articles whose company (first tuple element) has test prices
rel_article_tuples_test = [
    entry
    for entry in tc_utils.get_relevant_articles(news_test, occs_per_article, securities_ds)
    if stocks_test_ds.is_company_available(entry[0])
]
# Plain article text (second tuple element) as model input, discrete labels as target
X_test = np.array([nlp_utils.get_plain_content(entry[1]) for entry in rel_article_tuples_test])
y_test = tc_utils.get_discrete_labels(
    rel_article_tuples_test, stocks_test_ds, look_back=0, forecast=30)
print(
    'Distribution:',
    ''.join(f'"{cls}": {sum(y_test == cls)} samples; ' for cls in [1, -1, 0]),
)

y_pred = pipe.predict(X_test)

acc, mcc = accuracy_score(y_test, y_pred), matthews_corrcoef(y_test, y_pred)

HBox(children=(IntProgress(value=0, max=1095), HTML(value='')))

Distribution: "1": 484 samples; "-1": 295 samples; "0": 316 samples; 
time: 1min 30s


In [None]:
# final_test requires the fitted pipeline as its first positional argument;
# the remaining parameters keep their defaults.
acc, mcc, _ = final_test(pipe)

HBox(children=(IntProgress(value=0, max=1095), HTML(value='')))

# Grid Tests

In [None]:
EPSILON_DAILY_LABEL = 0.01
EPSILON_OVERALL_LABEL = 0.05
MIN_OCCURRENCES = 5  # for one company

# NOTE: `metrics` shadows the `sklearn.metrics` module imported at the top of the
# notebook. The name is kept because the plotting cell below reads it.
metrics = []  # one (time_delta, acc, mcc, test_acc, test_mcc) tuple per run
pipes = []    # the fitted pipeline of each run, in the same order as `metrics`

for time_delta in tqdm([x for x in range(-90, 91, 50) if x != 0]):
    print('-'*40, '\n', f'time_delta={time_delta}')
    # A negative delta becomes a look-back window, a positive delta a forecast window
    look_back = abs(min(time_delta, 0))
    forecast = abs(max(time_delta, 0))
    # The label arrays formerly computed here were never passed to tc_utils.run (it only
    # receives time_delta and the thresholds) and relied on the undefined names
    # `categorize_labels` / `epsilon_overall_label`, so that computation was dropped.
    pipe, acc, mcc = tc_utils.run(
        stocks_ds, securities_ds, news, occs_per_article, time_delta=time_delta,
        epsilon_daily_label=EPSILON_DAILY_LABEL, epsilon_overall_label=EPSILON_OVERALL_LABEL,
        min_occurrences=MIN_OCCURRENCES)
    # Evaluate the fitted pipeline on the test data with the same settings
    test_acc, test_mcc, _ = final_test(
        pipe, look_back=look_back, forecast=forecast,
        epsilon_daily_label=EPSILON_DAILY_LABEL, epsilon_overall_label=EPSILON_OVERALL_LABEL,
        min_occurrences=MIN_OCCURRENCES)
    metrics.append((time_delta, acc, mcc, test_acc, test_mcc))
    pipes.append(pipe)

In [None]:
# Each grid entry is a 5-tuple (time_delta, acc, mcc, test_acc, test_mcc), so the
# frame needs five column names; with only three, pandas raises a ValueError.
ax = pd.DataFrame(
    metrics, columns=['time', 'acc', 'mcc', 'test_acc', 'test_mcc']
).set_index('time').plot()
ax.set_title('All News - Text Classification Metrics');
# plt.gcf().savefig('all-news-test-classification-metrics.pdf')

In [None]:
# Re-create the test price dataset and read the combined news CSV (NEWS) directly.
# NOTE(review): this rebinds the globals `stocks_test_ds` and `news_test`, which
# final_test() reads at call time; after this cell `news_test` is the raw NEWS frame
# rather than the Reuters data loaded via tc_utils.load_news_clipped - confirm this is intended.
stocks_test_ds = NyseStocksDataset(file_path='../data/nyse/prices-split-adjusted.csv', only_test=True, load=True)
news_test = pd.read_csv(NEWS, index_col=0)

# Run tc_utils.run on the test-period data with a 30-day time delta and the same
# thresholds as above; it returns the pipeline together with accuracy and MCC.
pipe, acc, mcc = tc_utils.run(
    stocks_test_ds, securities_ds, news_test, occs_per_article, time_delta=30,
    epsilon_daily_label=0.01, epsilon_overall_label=0.05, min_occurrences=5)

# TODO:
- Plot with acc & val_acc for features from 50 to 5000D
- Show the misleading improvement caused by split_after_shuffle=True (it will fail on the test set)

Tutorial: https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49

### General Setting
- Use the last {LOOK_BACK} days up to the open of day {FORECAST} in the future (for articles published on weekends, go back to Friday)
- Articles from NYSE start 2010-03-22 to Reuters end 2012-12-31 [not touched final test set will be 2013-01-01 to 2013-11-20 with 3901-2803=1098 articles]
- Only use title and real body (with some exceptions because of regex failure)
- Don't remove numbers, links, special characters from vectorizer

### Experiment 1
- LOOK_BACK = 30
- FORECAST = 0
- EPSILON_DAILY_LABEL = 0.01
- EPSILON_OVERALL_LABEL = 0.05
- Label "1": 829 samples
- Label "-1": 1017 samples
- Label "0": 957 samples
- Train: 2242 out of 2803 shuffled samples (Test: 561 samples)
- LinearSVC warns: "ConvergenceWarning: Liblinear failed to converge, increase the number of iterations."

###### Resulting metrics:
- $Accuracy=0.5$
- $MCC=0.25$
- classification_report:
                    precision    recall  f1-score   support
      
              -1.0       0.54      0.52      0.53       209
               0.0       0.49      0.47      0.48       198
               1.0       0.46      0.50      0.48       154
      
         micro avg       0.50      0.50      0.50       561
         macro avg       0.50      0.50      0.50       561
      weighted avg       0.50      0.50      0.50       561
   
### Experiment 2:
- Tried calculating the mean of the relative diffs -> Values are very close to zero.
- Therefore stick to the previous method. Calculate daily label and take the mean label.

### Experiment 3:
- LOOK_BACK=7
- Label "1": 991 labels
- Label "-1": 1106 labels
- Label "0": 706 labels

###### Resulting metrics:
- $Accuracy: 0.50$
- $MCC: 0.23$

### Experiment 4:
- LOOK_BACK=3
- Label "1": 834 labels
- Label "-1": 920 labels
- Label "0": 1049 labels

###### Resulting metrics:
- $Accuracy: 0.41$
- $MCC: 0.11$

### Experiment 5:
- LOOK_BACK=1
- Label "1": 620 labels
- Label "-1": 653 labels
- Label "0": 1530 labels

###### Resulting metrics:
- $Accuracy: 0.47$
- $MCC: 0.12$

### Experiment 6:
- LOOK_BACK=60
- Label "1": 559 labels
- Label "-1": 835 labels
- Label "0": 1323 labels

###### Resulting metrics:
- $Accuracy: 0.56$
- $MCC: 0.30$

### Experiment 7:
- LOOK_BACK=0
- FORECAST=30
- Label "1": 853 labels
- Label "-1": 1031 labels
- Label "0": 985 labels
- 2869 samples

###### Resulting metrics:
- $Accuracy: 0.56$
- $MCC: 0.34$
              precision    recall  f1-score   support
    
            -1.0       0.64      0.65      0.64       216
             0.0       0.50      0.48      0.49       197
             1.0       0.53      0.53      0.53       161
    
       micro avg       0.56      0.56      0.56       574
       macro avg       0.56      0.56      0.56       574
    weighted avg       0.56      0.56      0.56       574
