### Content:
- Train text classifier on custom labels (market semantics)

#### TODO:
- filename_to_id for reuters and bloomberg

In [None]:
import os
import re
import glob
from datetime import datetime
import sys
sys.path.append("..") # Adds higher directory to python modules path for importing from src dir

import pandas as pd
import numpy as np
import tqdm
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook as tqdm

from sklearn.metrics import accuracy_score, matthews_corrcoef
from sklearn import metrics
import spacy
from spacy import displacy

%matplotlib inline
%load_ext autotime
%load_ext autoreload
%autoreload 2

### Preparing Data

In [None]:
from src.datasets import NyseSecuritiesDataset
from src.datasets import NyseStocksDataset
from src.datasets import NyseFundamentalsDataset
import src.nlp_utils as nlp_utils
import src.text_classification_utils as tc_utils

# Directory layout, relative to the notebook's working directory.
HOME = ".."
DATA_DIR = "data"
PREPROCESSED_DIR = os.path.join(HOME, DATA_DIR, "preprocessed")
NYSE_DIR = os.path.join(HOME, DATA_DIR, "nyse")

# Preprocessed news article files.
REUTERS = os.path.join(PREPROCESSED_DIR, "news_reuters.csv")
BLOOMBERG = os.path.join(PREPROCESSED_DIR, "news_bloomberg.csv")
NEWS = os.path.join(PREPROCESSED_DIR, "news.csv")

# NYSE datasets. The paths are derived from HOME/DATA_DIR (instead of being
# hard-coded) so that relocating the data directory only requires editing the
# two constants above.
stocks_ds = NyseStocksDataset(file_path=os.path.join(NYSE_DIR, "prices-split-adjusted.csv"))
stocks_ds.load()
securities_ds = NyseSecuritiesDataset(file_path=os.path.join(NYSE_DIR, "securities.csv"))
securities_ds.load()

# Original author's note on the return type: List[Tuple[symbol, name]].
# NOTE(review): the completeness-check cell at the end of the notebook indexes
# this return value with ['Ticker symbol'] — confirm the actual type.
companies = securities_ds.get_all_company_names()

In [None]:
# Load the Reuters articles, passing NEWS_ARTICLE_START as the start date
# (presumably articles before that date are dropped — confirm in tc_utils.load_news).
# Author's row offsets for a `skiprows`-based alternative:
# skiprows: 48000 -> 10-03-22, 47000 -> 10-02-22, 45400 -> 10-01-05
NEWS_ARTICLE_START = '2010-03-22'
news = tc_utils.load_news(REUTERS, start_date=NEWS_ARTICLE_START)

# NOTE(review): called without arguments, so the source of the occurrence data
# is decided inside tc_utils.get_occs_per_article — confirm it matches `news`.
occs_per_article = tc_utils.get_occs_per_article()

In [None]:
# Minimum number of occurrences (for one company) an article needs to be kept.
MIN_OCCURRENCES = 5

# Collect the article tuples with enough occurrences and, in the same pass,
# keep only those whose company (first tuple element) is available in the
# stock price dataset used for training.
rel_article_tuples = [
    article_tuple
    for article_tuple in tc_utils.get_relevant_articles(
        news, occs_per_article, securities_ds, min_occ=MIN_OCCURRENCES
    )
    if stocks_ds.is_company_available(article_tuple[0])
]
print(f'Selected {len(rel_article_tuples)} relevant article tuples')

In [None]:
# Parameters forwarded to the label helper. The Experiment 1 notes describe the
# look-back as the last 30 days before the article date.
LOOK_BACK = 30
FORECAST = 0

# One label per relevant article tuple; each tuple is unpacked into the
# helper's leading positional arguments.
# NOTE(review): the bare name `get_label` was never defined or imported in this
# notebook (NameError on a fresh kernel). It is assumed to live in
# src.text_classification_utils next to the other helpers used here — confirm.
continuous_labels = np.array([
    tc_utils.get_label(*x, stocks_ds, look_back=LOOK_BACK, forecast=FORECAST)
    for x in tqdm(rel_article_tuples)
])
print(f'Generated labels for {len(rel_article_tuples)} articles')

In [None]:
# Split the relevant article tuples into train/test inputs and labels.
# NOTE(review): only the article tuples are passed in (not `continuous_labels`),
# so the labels are presumably derived inside tc_utils.split_shuffled — confirm,
# along with the shuffle seed and the train/test ratio.
X_train, y_train, X_test, y_test = tc_utils.split_shuffled(rel_article_tuples)

# From tutorial
https://towardsdatascience.com/machine-learning-for-text-classification-using-spacy-in-python-b276b4051a49

### Experiment 1:
- Use the last 30 days up to the open of the current day (for articles published on weekends, go back to Friday)
- Articles from NYSE start 2010-03-22 to Reuters end 2012-12-31 [the untouched final test set will be 2013-01-01 to 2013-11-20, with 3901-2803=1098 articles]
- Only use title and real body (with some exceptions)
- Don't remove numbers, links, special characters from vectorizer
- Label "1": 829 samples
- Label "-1": 1017 samples
- Label "0": 957 samples
- Train: 2242 out of 2803 shuffled samples (Test: 561 samples)
- LinearSVC warns: "ConvergenceWarning: Liblinear failed to converge, increase the number of iterations."

##### Resulting metrics:
- $Accuracy=0.5$
- $MCC=0.25$
- classification_report:

                  precision    recall  f1-score   support

            -1.0       0.56      0.53      0.54       209
             0.0       0.49      0.46      0.48       198
             1.0       0.45      0.51      0.48       154

       micro avg       0.50      0.50      0.50       561
       macro avg       0.50      0.50      0.50       561
    weighted avg       0.51      0.50      0.50       561

## TODO:
- Use absolute price diff for calculating label

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Bag-of-words pipeline: text cleaning -> unigram counts -> linear SVM.
# NOTE(review): `tokenizeText` and `CleanTextTransformer` are not defined or
# imported in any cell of this notebook; they presumably belong to one of the
# src helper modules (nlp_utils / tc_utils) — confirm and qualify them.
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
clf = LinearSVC()
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# Fit on the training split. The names match what the split cell binds
# (X_train / y_train / X_test / y_test); the previous train_X / train_y /
# test_X names were never assigned anywhere.
print("Training...")
pipe.fit(X_train, y_train)

# Predict on the held-out split; `preds` is consumed by the metrics cell.
print("Testing...")
preds = pipe.predict(X_test)

# # Print 10 best words for 2 classes
# print("Top 10 features used to predict: ")
# printNMostInformative(vectorizer, clf, 10)
# # Get counts of each word
# vect_pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer)])
# print("Training #2...")
# transform = vect_pipe.fit_transform(train1, labelsTrain1)
# vocab = vectorizer.get_feature_names()
# for i in range(len(train1)):
#     s = ""
#     indexIntoVocab = transform.indices[transform.indptr[i]:transform.indptr[i+1]]
#     numOccurences = transform.data[transform.indptr[i]:transform.indptr[i+1]]
#     for idx, num in zip(indexIntoVocab, numOccurences):
#         s += str((vocab[idx], num))

In [None]:
# Score the predictions against the held-out labels. `y_test` is the name bound
# by the train/test split cell; the previous `test_y` was never assigned.
print(f"- Accuracy: {accuracy_score(y_test, preds):.2f}")
print(f"- MCC: {matthews_corrcoef(y_test, preds):.2f}")
# Every line after the first is indented by four spaces, presumably so the
# report can be pasted into a Markdown cell as a code block.
print(metrics.classification_report(y_test, preds).replace('\n', '\n    '))

## Inspect completeness of NYSE data

In [None]:
# Compare which ticker symbols appear in the fundamentals, securities and
# prices files, and list the symbols missing from each pairing.
# NOTE(review): unlike the stocks/securities datasets, no explicit .load() is
# called here — presumably .data() loads on demand; confirm.
fundamentals_ds = NyseFundamentalsDataset(file_path='../data/nyse/fundamentals.csv')
fund_data = fundamentals_ds.data()

# Unique ticker symbols per source.
fund_symbols = set(fund_data['Ticker Symbol'].unique())
# NOTE(review): this indexes get_all_company_names() by column name, while the
# setup cell annotates the same call as returning List[Tuple[symbol, name]] —
# confirm which one is correct.
securities_symbols = set(securities_ds.get_all_company_names()['Ticker symbol'].values)
prices = pd.read_csv('../data/nyse/prices-split-adjusted.csv')
prices_symbols = set(prices['symbol'].unique())

# Pairwise set differences: symbols present in the first source but not in the second.
fund_companies_without_sec = fund_symbols.difference(securities_symbols)
fund_companies_without_prices = fund_symbols.difference(prices_symbols)
sec_comp_without_fund = securities_symbols.difference(fund_symbols)
sec_comp_without_prices = securities_symbols.difference(prices_symbols)
prices_comp_without_fund = prices_symbols.difference(fund_symbols)
prices_comp_without_sec = prices_symbols.difference(securities_symbols)

print(
    f'Fund companies: {len(fund_symbols)}, '
    f'Sec companies: {len(securities_symbols)}, '
    f'Price companies: {len(prices_symbols)}\n'
)

# Report each difference, sorted, in the same order as computed above.
symbol_gaps = {
    'fund_companies_without_sec': fund_companies_without_sec,
    'fund_companies_without_prices': fund_companies_without_prices,
    'sec_comp_without_fund': sec_comp_without_fund,
    'sec_comp_without_prices': sec_comp_without_prices,
    'prices_comp_without_fund': prices_comp_without_fund,
    'prices_comp_without_sec': prices_comp_without_sec,
}
for gap_name, gap_symbols in symbol_gaps.items():
    print(f'{gap_name}:', sorted(gap_symbols), '\n')