# Machine learning headline analysis to predict daily stock movement
## Goal
- When should we buy and sell stocks for profit

## Stratergy
- What links are there between social media / news and stock movements?
- Sentimate analysis of social media / news
- Map Sentimate analysis to stock movement
- Highlight stocks with high correlation
- Train prediction models on those stocks

## Approaches
1. Bag-of-words machine learning model trained on historical data
2. Spark clustering/continues learning trained on historical data
3. mix of sources (reddit, twitter, abc)
4. combination of headlines and traditional algorithms

In [1]:
def evaluate(y_test, pred):
    print('---------------------------------------')
    print("Accuracy: ", metrics.accuracy_score(y_test, pred))
    
    #confusion matrix
    print("Confussion Matrix: ")
    print(metrics.confusion_matrix(y_test, pred))
    
    #Precision, recall, f-measures
    print("Precision, recall, f-measures: ")
    print(metrics.classification_report(y_test, pred))
    
    #Balanced accuracy
    print("Balanced accuracy: ", metrics.balanced_accuracy_score(y_test, pred))
    print('---------------------------------------')

## Data Preprocessing

In [2]:
#Data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Bag of words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

#Machine learning
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

#Evaluation
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV

In [10]:
#historical abc new headlines
abc = pd.read_csv('abcnews-date-text.csv')

#convert dates to match
abc['publish_date'] = pd.to_datetime(abc['publish_date'], format='%Y%m%d')
abc.rename(columns = {'publish_date':'Date'}, inplace=True)

#combine all headlines on the same day
abc = abc.groupby(['Date'])['headline_text'].apply(' '.join)

In [12]:
tickers = pd.read_excel('Tickers.xlsx')

#filtered_tickers = tickers[tickers['Country'] == 'Australia']
filtered_tickers = tickers

count = 0

best_score = 0

pool = Pool(processes=12)
pool.map(f, (ticker in filtered_tickers['Ticker']))

for ticker in filtered_tickers['Ticker']:
    #1. process stock
    stock = pd.read_csv('Data/Data/'+ticker+'/'+ticker+'.csv')
    
    if stock.empty:
        continue
        
    stock = stock.dropna()
    
    if stock.shape[0] < 100:
        continue
    
    #format
    stock['Date'] = pd.to_datetime(stock['Date'])
    
    #attach label
    stock['Label'] = (stock['Close'] < stock.shift(periods=-1)['Open']).astype(int)
    stock.drop(stock.tail(1).index,inplace=True)
    
    stock_label_count = stock["Label"].value_counts()
    
    #ignor unblanaced stocks
    if (abs(stock_label_count[0] - stock_label_count[1]) / stock_label_count[0]) > 0.2:
        continue
        
    #attach features for each day or drop days with missing features
    df = pd.merge(abc, stock[['Date','Label']], on='Date')
    
    #bag of words for headline
    count_vect = CountVectorizer(ngram_range=(1, 1))
    data_counts = count_vect.fit_transform(
            df['headline_text'])

    tf_transformer = TfidfTransformer(use_idf=True, norm='l2').fit(data_counts)
    data_tf = tf_transformer.transform(data_counts)
    
    X_train, X_test, y_train, y_test = train_test_split(
        data_tf,
        df['Label'],
        random_state = 42,
        stratify = df['Label'])
    
    model = MultinomialNB(alpha=0.1)

    model.fit(X_train, y_train)
    
    model_predict = model.predict(X_test)
    
    #evaluate(model_predict, y_test)
    score = metrics.balanced_accuracy_score(y_test, model_predict)
    
    if best_score < score:
        best_score = score
    
    print(count)
    
    count = count+1
    
    if count > 20:
        break

print(best_score)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
0.5253936722273945


## Hyper parameter tuning

In [None]:
"""
X_train, X_test, y_train, y_test = train_test_split(
        df['headline_text'],
        df['Label'],
        random_state = 42,
        stratify = df['Label'])

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2]
}

grid_search = GridSearchCV(text_clf, parameters, cv=10, n_jobs=-1, verbose=1)

grid_search.fit(X_train, y_train)

print('Best Score: ', grid_search.best_score_) 
print('Best Params: ', grid_search.best_params_)


loss = ['hinge', 'log', 'modified_huber', 'squared_hinge','perceptron']
penalty = ['l1', 'l2', 'elasticnet'] 
alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000] 
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive'] 
class_weight = [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]
eta0 = [1, 10, 100] 

param_distributions = dict(loss=loss,
                           penalty=penalty,
                           alpha=alpha, 
                           learning_rate=learning_rate, 
                           class_weight=class_weight, 
                           eta0=eta0) 

random = RandomizedSearchCV(estimator=sgd,
                            param_distributions=param_distributions,
                            scoring='roc_auc',
                            verbose=1, n_jobs=-1, 
                            n_iter=100) 

random.fit(X_train, y_train)

print('Best Score: ', random.best_score_) 
print('Best Params: ', random.best_params_)
"""