In [214]:
import pandas as pd
import numpy as np
from newsapi import NewsApiClient
from datetime import datetime, timedelta
from textblob import classifiers
from textblob import TextBlob
from nltk.classify.scikitlearn import SklearnClassifier
import nltk
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [58]:
key = 'e55d16a9babf40c586b6275e9b852f23'
news = NewsApiClient(api_key=key)

ARTICLE_COUNT = 5
# finds the relevant news articles for a company on a day
# current_date should be in form "YYYY-MM-DAY"
def parse_articles_on_date(company_name, current_date):
    # this calculates the day one week ago
    DAYS_BACK = 7
    year, month, day = tuple(current_date.split('-'))
    new_date = datetime(int(year), int(month), int(day)) - timedelta(DAYS_BACK)
    start_day = str(new_date)[:10]
    
    company_data = []
    query = company_name + " Stock News"
    all_articles = news.get_everything(q=query, from_param=start_day,to=current_date, language='en', sort_by='relevancy')
    article_data = all_articles['articles']
    for i in range(ARTICLE_COUNT):
        try:
            titlePlusSummary = article_data[i]['title'] + '. ' + article_data[i]['description']
            company_data.append(titlePlusSummary)
        except:
            break
    return company_data

# takes data found from parse_articles_on_date and determines the score using the classifier
def calculate_score(classifer, company_data):
    count = 0
    output = 0
    for summary in company_data:
        try:
            output += classifier.prob_classify(summary).prob(1)
            count += 1
        except:
            continue
    if count > 0:
        return output / count
    else:
        return 0

# a wrapper on the previous two functions
def article_scorer_wrapper(classifier, company_name, current_date):
    company_data = parse_articles_on_date(company_name, current_date)
    return calculate_score(classifier, company_data)

In [59]:
def create_close_open_ratio(df):
    df["Close-Open Ratio"] = (df['close'].values-df['open'].values)/df['open'].values
    ratios = df["Close-Open Ratio"].values
    ratios += np.abs(np.min(ratios))
    ratios /= np.max(ratios, axis=0)
    df["Close-Open Ratio"] = ratios
    return df

Unnamed: 0,timestamp,open,high,low,close,volume,Close-Open Ratio
0,2019-02-15,206.46,208.97,206.0,208.86,2000378,0.515954
1,2019-02-14,206.59,207.12,204.05,204.93,2229753,0.391018
2,2019-02-13,207.09,210.4,206.59,209.72,2967332,0.522787
3,2019-02-12,202.87,206.79,202.19,206.57,2622973,0.557984
4,2019-02-11,200.93,201.2,199.64,200.91,1513457,0.441448


In [60]:
def create_sentiment_dataset(df, company_name):
    for i in range(len(df)):
        date = str(df['timestamp'][i])[:10]
        # the API won't let me pull more than a month back, so this is an ad-hoc fix
        if date == '2019-01-23':
            break
        articles = parse_articles_on_date(company_name=company_name, current_date=date)
        toAdd = pd.DataFrame({'statement': articles,
                              'close-open ratio': df['Close-Open Ratio'][i]})
        if (i == 0):
            df_NLP = toAdd
        else:
            df_NLP = df_NLP.append(toAdd, ignore_index=True)
            
    return df_NLP

In [61]:
company_list = ['MMM', 'BAYZF', 'SYF', 'HON', 'BHC', 'WFC', 'JPM', 'ALV', 'PFE', 'JNJ']
dateColumn = "timestamp"
first = True
for company in company_list:
    PATH = 'data/{}-TIME_SERIES_DAILY.csv'.format(company)
    df_new = pd.read_csv(PATH, low_memory=False, parse_dates=[dateColumn])
    df_new = create_close_open_ratio(df_new)
    df_new = create_sentiment_dataset(df_new, company)
    if first:
        first = False
        df_all = df_new
    else:
        df_all = df_all.append(df_new, ignore_index=True)
    print(company + ' done')

df_all

MMM done
BAYZF done
SYF done
HON done
BHC done
WFC done
JPM done
ALV done
PFE done
JNJ done


Unnamed: 0,statement,close-open ratio
0,Parker-Hannifin: Long-Term Opportunity At The ...,0.515954
1,This Is What Hedge Funds Bought And Sold In Th...,0.515954
2,The Best Dividend Stocks You Can Buy Today. Af...,0.515954
3,Kentucky Retirement Systems Insurance Trust Fu...,0.515954
4,Parker-Hannifin: Long-Term Opportunity At The ...,0.391018
5,Atalanta Sosnoff Capital Maintains Position in...,0.391018
6,The Best Dividend Stocks You Can Buy Today. Af...,0.391018
7,Kentucky Retirement Systems Insurance Trust Fu...,0.391018
8,Parker-Hannifin: Long-Term Opportunity At The ...,0.522787
9,Atalanta Sosnoff Capital Maintains Position in...,0.522787


In [195]:
# works for TextBlob classifiers
values = df_all['close-open ratio'].values
values = values.astype(np.float)
values = np.around(values)
statements = df_all['statement'].values
train_data_full = list(zip(statements, values))
train_data = train_data_full[:500]

In [211]:
train_data = [({'words': statements[i]}, values[i]) for i in range(500)]
test_data = [({'words': statements[i]}, values[i]) for i in range(500, len(df_all))]

In [212]:
print('training NLP classifier...')
classifier = classifiers.NaiveBayesClassifier(train_data)
classifier.train()
print('done')

training NLP classifier...
done


In [222]:
print('training NLP classifier...')
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_data)
print("MultinomialNB accuracy percent:", nltk.classify.accuracy(MNB_classifier, test_data))
BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(train_data)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, test_data))

training NLP classifier...
MultinomialNB accuracy percent: 0.3181818181818182
BernoulliNB accuracy percent: 0.3181818181818182


In [223]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_data)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, test_data))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_data)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, test_data))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(train_data)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, test_data))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(train_data)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, test_data))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(train_data)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, tes_set))*100)

In [217]:
print('classifying...')
prob_dist = classifier.classify("an")
print('result')
print(prob_dist)

classifying...
result
1.0


In [218]:
blob = TextBlob("Stocks have terrible fall down today.", classifier=classifier)
blob.classify()

1.0

In [219]:
classifier.accuracy(test_data)

0.375