In [7]:
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from newsapi import NewsApiClient

In [24]:
# The NewsAPI credential is read from the NEWS_API_KEY environment variable so
# that it is never stored in the notebook itself.
key = os.environ.get('NEWS_API_KEY')
if not key:
    raise RuntimeError(
        "Set the NEWS_API_KEY environment variable to your NewsAPI key "
        "before running this notebook."
    )
news = NewsApiClient(api_key=key)

# Tracked companies, each mapped to its own initially-empty list
# (nothing in the cells shown here fills these lists).
companiesTotal = {
    name: []
    for name in ('Synchrony Financial', '3M', 'Bayer', 'Honeywell')
}

# Maximum number of articles parse_articles_on_date keeps per query.
ARTICLE_COUNT = 5
def parse_articles_on_date(company_name, current_date, days_back=7):
    """Collect "title. description" strings for a company's recent news.

    Queries ``news.get_everything`` for ``"<company_name> Company News"``
    (English, sorted by relevancy) over the window that starts ``days_back``
    days before ``current_date`` and ends on ``current_date``.

    Args:
        company_name: Company name used to build the search query.
        current_date: Last day of the search window, as a "YYYY-MM-DD" string.
        days_back: Length of the look-back window in days (default 7).

    Returns:
        A list of "title. description" strings built from at most the first
        ``ARTICLE_COUNT`` articles in the response. Articles whose title or
        description is missing are skipped rather than ending the scan.

    Raises:
        ValueError: If ``current_date`` is not a valid "YYYY-MM-DD" date.
    """
    end_day = datetime.strptime(current_date, '%Y-%m-%d').date()
    start_day = (end_day - timedelta(days=days_back)).isoformat()

    query = company_name + " Company News"
    all_articles = news.get_everything(q=query, from_param=start_day, to=current_date,
                                       language='en', sort_by='relevancy')
    article_data = all_articles.get('articles') or []

    company_data = []
    # Slicing copes with responses that hold fewer than ARTICLE_COUNT articles.
    for article in article_data[:ARTICLE_COUNT]:
        title = article.get('title')
        description = article.get('description')
        # A missing field only disqualifies this one article, not the rest.
        if title is None or description is None:
            continue
        company_data.append(title + '. ' + description)
    return company_data

def calculate_score(classifer, company_data):
    """Average the classifier's label-1 probability over a list of summaries.

    Args:
        classifer: Object exposing ``prob_classify(text)`` whose result has a
            ``prob(label)`` method. The parameter keeps its existing spelling
            so that keyword callers continue to work.
        company_data: Iterable of summary strings, e.g. the output of
            ``parse_articles_on_date``.

    Returns:
        The mean of ``prob(1)`` over the summaries that could be classified,
        or 0 when none could be (including when ``company_data`` is empty).
        Label ``1`` is presumably the positive class; confirm against the
        classifier's training labels.
    """
    probabilities = []
    for summary in company_data:
        try:
            # Use the classifier that was passed in, not a same-named global.
            probabilities.append(classifer.prob_classify(summary).prob(1))
        except Exception:
            # Best effort: a summary the classifier rejects is left out of the mean.
            continue
    if not probabilities:
        return 0
    return sum(probabilities) / len(probabilities)

def article_scorer_wrapper(classifier, company_name, current_date):
    """Fetch a company's articles for a date and return their classifier score.

    Chains ``parse_articles_on_date`` into ``calculate_score``: the strings
    fetched for ``company_name`` and ``current_date`` are scored with
    ``classifier`` and that score is returned.
    """
    return calculate_score(
        classifier,
        parse_articles_on_date(company_name, current_date),
    )

In [46]:
def create_close_open_ratio(df):
    """Add a min-max scaled "Close-Open Ratio" column to a price frame.

    The raw ratio is ``(close - open) / open`` per row. It is then rescaled so
    that the smallest ratio in the frame maps to 0 and the largest to 1.

    Args:
        df: DataFrame with numeric ``open`` and ``close`` columns.

    Returns:
        The same ``df`` object, with the "Close-Open Ratio" column added or
        overwritten in place.

    Notes:
        * When every row has the same ratio the column is all zeros, since
          there is no range to scale by.
        * An empty frame gets an empty column.
        * NaN ratios propagate to the whole column.
    """
    open_prices = df['open'].values.astype(float)
    close_prices = df['close'].values.astype(float)
    ratios = (close_prices - open_prices) / open_prices

    if ratios.size:
        lowest = np.min(ratios)
        span = np.max(ratios) - lowest
        # Subtract the minimum (not add its magnitude) so the scale starts at 0
        # whether the minimum is negative or positive.
        ratios = ratios - lowest
        if span != 0:
            ratios = ratios / span

    df["Close-Open Ratio"] = ratios
    return df

In [47]:
# Location of the MMM daily time-series CSV and the column parsed as datetimes.
PATH = 'data/MMM-TIME_SERIES_DAILY.csv'
dateColumn = "timestamp"

df_MMM = pd.read_csv(PATH, parse_dates=[dateColumn], low_memory=False)
# Attach the "Close-Open Ratio" column.
df_MMM = df_MMM.pipe(create_close_open_ratio)

In [48]:
# Preview the first rows to check that the "Close-Open Ratio" column is present.
df_MMM.head()

Unnamed: 0,timestamp,open,high,low,close,volume,Close-Open Ratio
0,2019-02-15,206.46,208.97,206.0,208.86,2000378,0.515954
1,2019-02-14,206.59,207.12,204.05,204.93,2229753,0.391018
2,2019-02-13,207.09,210.4,206.59,209.72,2967332,0.522787
3,2019-02-12,202.87,206.79,202.19,206.57,2622973,0.557984
4,2019-02-11,200.93,201.2,199.64,200.91,1513457,0.441448


In [43]:
def create_sentiment_dataset(df, company_name, stop_date='2019-01-23'):
    """Pair each day's news statements with that day's close-open ratio.

    Walks ``df`` in row order. For each row it fetches statements with
    ``parse_articles_on_date`` and emits one output row per statement, tagged
    with the row's "Close-Open Ratio".

    Args:
        df: DataFrame with ``timestamp`` and ``Close-Open Ratio`` columns.
        company_name: Company name forwarded to ``parse_articles_on_date``.
        stop_date: "YYYY-MM-DD" date at which iteration stops; that row and
            all later rows are not processed. The default is an ad-hoc cutoff
            because the news API would not return articles more than about a
            month old when this was written.

    Returns:
        A DataFrame with columns ``statement`` and ``close-open ratio`` and a
        fresh 0..n-1 index. It has zero rows if no row was processed or no
        statements were found.
    """
    columns = ['statement', 'close-open ratio']
    records = []
    # zip over the columns is positional, so any index on df works.
    for timestamp, ratio in zip(df['timestamp'], df['Close-Open Ratio']):
        date = str(timestamp)[:10]
        if date == stop_date:
            break
        articles = parse_articles_on_date(company_name=company_name, current_date=date)
        records.extend(
            {'statement': statement, 'close-open ratio': ratio}
            for statement in articles
        )
    # One construction at the end instead of appending frame-by-frame.
    return pd.DataFrame(records, columns=columns)

In [44]:
# Build the statement / close-open-ratio dataset for 3M; this queries the news API once per processed row of df_MMM.
df_NLP = create_sentiment_dataset(df_MMM, "3M")


2019-02-15
2019-02-14
2019-02-13
2019-02-12
2019-02-11
2019-02-08
2019-02-07
2019-02-06
2019-02-05
2019-02-04
2019-02-01
2019-01-31
2019-01-30
2019-01-29
2019-01-28
2019-01-25
2019-01-24
2019-01-23


Unnamed: 0,statement,close-open ratio
0,EPA Announced A Plan For Toxic Chemicals In Dr...,0.515954
1,Deep Dive: These stocks rose the most as Washi...,0.515954
2,Maria Ressa arrest: everything you need to kno...,0.515954
3,"Wall Street rises on trade hopes, deal to avoi...",0.515954
4,"Stocks in the news: Hindalco, Axis Bank, Max I...",0.515954


In [45]:
# Display the assembled dataset (one row per fetched statement).
df_NLP

Unnamed: 0,statement,close-open ratio
0,EPA Announced A Plan For Toxic Chemicals In Dr...,0.515954
1,Deep Dive: These stocks rose the most as Washi...,0.515954
2,Maria Ressa arrest: everything you need to kno...,0.515954
3,"Wall Street rises on trade hopes, deal to avoi...",0.515954
4,"Stocks in the news: Hindalco, Axis Bank, Max I...",0.515954
5,EPA Announced A Plan For Toxic Chemicals In Dr...,0.391018
6,Deaths put e-scooters in spotlight. A watchdog...,0.391018
7,Deep Dive: These stocks rose the most as Washi...,0.391018
8,"Morning Brew claims 1 million subscribers, $3m...",0.391018
9,Maria Ressa arrest: everything you need to kno...,0.391018
