In [1]:
import numpy as np
import nltk 
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as st

In [2]:
# Reuters news rows. The file has no header row, so the column names are
# supplied here. error_bad_lines=False makes pandas drop malformed lines
# instead of raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines="skip"). It is kept because the rest of this
# notebook is written for Python 2 / an older pandas.
news_csv = pd.read_csv(
    "news_data/news_reuters.csv",
    error_bad_lines=False,
    header=None,
    names=["stock", "company", "date", "title", "summary", "type", "website"],
)

# GOOGL price history. The file name follows the same
# "<TICKER>_2006-01-01_to_2017-11-01.csv" pattern that every other price read
# in this notebook uses (the previous 2020/2025 range matched no other read).
google_price_csv = pd.read_csv("price_data/GOOGL_2006-01-01_to_2017-11-01.csv")

In [3]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single module-level VADER analyzer; the functions below call
# sid.polarity_scores(<summary text>) and read its 'compound', 'neg', 'neu'
# and 'pos' entries.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to be
# downloaded beforehand — confirm for fresh environments.
sid = SentimentIntensityAnalyzer()

In [4]:
# Zero-padded month number -> three-letter English month abbreviation.
number_to_month = {
    "01": "Jan", "02": "Feb", "03": "Mar", "04": "Apr",
    "05": "May", "06": "Jun", "07": "Jul", "08": "Aug",
    "09": "Sep", "10": "Oct", "11": "Nov", "12": "Dec",
}


def conv_num_to_string(d):
    """Convert a "YYYYMMDD" string to "DD-Mon-YY".

    Example: "20171101" -> "01-Nov-17".

    Only the first eight characters are read. A month part that is not a key
    of ``number_to_month`` raises ``KeyError``.
    """
    yyyy, mm, dd = d[:4], d[4:6], d[6:8]
    return "-".join((dd, number_to_month[mm], yyyy[2:4]))

In [5]:
def up_down_ratio(stock, day_lag):
    """Percentage of matched news rows whose lagged price row closed >= its open.

    Example: ``up_down_ratio("AAPL", 1)``

    For every row of ``news_csv`` belonging to ``stock``, the news date is
    converted with ``conv_num_to_string`` and looked up in the "Date" column of
    the stock's price file. The price row ``day_lag`` positions *before* the
    matching row is then inspected.
    NOTE(review): "before" is a later calendar day only if the price file is
    sorted newest-first — confirm against the data.

    Parameters
    ----------
    stock : str
        Ticker; selects news rows and the price file
        ``price_data/<stock>_2006-01-01_to_2017-11-01.csv``.
    day_lag : int
        Number of rows to step back from the matching price row.

    Returns
    -------
    float
        ``100 * ups / matches``; ``nan`` when no news row could be matched
        (previously this raised ``ZeroDivisionError``).
    """
    stock_data = news_csv[news_csv["stock"] == stock]
    stock_price_csv = pd.read_csv("price_data/"+ stock+"_2006-01-01_to_2017-11-01.csv")

    # Date string -> position of its first occurrence, built once so each
    # news row costs a dict lookup instead of two scans of the Date column.
    date_to_position = {}
    for position, date in enumerate(stock_price_csv["Date"].values):
        date_to_position.setdefault(date, position)
    n_prices = len(stock_price_csv)

    total = []
    for index, row in stock_data.iterrows():
        day = conv_num_to_string(str(row["date"]))
        if day not in date_to_position:
            continue

        target = date_to_position[day] - day_lag
        # A negative position would make iloc wrap around to the end of the
        # file and silently use an unrelated day; skip such rows instead.
        if target < 0 or target >= n_prices:
            continue

        lagged_price = stock_price_csv.iloc[target]
        diff = lagged_price["Close"] - lagged_price["Open"]
        total.append(1 if diff >= 0.0 else 0)

    if not total:
        return float("nan")
    # 100.0 forces true division so the percentage is not truncated on Python 2.
    return 100.0 * sum(total) / len(total)
   

In [6]:
def sentiment_to_price_plot(stock, day_lag, pos_or_neg):
    """Scatter-plot a VADER score of each news summary against Close - Open.

    Example: ``sentiment_to_price_plot("AAPL", 1, 'neg')``

    Each news row of ``stock`` whose converted date appears in the price file
    contributes one point: x is ``sid.polarity_scores(summary)[pos_or_neg]``,
    y is ``Close - Open`` of the price row ``day_lag`` positions before the
    matching row. The Spearman correlation of the points is printed and the
    scatter plot is shown.
    NOTE(review): stepping back ``day_lag`` rows reaches a later day only if
    the price file is sorted newest-first — confirm against the data.

    Parameters
    ----------
    stock : str
        Ticker; selects news rows and the price file
        ``price_data/<stock>_2006-01-01_to_2017-11-01.csv`` (same location the
        other price reads in this notebook use).
    day_lag : int
        Number of rows to step back from the matching price row.
    pos_or_neg : str
        Key into the VADER score dict (e.g. 'pos', 'neg').
    """
    stock_data = news_csv[news_csv["stock"] == stock]
    stock_price_csv = pd.read_csv("price_data/" + stock + "_2006-01-01_to_2017-11-01.csv")

    # Date string -> position of its first occurrence, built once.
    date_to_position = {}
    for position, date in enumerate(stock_price_csv["Date"].values):
        date_to_position.setdefault(date, position)
    n_prices = len(stock_price_csv)

    temp_x = []
    temp_y = []
    for index, row in stock_data.iterrows():
        day = conv_num_to_string(str(row["date"]))
        if day not in date_to_position:
            continue

        target = date_to_position[day] - day_lag
        # Skip rows whose lagged position falls outside the file; a negative
        # position would otherwise wrap around to the end via iloc.
        if target < 0 or target >= n_prices:
            continue

        # Score only the rows that are actually plotted.
        temp_x.append(sid.polarity_scores(row["summary"])[pos_or_neg])
        lagged_price = stock_price_csv.iloc[target]
        temp_y.append(lagged_price["Close"] - lagged_price["Open"])

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(st.spearmanr(temp_x, temp_y))
    plt.plot(temp_x, temp_y, "o")
    plt.ylabel("Closing Minus Opening after " + str(day_lag) + " Days")
    plt.xlabel(pos_or_neg + " NLTK Vader-Sentiment Score of Current Day's Summary")
    plt.title(stock + ",  " + str(day_lag) + " Day Lag,  " + pos_or_neg)
    plt.show()
    

In [17]:
def sentiment_scores_make_csv(stock, number_of_prices):
    """Write ``nltk_scores/<stock>.csv`` with VADER scores and nearby prices.

    One output row is produced per news row of ``stock`` whose converted date
    appears in the price file. Columns, in order:

    * 'compound', 'neg', 'neu', 'pos' — ``sid.polarity_scores(summary)``.
    * 'today price' — "Open" of the price row one position before the match.
    * 'y_price (next day)' — "Close" of that same row.
    * 'today-<j>price' for j = 1..number_of_prices — "Close" of the row
      ``j`` positions after the match.

    NOTE(review): 'today price' is taken from the row *before* the matching
    date, not from the matching row itself; and "before"/"after" map to
    next/previous days only if the price file is sorted newest-first —
    confirm both against the data.

    News rows whose price window would fall outside the file are skipped.
    Previously position 0 wrapped around to the last row, and a window past
    the end raised ``IndexError``.

    Parameters
    ----------
    stock : str
        Ticker; selects news rows and the price file
        ``price_data/<stock>_2006-01-01_to_2017-11-01.csv``.
    number_of_prices : int
        How many 'today-<j>price' columns to emit.
    """
    stock_data = news_csv[news_csv["stock"] == stock]
    stock_price_csv = pd.read_csv("price_data/"+ stock+"_2006-01-01_to_2017-11-01.csv")

    col = ['compound','neg','neu','pos','today price','y_price (next day)']
    col = col + [ 'today-' +str(i) + 'price' for i in range(1,number_of_prices+1)  ]

    # Date string -> position of its first occurrence, built once.
    date_to_position = {}
    for position, date in enumerate(stock_price_csv["Date"].values):
        date_to_position.setdefault(date, position)
    n_prices = len(stock_price_csv)

    # Collect rows in a list and build the frame once; appending to a
    # DataFrame row by row re-allocates on every insert.
    records = []
    for index, row in stock_data.iterrows():
        day = conv_num_to_string(str(row["date"]))
        if day not in date_to_position:
            continue

        row_index = date_to_position[day]
        # Need one row before the match and number_of_prices rows after it.
        if row_index - 1 < 0 or row_index + number_of_prices >= n_prices:
            continue

        ss = sid.polarity_scores(row["summary"])
        scores = [ss['compound'], ss['neg'], ss['neu'], ss['pos']]

        next_price = stock_price_csv.iloc[row_index - 1]
        prices = [next_price["Open"], next_price["Close"]]
        for j in range(1, number_of_prices + 1):
            prices.append(stock_price_csv.iloc[row_index + j]["Close"])

        records.append(scores + prices)

    df = pd.DataFrame(records, columns=col)
    name = "nltk_scores/" + stock +".csv"
    df.to_csv(name)
    

In [19]:
# Tickers to export; each is passed to sentiment_scores_make_csv, which reads
# price_data/<ticker>_2006-01-01_to_2017-11-01.csv for it.
stocks = ['GOOGL', 'INTC', 'AAPL', 'CSCO', 'AMD', 'QCOM', 'NVDA', 'AMZN', 'MSFT', 'IBM']

# Writes one nltk_scores/<ticker>.csv per ticker, each with 5 extra
# 'today-<j>price' columns.
for stk in stocks:
    sentiment_scores_make_csv(stk, 5)

In [95]:
def sentiment_to_volume_plot(stock, day_lag, pos_or_neg):
    """Scatter-plot a VADER score of each news summary against traded volume.

    Example: ``sentiment_to_volume_plot("AAPL", 1, 'neg')``

    Each news row of ``stock`` whose converted date appears in the price file
    contributes one point: x is ``sid.polarity_scores(summary)[pos_or_neg]``,
    y is the "Volume" of the price row ``day_lag`` positions before the
    matching row. The Spearman correlation of the points is printed and the
    scatter plot is shown.
    NOTE(review): stepping back ``day_lag`` rows reaches a later day only if
    the price file is sorted newest-first — confirm against the data.

    Parameters
    ----------
    stock : str
        Ticker; selects news rows and the price file
        ``price_data/<stock>_2006-01-01_to_2017-11-01.csv`` (same location the
        other price reads in this notebook use).
    day_lag : int
        Number of rows to step back from the matching price row.
    pos_or_neg : str
        Key into the VADER score dict (e.g. 'pos', 'neg').
    """
    stock_data = news_csv[news_csv["stock"] == stock]
    stock_price_csv = pd.read_csv("price_data/" + stock + "_2006-01-01_to_2017-11-01.csv")

    # Date string -> position of its first occurrence, built once.
    date_to_position = {}
    for position, date in enumerate(stock_price_csv["Date"].values):
        date_to_position.setdefault(date, position)
    n_prices = len(stock_price_csv)

    temp_x = []
    temp_y = []
    for index, row in stock_data.iterrows():
        day = conv_num_to_string(str(row["date"]))
        if day not in date_to_position:
            continue

        target = date_to_position[day] - day_lag
        # Skip rows whose lagged position falls outside the file; a negative
        # position would otherwise wrap around to the end via iloc.
        if target < 0 or target >= n_prices:
            continue

        # Score only the rows that are actually plotted.
        temp_x.append(sid.polarity_scores(row["summary"])[pos_or_neg])
        temp_y.append(stock_price_csv.iloc[target]["Volume"])

    # Parenthesised single-argument print works on both Python 2 and 3.
    print(st.spearmanr(temp_x, temp_y))
    plt.plot(temp_x, temp_y, "o")
    plt.ylabel("Volume after " + str(day_lag) + " Days")
    plt.xlabel(pos_or_neg + " NLTK Vader-Sentiment Score of Current Day's Summary")
    plt.title(stock + ",  " + str(day_lag) + " Day Lag,  " + pos_or_neg)
    plt.show()
    

In [6]:
def sentiment_to_price_plot_UP_DOWN(stock, day_lag, pos_or_neg):
    """Scatter-plot a VADER score of each news summary against an up/down flag.

    Example: ``sentiment_to_price_plot_UP_DOWN("AAPL", 1, 'neg')``

    Each news row of ``stock`` whose converted date appears in the price file
    contributes one point: x is ``sid.polarity_scores(summary)[pos_or_neg]``,
    y is ``1.0`` when ``Close - Open`` of the price row ``day_lag`` positions
    before the matching row is strictly positive, otherwise ``-1.0``.
    NOTE(review): an unchanged day (Close == Open) counts as down here, while
    ``up_down_ratio`` counts it as up (``>= 0.0``) — confirm which is intended.
    NOTE(review): stepping back ``day_lag`` rows reaches a later day only if
    the price file is sorted newest-first — confirm against the data.

    Parameters
    ----------
    stock : str
        Ticker; selects news rows and the price file
        ``price_data/<stock>_2006-01-01_to_2017-11-01.csv`` (same location the
        other price reads in this notebook use).
    day_lag : int
        Number of rows to step back from the matching price row.
    pos_or_neg : str
        Key into the VADER score dict (e.g. 'pos', 'neg').
    """
    stock_data = news_csv[news_csv["stock"] == stock]
    stock_price_csv = pd.read_csv("price_data/" + stock + "_2006-01-01_to_2017-11-01.csv")

    # Date string -> position of its first occurrence, built once.
    date_to_position = {}
    for position, date in enumerate(stock_price_csv["Date"].values):
        date_to_position.setdefault(date, position)
    n_prices = len(stock_price_csv)

    temp_x = []
    temp_y = []
    for index, row in stock_data.iterrows():
        day = conv_num_to_string(str(row["date"]))
        if day not in date_to_position:
            continue

        target = date_to_position[day] - day_lag
        # Skip rows whose lagged position falls outside the file; a negative
        # position would otherwise wrap around to the end via iloc.
        if target < 0 or target >= n_prices:
            continue

        # Score only the rows that are actually plotted.
        temp_x.append(sid.polarity_scores(row["summary"])[pos_or_neg])
        lagged_price = stock_price_csv.iloc[target]
        diff = lagged_price["Close"] - lagged_price["Open"]
        temp_y.append(1.0 if diff > 0 else -1.0)

    plt.plot(temp_x, temp_y, "o")
    plt.ylabel("Closing Minus Opening INCREASE OR DECREASE after " + str(day_lag) + " Days")
    plt.xlabel(pos_or_neg + " NLTK Vader-Sentiment Score of Current Day's Summary")
    plt.title(stock + ",  " + str(day_lag) + " Day Lag,  " + pos_or_neg)
    plt.show()
    