In [281]:
#source: https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
import re
from textblob import TextBlob

def clean_tweet(tweet): 
        ''' 
        Utility function to clean tweet text by removing links, special characters 
        using simple regex statements. 
        '''
        return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) |(\w+:\/\/\S+)", " ", tweet).split()) 

def get_polarity(tweet):
    analysis = TextBlob(tweet)
    polarity = analysis.sentiment.polarity
    return polarity

In [282]:
import json
import pandas as pd
from datetime import datetime

files = ["CSIQ.json", "FSLR.json", "VWSYF.json"]
cols = ["Date", "Sentiment"]
csiq_twt = pd.DataFrame(columns=cols)
fslr_twt = pd.DataFrame(columns=cols)
vwsyf_twt = pd.DataFrame(columns=cols)

for file in files:
    with open("./twitter-data/"+file, encoding="utf-8") as f:
        data = json.load(f)
        for time_stamp in data:
            analysis = None
            tweet = clean_tweet(data[time_stamp])
            if file == "CSIQ.json":
                if "canadian solar" in tweet.lower() or "csiq" in tweet.lower():
                    polarity = get_polarity(tweet)
                    date = datetime.utcfromtimestamp(int(time_stamp)/1000).strftime('%Y-%m-%d')
                    to_add = pd.DataFrame([[date, polarity]], columns=cols)
                    csiq_twt = csiq_twt.append(to_add)
                else:
                    continue
            elif file == "FSLR.json":
                if "first solar" in tweet.lower() or "fslr" in tweet.lower():
                    polarity = get_polarity(tweet)
                    date = datetime.utcfromtimestamp(int(time_stamp)/1000).strftime('%Y-%m-%d')
                    to_add = pd.DataFrame([[date, polarity]], columns=cols)
                    fslr_twt = fslr_twt.append(to_add)
                else:
                    continue
            elif file == "VWSYF.json":
                if "vestas wind systems" in tweet.lower() or "vwsyf" in tweet.lower():
                    polarity = get_polarity(tweet)
                    date = datetime.utcfromtimestamp(int(time_stamp)/1000).strftime('%Y-%m-%d')
                    to_add = pd.DataFrame([[date, polarity]], columns=cols)
                    vwsyf_twt = vwsyf_twt.append(to_add)
                else:
                    continue

In [283]:
csiq_twt = csiq_twt.groupby(['Date']).mean()
csiq_sentiment = csiq_twt.Sentiment
csiq_sentiment.head()

Date
2015-04-28    0.044444
2015-04-29    0.650000
2015-05-07    0.000000
2015-05-08    0.000000
2015-05-12    0.170833
Name: Sentiment, dtype: float64

In [284]:
fslr_twt = fslr_twt.groupby(['Date']).mean()
fslr_sentiment = fslr_twt.Sentiment
fslr_sentiment.head()

Date
2015-04-28    0.391111
2015-04-29    0.125000
2015-04-30    0.187500
2015-05-01    0.218750
2015-05-05    0.250000
Name: Sentiment, dtype: float64

In [285]:
vwsyf_twt = vwsyf_twt.groupby(['Date']).mean()
vwsyf_sentiment = vwsyf_twt.Sentiment
vwsyf_sentiment.head()

Date
2015-05-11    0.000000
2015-08-05   -0.081818
2015-08-06    0.136364
2015-08-09    0.136364
2016-02-24    0.000000
Name: Sentiment, dtype: float64

In [286]:
"""
this function takes in a DataFrame object contain yahoo finance data of a stock and computes the change in stock price
for each trading day. returns a series object containing these changes in prices with the date as the index
"""
def get_change(prices):
    close = prices.Close
    close.index = prices.Date
    N = close.shape[0]
    change = pd.Series(close[1:N].values - close[:N-1].values, index=close.index[1:])
    return change

In [287]:
csiq_prices = pd.read_csv("./yahoo-data/CSIQ.csv")
csiq_change = get_change(csiq_prices)
csiq_change.head()

Date
2015-04-28    2.079998
2015-04-29   -0.610001
2015-04-30   -1.719997
2015-05-01    0.859996
2015-05-04    0.180001
dtype: float64

In [288]:
fslr_prices = pd.read_csv("./yahoo-data/FSLR.csv")
fslr_change = get_change(fslr_prices)
fslr_change.head()

Date
2015-04-28    0.650002
2015-04-29   -1.630001
2015-04-30   -2.270001
2015-05-01   -2.329998
2015-05-04   -0.560001
dtype: float64

In [289]:
vwsyf_prices = pd.read_csv("./yahoo-data/VWSYF.csv")
vwsyf_change = get_change(vwsyf_prices)
vwsyf_change.head()

Date
2015-04-28    0.020000
2015-04-29   -0.189998
2015-04-30    0.000000
2015-05-01    0.599998
2015-05-04    1.320000
dtype: float64

In [290]:
tan_prices = pd.read_csv("./yahoo-data/TAN.csv")
tan_change = get_change(tan_prices)
tan_change.head()

Date
2015-04-28    0.459999
2015-04-29   -0.740002
2015-04-30   -1.049999
2015-05-01    0.490002
2015-05-04   -0.070004
dtype: float64

In [294]:
qcln_prices = pd.read_csv("./yahoo-data/QCLN.csv")
qcln_change = get_change(qcln_prices)
qcln_change.head()

Date
2015-04-28    0.170000
2015-04-29   -0.139999
2015-04-30   -0.400002
2015-05-01    0.120001
2015-05-04    0.129999
dtype: float64

In [292]:
"""
This function takes in a series of sentiment values, an int window size, and a current date of observation. creates and 
returns a series containing the previous window size days of sentiment values from the current date. this will be used to 
predict the sentiment value of the current date
"""
def get_sent_range(series, window, curr_date, debug=False):
    curr_sent = 0
    for sentiment in series.items():
        if debug:
            print(sentiment[0], curr_date)
            print(curr_sent)
        if sentiment[0] >= curr_date:
            break
        else:
            curr_sent += 1
    
    if curr_sent < window:
        sent_range = series[0:curr_sent]
    else:
        sent_range = series[curr_sent-window:curr_sent]
    if debug:
        print(sent_range)
    return sent_range

In [567]:
#use previous "window" days to predict the current days sentiment and stock change for ETFs TAN and QCLN
window = 3
curr_stock = 0
dates = csiq_change.index
final_dict = {}
final_dict["CSIQ_past_change"] = []
final_dict["FSLR_past_change"] = []
final_dict["VWSYF_past_change"] = []
final_dict["QCLN_past_change"] = []
final_dict["TAN_past_change"] = []
final_dict["past_stock_sentiment"] = []
final_dict["avg_stock_change"] = []

#insert window size dummy data so date indexes line up with number of row entries
for i in range(window):
    final_dict["CSIQ_past_change"].append("na") 
    final_dict["FSLR_past_change"].append("na") 
    final_dict["VWSYF_past_change"].append("na") 
    final_dict["QCLN_past_change"].append("na") 
    final_dict["TAN_past_change"].append("na") 
    final_dict["past_stock_sentiment"].append("na") 
    final_dict["avg_stock_change"].append("na")

for i in range(window, len(dates)):
    stock_day = dates[i]
    
    # get series of past window size days sentiments for each stock
    csiq_sent_range = get_sent_range(csiq_sentiment, window, stock_day)
    fslr_sent_range = get_sent_range(fslr_sentiment, window, stock_day)
    vwsyf_sent_range = get_sent_range(vwsyf_sentiment, window, stock_day)
    
    # take the majority sentiment
    pos = 0
    neg = 0
    neut = 0
    sent = ""
    for el in csiq_sent_range:
        if el > 0:
            pos += 1
        elif el < 0:
            neg += 1
        else:
            neut += 1
    for el in fslr_sent_range:
        if el > 0:
            pos += 1
        elif el < 0:
            neg += 1
        else:
            neut += 1
    for el in vwsyf_sent_range:
        if el > 0:
            pos += 1
        elif el < 0:
            neg += 1
        else:
            neut += 1
    
    if pos > neg and pos > neut:
        sent = 1
    elif neg > pos and neg > neut:
        sent = -1
    #if neutral is highest, or any of the values are equal, then make sentiment neutral
    else:
        sent = 0
        
    # get series of past window size days price changes for the ETFs and average them
    tan_change_range = tan_change[i-window:i]
    qcln_change_range = qcln_change[i-window:i]
    
    tan_avg_change = tan_change_range.mean()
    qcln_avg_change = qcln_change_range.mean()
    
    # get series of past window size days pice changes for the stocks
    csiq_change_range = csiq_change[i-window:i]
    fslr_change_range = fslr_change[i-window:i]
    vwsyf_change_range = vwsyf_change[i-window:i]
    
    csiq_avg_change = csiq_change_range.mean()
    fslr_avg_change = fslr_change_range.mean()
    vwsyf_avg_change = vwsyf_change_range.mean()
    
    # average the change in stock prices
    change = 0
    change += csiq_change[stock_day]
    change += fslr_change[stock_day]
    change += vwsyf_change[stock_day]
    change /= 3
    
    final_dict["CSIQ_past_change"].append(csiq_avg_change) 
    final_dict["FSLR_past_change"].append(fslr_avg_change) 
    final_dict["VWSYF_past_change"].append(vwsyf_avg_change) 
    final_dict["QCLN_past_change"].append(qcln_avg_change) 
    final_dict["TAN_past_change"].append(tan_avg_change) 
    final_dict["past_stock_sentiment"].append(sent)
    final_dict["avg_stock_change"].append(change)
    
final_df = pd.DataFrame(data=final_dict, index=dates)
final_df = final_df[window:]
final_df.head()

Unnamed: 0_level_0,CSIQ_past_change,FSLR_past_change,VWSYF_past_change,QCLN_past_change,TAN_past_change,past_stock_sentiment,avg_stock_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2015-05-01,-0.0833333,-1.08333,-0.056666,-0.123334,-0.443334,1,-0.290001
2015-05-04,-0.490001,-2.07667,0.136667,-0.14,-0.433333,1,0.313333
2015-05-05,-0.226667,-1.72,0.639999,-0.0500007,-0.21,1,-0.16
2015-05-06,0.173332,-0.953332,0.643332,-0.0199997,-0.106667,1,-0.0466663
2015-05-07,-0.54,-0.56,1.20667,-0.116667,-0.673334,1,0.150001


In [565]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np

# regression model with just stock data

X = final_df[final_df.columns[0:5]]
y = final_df.avg_stock_change

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create linear regression object
regr = SVR(gamma='scale', C=2, epsilon=0.01)
#regr = linear_model.LinearRegression()

# Fit regression model to the training set
regr.fit(X_train, y_train)

# Apply model to the test set
y_pred_test = regr.predict(X_test)

print("R-square without sentiment = %.4f" % r2_score(y_test, y_pred_test))

R-square without sentiment = 0.0537


In [564]:
# regression model with sentiment added

# regression model with just stock data

X = final_df[final_df.columns[0:-1]]
y = final_df.avg_stock_change

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Create linear regression object
regr = SVR(gamma='scale', C=2, epsilon=0.01)
#regr = linear_model.LinearRegression()

# Fit regression model to the training set
regr.fit(X_train, y_train)

# Apply model to the test set
y_pred_test = regr.predict(X_test)

print("R-square with sentiment = %.4f" % r2_score(y_test, y_pred_test))

R-square with sentiment = 0.0860
