by Antonia Kraft, matr.-nr.: 11731292

In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import numpy as np
from transformers import pipeline
import csv
import math
import json
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.dates import MonthLocator
import scipy.stats as stats
import sklearn
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from mlxtend.plotting import scatterplotmatrix
from mlxtend.plotting import heatmap


In [None]:
#read in reddit data
# Load the Reddit posts from "reddit_posts.csv" (path relative to the working
# directory). The bare name on the last line only displays the frame when the
# cell is run interactively.
redditdf = pd.read_csv("reddit_posts.csv")
redditdf

In [None]:
#read in yahoo fin data
findf = pd.read_csv("Finance_Data.csv")

# Parse "Date" BEFORE sorting: sorting the raw strings is lexicographic and is
# only chronological when every value happens to be zero-padded ISO text.
findf["Date"] = pd.to_datetime(findf["Date"])
findf = findf.sort_values(by = "Date", ascending=True)

# display the chronologically ordered frame
findf

In [None]:
#prep data for regression
# Reduce every post timestamp to its calendar date, then order newest first.
redditdf["timestamp"] = pd.to_datetime(redditdf["timestamp"]).dt.date
redditdf = redditdf.sort_values("timestamp", ascending=False)

redditdf

In [None]:
#dataprep
# Reshape to one row per date and one column per symbol (pivot_table
# aggregates duplicate Date/Symbol pairs with its default aggregation).
findf = (
    pd.DataFrame(findf)
    .pivot_table(index="Date", columns="Symbol", values="Value")
    .reset_index()
)

# Keep only the calendar date and order newest first.
findf["Date"] = pd.to_datetime(findf["Date"]).dt.date
findf = findf.sort_values(by="Date", ascending=False)

findf.head()

In [None]:
#sentiment analysis: label every post from its VADER compound score
# NOTE(review): the cut-offs are +/-0.5; the VADER README suggests +/-0.05 --
# confirm that the wider neutral band is intentional.
redditdf['VADER_class'] = redditdf['compound'].apply(lambda x: "Positive" if x > 0.5 else "Negative" if x < -0.5 else "Neutral")

# total number of posts, used as the denominator for the percentages
N = len(redditdf)

# Count each class once. reindex() guarantees that all three labels are
# present (0 when a class never occurs), so the scalar look-ups cannot fail.
# The counts are no longer stored under the name `all`, which shadowed the builtin.
class_order = ["Positive", "Neutral", "Negative"]
class_counts = redditdf['VADER_class'].value_counts().reindex(class_order, fill_value=0)
pos = int(class_counts["Positive"])
neu = int(class_counts["Neutral"])
neg = int(class_counts["Negative"])

# share of each class in percent of all posts
all1 = pd.DataFrame({"class": class_order,
                     "num": [(pos/N)*100, (neu/N)*100, (neg/N)*100]})

#bar chart of the class shares
sns.barplot(x = "class", y = "num", color = "blue", data = all1)
plt.ylabel("%")
plt.xlabel("classes")
plt.show()

# pos/neg/neu are plain ints here, so this prints numbers instead of Series reprs
print("\tPositive in %: ", (pos/N)*100, "\n\tNegative in %: ", (neg/N)*100, "\n\tNeutral in %: ", (neu/N)*100)

redditdf.head()

In [None]:
#indices over time
# shared date axis and the three price series
x_axis1 = findf["Date"]
sym1, sym2, sym3 = findf["GME"], findf["MSCI_WORLD"], findf["SPY"]

#plot -> one line per index, drawn in the same order as the legend entries
fig, ax = plt.subplots()
for series, legend_name, line_color in (
    (sym1, "GME", "orchid"),
    (sym2, "MSCI-World", "crimson"),
    (sym3, "SPY", "palegreen"),
):
    ax.plot(x_axis1, series, label=legend_name, color=line_color, linewidth=2)
plt.ylabel("in US$")
plt.xlabel("timeline")

#one major tick every second month
loc = MonthLocator(interval=2)
ax.xaxis.set_major_locator(loc)

plt.setp(ax.get_xticklabels(), rotation=45)
plt.legend()
plt.show()

In [None]:
#grouping values
# Posts per day and sentiment class: one row per day, one integer column per
# class, with 0 where a class has no posts on that day.
daily_class = (
    redditdf.groupby(["timestamp", "VADER_class"])
    .size()
    .unstack(fill_value=0)
    .astype(int)
    .reset_index()
)

# Daily counts expressed in percent of the total number of posts N.
pos, neu, neg = (
    daily_class[label] / N * 100 for label in ("Positive", "Neutral", "Negative")
)

In [None]:
#merging the csv-files
# Join posts and prices on the calendar day (pandas' default join type).
merge_df = redditdf.merge(findf, left_on="timestamp", right_on="Date")

#merge_df.head(-10)

In [None]:
# Inspect the column dtypes of the merged frame (display only, nothing is assigned).
merge_df.dtypes

In [None]:
#creating column "delta" to calculate with --> Day0 = 2020-12-08; Day752 = 2022-12-30
# "delta" is the number of days between each row's date and the earliest date.
first_day = min(merge_df["timestamp"])
elapsed_days = (merge_df["timestamp"] - first_day).dt.days
merge_df["delta"] = pd.to_numeric(elapsed_days, downcast="integer")

# Force the three sentiment score columns to float.
for score_col in ("pos", "neu", "neg"):
    merge_df[score_col] = pd.to_numeric(merge_df[score_col]).astype(float)

# Drop every row that still contains a missing value.
merge_df = merge_df.dropna()

merge_df.dtypes

In [None]:
#now the regression operators:
# Feature frame with the day counter and the compound score, without missing rows.
df = pd.DataFrame(
    {"Delta": merge_df["delta"], "Compound": merge_df["compound"]}
).dropna()

# Names of the feature columns.
features = df.dropna().columns.tolist()
df.dtypes

In [None]:
# Pairwise correlation of the numeric columns. merge_df also holds text and
# date columns; since pandas 2.0 corr() no longer drops those silently and
# raises instead, so the restriction to numeric columns is requested explicitly.
merge_df.corr(numeric_only=True)

In [None]:
# Correlation heatmap of the sentiment scores, the three price series and the day counter.
columns = ["neg", "neu", "pos", "compound", "GME", "MSCI_WORLD", "SPY", "delta"]

# rowvar=False treats each column as one variable (same as transposing first).
cm = np.corrcoef(merge_df[columns].to_numpy(), rowvar=False)
hm = heatmap(cm, row_names=columns, column_names=columns)
plt.show()

In [None]:
# Column names of the feature frame.
features = df.columns.tolist()

# Names of the price columns used as regression targets.
target1, target2, target3 = "GME", "MSCI_WORLD", "SPY"

In [None]:
#helper: extract columns from a frame as an array, optionally normalized
def get_data(df, features, normalize = False):
    """Return the selected column(s) of ``df`` as an array.

    ``features`` is passed to ``df.loc[:, features]``, so it may be a single
    column label or a list of labels. With ``normalize`` left False the
    selection is returned via ``to_numpy()``; otherwise it is passed through
    ``preprocessing.scale`` and that result is returned.
    """
    selected = df.loc[:, features]
    return preprocessing.scale(selected) if normalize else selected.to_numpy()

In [None]:
# Feature matrix: raw and scaled.
X, X_norm = get_data(df, features), get_data(df, features, normalize = True)

# One raw and one scaled target vector per symbol.
y1, y1_norm = get_data(merge_df, target1), get_data(merge_df, target1, normalize = True)
y2, y2_norm = get_data(merge_df, target2), get_data(merge_df, target2, normalize = True)
y3, y3_norm = get_data(merge_df, target3), get_data(merge_df, target3, normalize = True)

In [None]:
#scatterplots
# Pairwise scatter plots of sentiment, day counter and the three price series.
columns = ["compound", "delta", "GME", "MSCI_WORLD", "SPY"]
plot_values = merge_df[columns].to_numpy()

scatterplotmatrix(plot_values, names=columns, figsize=(15, 10))
plt.tight_layout()
plt.show()

In [None]:
#MSE (mean squared error), bias, variance using k-fold (k=5) for GME
# Module-level settings: regularization strength, the 5-fold splitter that the
# cross_validate functions read, and a ridge model with the same strength.
lam = 1.0
cv = KFold(n_splits=5)
reg = Ridge(alpha=lam)

def cross_validate(X, y1, lam=1.0):
    """Evaluate a ridge regression on every split of the module-level ``cv``.

    For each train/test split a ``Ridge(alpha=lam)`` model is fitted on the
    training rows and used to predict the test rows. Per fold three numbers
    are recorded: the mean squared error, the squared mean of
    (prediction - target), and the variance of the predictions.

    Prints the mean prediction and the mean coefficient of the LAST fold only.

    Returns a tuple ``(mean mse, mean bias, mean variance)`` over all folds.
    """
    model = Ridge(alpha=lam)
    fold_mse, fold_bias, fold_var = [], [], []

    for train_idx, test_idx in cv.split(X):
        model.fit(X[train_idx], y1[train_idx])
        predicted = model.predict(X[test_idx])
        actual = y1[test_idx]
        coefs = model.coef_

        fold_mse.append(mean_squared_error(actual, predicted))
        fold_bias.append(np.mean(predicted - actual)**2)
        fold_var.append(np.var(predicted))

    # predicted / coefs still hold the values of the final fold here
    print(np.mean(predicted), "\n", np.mean(coefs))

    return (np.mean(fold_mse), np.mean(fold_bias), np.mean(fold_var))

#k-fold cross validation with lambda = 1.0
# Run the evaluation on the scaled features and the scaled GME target.
mse1, bias1, variance1 = cross_validate(X_norm, y1_norm, 1)

# The stray bare `print` expression that stood here was a no-op and has been removed.
print("GME\nfor lambda = 1.0: \n\tmse=%f \n\tbias=%f \n\tvariance=%f"%(mse1,bias1,variance1))

In [None]:
#cv + reg
# NOTE(review): this only references the bound method `fit` of `reg`; nothing
# is called or fitted here -- confirm whether an actual call was intended.
reg.fit

In [None]:
#MSE (mean squared error), bias, variance using k-fold (k=5) for MSCI-World

def cross_validate(X, y2, lam=1.0):
    """Evaluate a ridge regression on every split of the module-level ``cv``.

    A ``Ridge(alpha=lam)`` model is refitted on the training rows of each
    split and scored on the held-out rows. Recorded per fold: mean squared
    error, squared mean of (prediction - target), variance of the predictions.

    Prints the mean prediction and the mean coefficient of the LAST fold only.

    Returns a tuple ``(mean mse, mean bias, mean variance)`` over all folds.
    """
    ridge = Ridge(alpha=lam)
    scores = {"mse": [], "bias": [], "var": []}

    for train, test in cv.split(X):
        ridge.fit(X[train], y2[train])
        y2_pred = ridge.predict(X[test])
        held_out = y2[test]
        beta = ridge.coef_

        scores["mse"].append(mean_squared_error(held_out, y2_pred))
        scores["bias"].append(np.mean(y2_pred - held_out)**2)
        scores["var"].append(np.var(y2_pred))

    # y2_pred / beta still hold the values of the final fold here
    print(np.mean(y2_pred), "\n", np.mean(beta))
    return (np.mean(scores["mse"]), np.mean(scores["bias"]), np.mean(scores["var"]))

#k-fold cross validation with lambda = 1.0
# Run the evaluation on the scaled features and the scaled MSCI World target.
msci_scores = cross_validate(X_norm, y2_norm, 1.0)
mse2, bias2, variance2 = msci_scores

print("MSCI_WORLD\nfor lambda = 1.0: \n\tmse=%f \n\tbias=%f \n\tvariance=%f" % (mse2, bias2, variance2))

In [None]:
#MSE (mean squared error), bias, variance using k-fold (k=5) for SPY

def cross_validate(X, y3, lam=1.0):
    """Evaluate a ridge regression on every split of the module-level ``cv``.

    Each split fits ``Ridge(alpha=lam)`` on the training rows and predicts the
    test rows. Per fold the mean squared error, the squared mean of
    (prediction - target) and the variance of the predictions are collected.

    Prints the mean prediction and the mean coefficient of the LAST fold only.

    Returns a tuple ``(mean mse, mean bias, mean variance)`` over all folds.
    """
    estimator = Ridge(alpha=lam)
    errors = []
    biases = []
    spreads = []

    for fit_rows, eval_rows in cv.split(X):
        estimator.fit(X[fit_rows], y3[fit_rows])
        y3_eval = y3[eval_rows]
        forecast = estimator.predict(X[eval_rows])
        weights = estimator.coef_

        errors.append(mean_squared_error(y3_eval, forecast))
        biases.append(np.mean(forecast - y3_eval)**2)
        spreads.append(np.var(forecast))

    # forecast / weights still hold the values of the final fold here
    print(np.mean(forecast), "\n", np.mean(weights))

    avg_mse = np.mean(errors)
    avg_bias = np.mean(biases)
    avg_var = np.mean(spreads)
    return (avg_mse, avg_bias, avg_var)

#k-fold cross validation with lambda = 1.0
# Run the evaluation on the scaled features and the scaled SPY target.
spy_scores = cross_validate(X_norm, y3_norm, 1.0)
mse3, bias3, variance3 = spy_scores

print("SPY: \nfor lambda = 1.0: \n\tmse=%f \n\tbias=%f \n\tvariance=%f" % (mse3, bias3, variance3))