In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from datetime import datetime, timedelta, date
from statistics import mean, median, mode

## Stocks Data

In [2]:
sp500 = pd.read_csv('GSPC.csv')

try:
    sp500['Close_Diff'] = pd.to_numeric(sp500['Close']).diff()
    sp500['Close_Diff_Increase'] = sp500['Close_Diff'] > 0
except:
    print(f'Unable to calculate diff for {col}')
    
sp500 = sp500[~sp500['Close_Diff'].isna()]

## Baselines - Classifiers

In [113]:
def average(results):
    return sum(results) / len(results)

def weighted_average(results, weights):
    return results.dot(abs(weights)) / sum(abs(weights))

### Predicting most common value

In [114]:
print('Average: {}'.format(average(sp500['Close_Diff_Increase'])))
print('Weighted Average: {}'.format(weighted_average(sp500['Close_Diff_Increase'], sp500['Close_Diff'])))

Average: 0.5561797752808989
Weighted Average: 0.5369783780919439


### Predicting the previous day's price as the current day's price (last value prediction)

In [116]:
predictions = sp500['Close_Diff_Increase'][:-1]
actual = sp500['Close_Diff_Increase'][1:]
print('Average: {}'.format(average(predictions.values == actual.values)))
print('Weighted Average: {}'.format(weighted_average(predictions.values == actual.values, sp500['Close_Diff'][1:])))

Average: 0.49507735583684953
Weighted Average: 0.5254967231767075


### Using a moving average of prices for the past n days to predict the current day's price

In [117]:
for n in range(3, 10, 2):
    predictions = sp500['Close_Diff_Increase'].rolling(window=n).apply(mode, raw=False)[n:]
    actual = sp500['Close_Diff_Increase'][n:]
    print('Last {} days average: {}'.format(n, average(predictions.values == actual.values)))
    print('Last {} days weighted average: {}'.format(n, weighted_average(predictions.values == actual.values, 
                                                                         sp500['Close_Diff'][n:])))

Last 3 days average: 0.7489421720733427
Last 3 days weighted average: 0.7600737223062531
Last 5 days average: 0.6888260254596889
Last 5 days weighted average: 0.7150582020642741
Last 7 days average: 0.649645390070922
Last 7 days weighted average: 0.6753982617038242
Last 9 days average: 0.6344238975817923
Last 9 days weighted average: 0.6578163783400869


### Using a time series based linear regression without news data and with the previous n days as features

In [120]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

for n in range(1, 6):
    df_dict = {}
    for x in range(1, n+1):
        df_dict[f'T-{x}'] = sp500['Close_Diff_Increase'][n-x:-x].values

    X = pd.DataFrame(df_dict)
    Y = sp500[['Close_Diff', 'Close_Diff_Increase']][n:]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    classifier = KNeighborsClassifier()
    classifier.fit(X_train, Y_train['Close_Diff_Increase'])

    predictions = classifier.predict(X_test)
    print('Last {} days average: {}'.format(n, average(predictions == Y_test['Close_Diff_Increase'])))
    print('Last {} days weighted average: {}'.format(n, weighted_average(predictions == Y_test['Close_Diff_Increase'], 
                                                                         Y_test['Close_Diff'])))

Last 1 days average: 0.5104895104895105
Last 1 days weighted average: 0.47165592590914407
Last 2 days average: 0.4295774647887324
Last 2 days weighted average: 0.4367610877005068
Last 3 days average: 0.5563380281690141
Last 3 days weighted average: 0.6350036641900725
Last 4 days average: 0.4507042253521127
Last 4 days weighted average: 0.5286900296014021
Last 5 days average: 0.5211267605633803
Last 5 days weighted average: 0.5026746712126555


## Baselines - Regression

In [58]:
from sklearn.metrics import mean_squared_error

### Predicting the average

In [59]:
mean_predictions = [mean(sp500['Close_Diff'])] * len(sp500['Close_Diff'])
median_predictions = [median(sp500['Close_Diff'])] * len(sp500['Close_Diff'])
actual = sp500['Close_Diff']

print('Mean: {}'.format(mean_squared_error(actual, mean_predictions)))
print('Median: {}'.format(mean_squared_error(actual, median_predictions)))

Mean: 493.901657491231
Median: 494.22631884436794


### Predicting the previous day's price as the current day's price (last value prediction)

In [60]:
predictions = sp500['Close_Diff'][:-1]
actual = sp500['Close_Diff'][1:]
mean_squared_error(actual.values, predictions.values)

1021.7089531444352

### Using a moving average of prices for the past n days to predict the current day's price

In [61]:
print('Mean: ')
for n in range(2, 10):
    predictions = sp500['Close_Diff'].rolling(window=n).apply(mean, raw=False)[n:]
    actual = sp500['Close_Diff'][n:]
    print('Last {} days: {}'.format(n, mean_squared_error(actual.values, predictions.values)))
    
print('\nMedian: ')
for n in range(2, 10):
    predictions = sp500['Close_Diff'].rolling(window=n).apply(median, raw=False)[n:]
    actual = sp500['Close_Diff'][n:]
    print('Last {} days: {}'.format(n, mean_squared_error(actual.values, predictions.values)))

Mean: 
Last 2 days: 255.71121798387588
Last 3 days: 345.726783636987
Last 4 days: 368.86790648995844
Last 5 days: 398.57695766023784
Last 6 days: 422.55421429955135
Last 7 days: 434.39174530877267
Last 8 days: 433.9654911990206
Last 9 days: 452.66522272932167

Median: 
Last 2 days: 255.71121798387588
Last 3 days: 425.8621212300155
Last 4 days: 392.72963122142664
Last 5 days: 431.7186368389824
Last 6 days: 453.93635541924357
Last 7 days: 462.5231118428118
Last 8 days: 461.58090617815
Last 9 days: 486.0228007282509


### Using a time series based linear regression without news data and with the previous n days as features

In [96]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

for n in range(1, 6):
    df_dict = {}
    for x in range(1, n+1):
        df_dict[f'T-{x}'] = sp500['Close_Diff'][n-x:-x].values

    X = pd.DataFrame(df_dict)
    Y = sp500['Close_Diff'][n:]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    predictions = regressor.predict(X_test)
    print('Last {} days: {}'.format(n, mean_squared_error(Y_test, predictions)))

Last 1 days: 547.6548125266916
Last 2 days: 418.53244808650146
Last 3 days: 576.5804882767417
Last 4 days: 515.8668751824003
Last 5 days: 635.0937395815446
