In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from textblob import TextBlob
from datetime import datetime, timedelta, date
from statistics import mean, median, mode

## Stocks Data

In [2]:
# Load the daily S&P 500 price history from a CSV one directory above the notebook.
sp500 = pd.read_csv('../GSPC.csv')

# Day-over-day change in the closing price, plus a flag for "closed higher than the
# previous row". A change of exactly 0 is flagged False (strict `> 0`).
# No try/except here on purpose: if 'Close' is missing or non-numeric the notebook
# should stop with the real error instead of continuing without these columns.
sp500['Close_Diff'] = pd.to_numeric(sp500['Close']).diff()
sp500['Close_Diff_Increase'] = sp500['Close_Diff'] > 0

# The first row has no previous close, so its diff is NaN; drop it.
sp500 = sp500[sp500['Close_Diff'].notna()]

In [13]:
# Preview the first five rows, including the derived Close_Diff columns.
sp500.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Close_Diff,Close_Diff_Increase
1,2017-01-04,2261.600098,2272.820068,2261.600098,2270.75,2270.75,3764890000,12.919922,True
2,2017-01-05,2268.179932,2271.5,2260.449951,2269.0,2269.0,3761820000,-1.75,False
3,2017-01-06,2271.139893,2282.100098,2264.060059,2276.97998,2276.97998,3339890000,7.97998,True
4,2017-01-09,2273.590088,2275.48999,2268.899902,2268.899902,2268.899902,3217610000,-8.080078,False
5,2017-01-10,2269.719971,2279.27002,2265.27002,2268.899902,2268.899902,3638790000,0.0,False


## Baselines - Classifiers

In [3]:
def average(results):
    """Return the arithmetic mean of ``results``.

    ``results`` must support ``sum()`` and ``len()``. Booleans count as 0/1, so
    for a sequence of correct/incorrect flags the result is the plain accuracy.
    """
    total = sum(results)
    count = len(results)
    return total / count

def weighted_average(results, weights):
    """Return the mean of ``results`` weighted by the magnitude of ``weights``.

    Each entry of ``results`` contributes in proportion to ``abs(weight)``, so the
    sign of a weight is ignored. ``results`` must provide a ``.dot`` method (for
    example a NumPy array or pandas Series) and ``weights`` must support ``abs()``.
    """
    magnitudes = abs(weights)
    return results.dot(magnitudes) / sum(magnitudes)

### Predicting the most common value

In [4]:
# Share of rows flagged as an increase, i.e. the accuracy of always guessing "increase".
# The weighted variant weights each day by the size of its price change.
went_up = sp500['Close_Diff_Increase']
print('Average: {}'.format(average(went_up)))
print('Weighted Average: {}'.format(weighted_average(went_up, sp500['Close_Diff'])))

Average: 0.5561797752808989
Weighted Average: 0.5369783780919439


### Predicting the previous day's direction as the current day's direction (last value prediction)

In [5]:
# Guess that each day moves in the same direction as the day before it.
direction = sp500['Close_Diff_Increase']
predictions = direction[:-1]  # flag of day t-1 ...
actual = direction[1:]        # ... compared against the flag of day t

# Compare by position (.values) so the differing index labels do not matter.
hits = predictions.values == actual.values
print('Average: {}'.format(average(hits)))
print('Weighted Average: {}'.format(weighted_average(hits, sp500['Close_Diff'][1:])))

Average: 0.49507735583684953
Weighted Average: 0.5254967231767075


### Using the most common (mode) direction of the past n days to predict the current day's direction

In [6]:
# Window sizes 3, 5, 7, 9 are all odd, so a vote between two values cannot tie.
for n in range(3, 10, 2):
    # Most common up/down flag in each n-day window ...
    majority = sp500['Close_Diff_Increase'].rolling(window=n).apply(mode, raw=False)
    # ... shifted one row so day t is predicted only from days t-n .. t-1.
    predictions = majority.shift()[n:]
    actual = sp500['Close_Diff_Increase'][n:]

    hits = predictions.values == actual.values
    print('Last {} days average: {}'.format(n, average(hits)))
    print('Last {} days weighted average: {}'.format(n, weighted_average(hits, sp500['Close_Diff'][n:])))

Last 3 days average: 0.5162200282087447
Last 3 days weighted average: 0.576639607421786
Last 5 days average: 0.5233380480905233
Last 5 days weighted average: 0.548734263125999
Last 7 days average: 0.5148936170212766
Last 7 days weighted average: 0.5592074739288141
Last 9 days average: 0.5078236130867709
Last 9 days weighted average: 0.5449829716887348


In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the random train/test split, and therefore the printed scores,
# are the same on every Restart & Run All.
SPLIT_SEED = 42

for n in range(1, 6):
    # Feature 'T-x' is the up/down flag x rows before each target row.
    df_dict = {
        f'T-{x}': sp500['Close_Diff_Increase'][n - x:-x].values
        for x in range(1, n + 1)
    }

    X = pd.DataFrame(df_dict)
    # Keep Close_Diff next to the label so the test rows can be weighted by move size.
    Y = sp500[['Close_Diff', 'Close_Diff_Increase']][n:]
    # NOTE(review): train_test_split shuffles by default, so test days are interleaved
    # with training days. Consider shuffle=False for a chronological hold-out.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=SPLIT_SEED
    )

    classifier = KNeighborsClassifier()
    classifier.fit(X_train, Y_train['Close_Diff_Increase'])

    predictions = classifier.predict(X_test)
    hits = predictions == Y_test['Close_Diff_Increase']
    print('Last {} days average: {}'.format(n, average(hits)))
    print('Last {} days weighted average: {}'.format(n, weighted_average(hits, Y_test['Close_Diff'])))

Last 1 days average: 0.46853146853146854
Last 1 days weighted average: 0.46539986118899396
Last 2 days average: 0.5140845070422535
Last 2 days weighted average: 0.536446687839115
Last 3 days average: 0.4647887323943662
Last 3 days weighted average: 0.47467517602704856
Last 4 days average: 0.47183098591549294
Last 4 days weighted average: 0.4898147137779147
Last 5 days average: 0.5211267605633803
Last 5 days weighted average: 0.5621540379784321


### Using a time series based linear regression without news data and with the previous n days as features

## Baselines - Regression

In [8]:
from sklearn.metrics import mean_squared_error

### Predicting the mean and the median

In [9]:
# Constant predictors: guess the overall mean (or median) daily change for every row.
actual = sp500['Close_Diff']
n_days = len(actual)
mean_predictions = [mean(actual)] * n_days
median_predictions = [median(actual)] * n_days

mean_mse = mean_squared_error(actual, mean_predictions)
median_mse = mean_squared_error(actual, median_predictions)
print('Mean: {}'.format(mean_mse))
print('Median: {}'.format(median_mse))

Mean: 493.901657491231
Median: 494.22631884436794


### Predicting the previous day's price change as the current day's price change (last value prediction)

In [10]:
# Guess that each day's price change equals the previous day's change, scored by MSE.
price_change = sp500['Close_Diff']
predictions = price_change[:-1]
actual = price_change[1:]
# Compare by position (.values) because the two slices carry different index labels.
mean_squared_error(actual.values, predictions.values)

1021.7089531444352

### Using a moving mean/median of price changes for the past n days to predict the current day's price change

In [11]:
# Same experiment for both aggregators: predict day t's change from the rolling
# mean (or median) of the n previous changes, then score with MSE.
for header, aggregate in (('Mean: ', mean), ('\nMedian: ', median)):
    print(header)
    for n in range(2, 10):
        rolled = sp500['Close_Diff'].rolling(window=n).apply(aggregate, raw=False)
        # Shift one row so the window covers days t-n .. t-1, never day t itself.
        predictions = rolled.shift()[n:]
        actual = sp500['Close_Diff'][n:]
        print('Last {} days: {}'.format(n, mean_squared_error(actual.values, predictions.values)))

Mean: 
Last 2 days: 776.797729173118
Last 3 days: 655.1352998934882
Last 4 days: 621.9076860565056
Last 5 days: 607.6424449137635
Last 6 days: 590.5040267915564
Last 7 days: 566.0162416813055
Last 8 days: 572.2019222251441
Last 9 days: 559.4475981465187

Median: 
Last 2 days: 776.797729173118
Last 3 days: 679.1258591807976
Last 4 days: 634.805955401581
Last 5 days: 632.9022678013438
Last 6 days: 600.6223069411042
Last 7 days: 593.580351209557
Last 8 days: 587.4457761683861
Last 9 days: 565.7323924907972


### Using a time series based linear regression without news data and with the previous n days as features

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Fixed seed so the random train/test split, and therefore the printed errors,
# are the same on every Restart & Run All.
SPLIT_SEED = 42

for n in range(1, 6):
    # Feature 'T-x' is the price change x rows before each target row.
    df_dict = {
        f'T-{x}': sp500['Close_Diff'][n - x:-x].values
        for x in range(1, n + 1)
    }

    X = pd.DataFrame(df_dict)
    Y = sp500['Close_Diff'][n:]
    # NOTE(review): train_test_split shuffles by default, so test days are interleaved
    # with training days. Consider shuffle=False for a chronological hold-out.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=SPLIT_SEED
    )

    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    predictions = regressor.predict(X_test)
    print('Last {} days: {}'.format(n, mean_squared_error(Y_test, predictions)))

Last 1 days: 573.6634281166897
Last 2 days: 555.7812001678834
Last 3 days: 529.5318518468746
Last 4 days: 446.5226402731071
Last 5 days: 448.06095940068275
