In [1]:
from preprocessing import *
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Normalizer

### Purpose: To investigate the predictive ability of only technical indicators using simple algorithms 
Conclusion: Technical indicators on this dataset can achieve an accuracy of about 52% to 53% using non-neural-network type classifiers. Individual technical indicators generally have a very weak correlation between their value and the resulting market close the next day and must be either analyzed together using a more comprehensive algorithm (such as a NN) or in conjunction with other data (sentiment analysis, quarterly reports, etc). In other words, I have no idea why technical indicators are even a thing since they don't seem to give an advantage in predictions. 

In [9]:
def prep_data(file, test_split, valid_split, drop_columns=[], scaler=MinMaxScaler()):
    df = combine_csvs_from_folder('market_data/merged_data', scaler)
    df = add_up_column(df)
    df["next_up"] = df["up"].shift(-1)
    df.dropna(inplace=True)
    df.drop(columns=drop_columns, inplace=True)
    X_column = df.columns[:-1]
    y_column = df.columns[-1]
    X = df[X_column].to_numpy()
    y = df[y_column].to_numpy().astype(int)
    X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size=test_split, shuffle=True, random_state=42)
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size=valid_split, shuffle=True, random_state=52)
    print("Features in X_data: \n\t", end="")
    [print(c, end=", ") for c in X_column]
    print()
    print(f"length of training data: {y_train.size}")
    print(f"length of validation data: {y_valid.size}")
    print(f"length of testing data: {y_test.size}")
    return X_column, X_train, X_valid, X_test, y_train, y_valid, y_test

features_to_drop = ["1. open", "2. high", "3. low", "4. close", "5. adjusted close", "6. volume", "7. dividend amount", "8. split coefficient", "up", "company"]
X_column, X_train, X_valid, X_test, y_train, y_valid, y_test = prep_data('market_data/merged_data', 0.2, 0.2, drop_columns=features_to_drop, scaler=MinMaxScaler())

Features in X_data: 
	Chaikin A/D, ADOSC, ADX, ADXR, APO, Aroon Down, Aroon Up, AROONOSC, ATR, Real Upper Band, Real Middle Band, Real Lower Band, BOP, CCI, CMO, DEMA, DX, EMA, DCPERIOD, HT_DCPHASE, PHASE, QUADRATURE, SINE, LEAD SINE, HT_TRENDLINE, TRENDMODE, KAMA, MACD, MACD_Signal, MACD_Hist, MACD.1, MACD_Signal.1, MACD_Hist.1, MAMA, FAMA, MFI, MIDPOINT, MIDPRICE, MINUS_DI, MINUS_DM, MOM, NATR, OBV, PLUS_DI, PLUS_DM, PPO, ROC, ROCR, RSI, SAR, SMA, SlowK, SlowD, FastK, FastD, FastK.1, FastD.1, T3, TEMA, TRANGE, TRIMA, TRIX, ULTOSC, WILLR, WMA, 
length of training data: 27560
length of validation data: 6890
length of testing data: 8613


# Testing individual Tech Indicators with Logistic Regresssion

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

n_samples, n_features = X_train.shape
for feature in range(n_features):
    x = X_train[:, feature].reshape(-1,1)
    model = LogisticRegression()
    model.fit(x, y_train)
    y_pred = model.predict(X_test[:, feature].reshape(-1,1))
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy using only {X_column[feature]} is {acc}")

Accuracy using only Chaikin A/D is 0.520840589806107
Accuracy using only ADOSC is 0.520840589806107
Accuracy using only ADX is 0.5209566933704864
Accuracy using only ADXR is 0.520840589806107
Accuracy using only APO is 0.5188668292116568
Accuracy using only Aroon Down is 0.520840589806107
Accuracy using only Aroon Up is 0.520840589806107
Accuracy using only AROONOSC is 0.520840589806107
Accuracy using only ATR is 0.520840589806107
Accuracy using only Real Upper Band is 0.520840589806107
Accuracy using only Real Middle Band is 0.520840589806107
Accuracy using only Real Lower Band is 0.520840589806107
Accuracy using only BOP is 0.5189829327760362
Accuracy using only CCI is 0.5207244862417276
Accuracy using only CMO is 0.5230465575293162
Accuracy using only DEMA is 0.520840589806107
Accuracy using only DX is 0.5218855218855218
Accuracy using only EMA is 0.520840589806107
Accuracy using only DCPERIOD is 0.520840589806107
Accuracy using only HT_DCPHASE is 0.520840589806107
Accuracy using on

## Simple Classifiers

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier, Perceptron, RidgeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

models = [KNeighborsClassifier(),
          LogisticRegression(),
          SGDClassifier(penalty="l1"),
          Perceptron(),
          RidgeClassifier(),
          LinearSVC(),
          DecisionTreeClassifier()
          ]
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    acc = accuracy_score(y_valid, y_pred)
    print(f"Accuracy using {model}: {acc}")

Accuracy using KNeighborsClassifier(): 0.5005805515239478


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy using LogisticRegression(): 0.5245283018867924
Accuracy using SGDClassifier(penalty='l1'): 0.5230769230769231
Accuracy using Perceptron(): 0.5182873730043541
Accuracy using RidgeClassifier(): 0.5206095791001452
Accuracy using LinearSVC(): 0.5230769230769231
Accuracy using DecisionTreeClassifier(): 0.5105950653120465


## Testing Various SVM

In [14]:
from sklearn.svm import SVC
from itertools import product

c_vals = [0.01, 0.1, 1, 10]
kernels = ["linear", "poly", "rbf", "sigmoid"]
for c,k in product(c_vals, kernels):
    model = SVC(C=c, kernel=k, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy using SVC with c={c} and k={k}: {acc}")



Accuracy using SVC with c=0.01 and k=linear: 0.5062115406942993




Accuracy using SVC with c=0.01 and k=poly: 0.5201439684198305




Accuracy using SVC with c=0.01 and k=rbf: 0.5186346220828979




Accuracy using SVC with c=0.01 and k=sigmoid: 0.5151515151515151




Accuracy using SVC with c=0.1 and k=linear: 0.5166608614884477




Accuracy using SVC with c=0.1 and k=poly: 0.520840589806107




Accuracy using SVC with c=0.1 and k=rbf: 0.5233948682224544
Accuracy using SVC with c=0.1 and k=sigmoid: 0.4836874492046906




Accuracy using SVC with c=1 and k=linear: 0.5197956577266922




Accuracy using SVC with c=1 and k=poly: 0.5178218971322419




Accuracy using SVC with c=1 and k=rbf: 0.5158481365377917
Accuracy using SVC with c=1 and k=sigmoid: 0.4841518634622083




Accuracy using SVC with c=10 and k=linear: 0.48566120979914085




Accuracy using SVC with c=10 and k=poly: 0.4935562521769418




Accuracy using SVC with c=10 and k=rbf: 0.5095785440613027
Accuracy using SVC with c=10 and k=sigmoid: 0.48438407059096716


In [15]:
from sklearn.svm import SVC
from itertools import product

c_vals = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1]
kernels = ["poly"]
for c,k in product(c_vals, kernels):
    model = SVC(C=c, kernel=k, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy using SVC with c={c} and k={k}: {acc}")



Accuracy using SVC with c=1e-05 and k=poly: 0.5179380006966214




Accuracy using SVC with c=0.0001 and k=poly: 0.5194473470335539




Accuracy using SVC with c=0.001 and k=poly: 0.5187507256472774




Accuracy using SVC with c=0.01 and k=poly: 0.5201439684198305




Accuracy using SVC with c=0.1 and k=poly: 0.520840589806107




Accuracy using SVC with c=1 and k=poly: 0.5178218971322419
