# KNN classifier

### Target2: 10% increase in 5 days

In [1]:
import pandas as pd
import numpy as np
import time, datetime
import math
import matplotlib.pyplot as plt

### Gather data and create features

In [2]:
import pandas_datareader as web

  from pandas.util.testing import assert_frame_equal


In [3]:
tickers = ['VBIV', 'AGRX', 'VXRT', 'JAGX', 'MNKD', 'OPK', 'IMGN']

In [4]:
today = datetime.date.today()
start_date = '2018-01-01'
end_date = today

In [5]:
#The following function loads the data for each ticker and creates the features and target variables
#For more information about each feature see FeaturesEngineering.ipynb

def get_data(ticker):
    df = web.DataReader(ticker, data_source = 'yahoo', start = start_date, end = end_date)
    df = df.reset_index()
    df['Symbol'] = ticker
    
    df['Avg_Vol_20'] = -1
    for row in range(20, df.shape[0]):
        df.iloc[row, -1] = np.mean(df.iloc[row-20:row, df.columns.get_loc('Volume')])
    
    df['Vol_Ratio1'] = df['Volume'] / df['Avg_Vol_20']
    
    df['Open/Close'] = -1
    for row in range(1, df.shape[0]):
        df.iloc[row,-1] = df.iloc[row, df.columns.get_loc('Open')] / df.iloc[row-1, df.columns.get_loc('Close')]
    
    df['Low/Open'] = df['Low'] / df['Open']
    df['High/Open'] = df['High'] / df['Open']
    
    df['DailyLogReturn'] = -1
    for row in range(1, df.shape[0]):
        df.iloc[row, -1] = math.log(df.iloc[row, df.columns.get_loc('Close')] / df.iloc[row-1, df.columns.get_loc('Close')])

    df['SquaredDailyLogReturn'] = df['DailyLogReturn']**2
    
    df['SMA10'] = -1
    for row in range(10, df.shape[0]):
        df.iloc[row, -1] = np.average(df.iloc[row-10:row, df.columns.get_loc('Close')])
    
    df['SMA30'] = -1
    for row in range(30, df.shape[0]):
        df.iloc[row, -1] = np.average(df.iloc[row-30:row, df.columns.get_loc('Close')])
        
    df['SMA_Ratio'] = df['SMA30'] / df['SMA10']
    
    Smoothing = 2
    Days = 12
    df['EMA12'] = -1
    #The first observation of EMA will not a have a previous EMA so we will use a 12-day SMA
    df.iloc[12, -1] = (df.iloc[row, df.columns.get_loc('Close')] * Smoothing / (1 + Days)
                       + (np.average(df.iloc[0:12, df.columns.get_loc('Close')]) * (1 - (Smoothing / (1 + Days)))))

    #For the remaining observations, we will use the previous EMA as in the formula
    for row in range(13, df.shape[0]):
        df.iloc[row, -1] = (df.iloc[row, df.columns.get_loc('Close')] * Smoothing / (1 + Days)
                       + (df.iloc[row - 1, -1]) * (1 - (Smoothing / (1 + Days))))    
    
    df['StochasticOscillator'] = -1
    for row in range(13, df.shape[0]):
        df.iloc[row, -1] = ((df.iloc[row, df.columns.get_loc('Close')] - np.min(df.iloc[row-13:row+1, df.columns.get_loc('Low')]))
                            / (np.max(df.iloc[row-13:row+1, df.columns.get_loc('High')]) - np.min(df.iloc[row-13:row+1, df.columns.get_loc('Low')]))
                            * 100)
        
    #Target1: Binary- Up(1) or Down(0) movement, closing price to closing price
    df['Target1'] = -1
    for row in range(0, df.shape[0]-1):
        if df.iloc[row, df.columns.get_loc('Close')] < df.iloc[row+1, df.columns.get_loc('Close')]:
            df.iloc[row, -1] = 1
        else:
            df.iloc[row, -1] = 0
            
            
    #Target2: Binary: (1)Price increases by 10% withing 5 days, or not(0)
    df['Target2'] = -1
    for row in range(0, df.shape[0]-4):
        if df.iloc[row, df.columns.get_loc('Close')] * 1.1 < np.max(df.iloc[row+1:row+6, df.columns.get_loc('High')]):
            df.iloc[row, -1] = 1
        else:
            df.iloc[row, -1] = 0
            
            
    #Target3: Multiclassification: Next days movement
    df['Target3'] = -1
    for row in range(df.shape[0] - 1):
        current_close = df.iloc[row, df.columns.get_loc('Close')]
        next_close = df.iloc[row + 1, df.columns.get_loc('Close')]
        ratio = next_close / current_close

        if ratio <= 0.9:
            df.iloc[row, -1] = 1
        elif ratio <= 0.95:
            df.iloc[row, -1] = 2
        elif ratio <= 1:
            df.iloc[row, -1] = 3
        elif ratio <= 1.05:
            df.iloc[row, -1] = 4
        elif ratio <= 1.1:
            df.iloc[row, -1] = 5
        else:
            df.iloc[row, -1] = 6
        
    df = df[(df['Avg_Vol_20'] != -1) & (df['Open/Close'] != -1) & (df['SMA10'] != -1) & (df['SMA30'] != -1)
            & (df['EMA12'] != -1) & (df['StochasticOscillator'] != -1) & (df['Target1'] != -1)
            & (df['Target2'] != -1) & (df['Target3'] != -1)]
            
    return df 
    

In [6]:
start_time = time.time()

df = pd.DataFrame()

for ticker in tickers:
    sub_df = get_data(ticker)
    df = pd.concat([df, sub_df])
    
print(time.time() - start_time)

24.36577558517456


### Target2 Count

In [7]:
df[['Target2', 'Symbol']].groupby('Target2').count()

Unnamed: 0_level_0,Symbol
Target2,Unnamed: 1_level_1
0,2700
1,1514


### Train Test Split

In [8]:
#X_features = list(df.columns)[8:-3]
X_features = (['Vol_Ratio1',
             'Open/Close',
             'Low/Open',
             'High/Open',
             'DailyLogReturn',
             'SquaredDailyLogReturn',
             'SMA_Ratio',
             'StochasticOscillator'])
y_feature = 'Target2'

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df[X_features], df[y_feature], random_state = 0)

In [11]:
print(X_train.shape)

(3160, 8)


### Performance Metrics

In [12]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix

In [13]:
def performance(model, X_train, X_test, y_train, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    
    print('Model train accuracy: %0.3f' % accuracy_score(y_train, y_pred_train))
    print('Model test accuracy: %0.3f' % accuracy_score(y_test, y_pred_test))
    
    
    
    #confusion matrix
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred_test))
    
    print('')
    #accuracy
    print('Accuracy score: ', accuracy_score(y_test, y_pred_test))
    
    print(' ')
    #recall score
    print('Recall score: ', recall_score(y_test, y_pred_test))
    
    print(' ')
    #precision score
    print('Precision score: ', precision_score(y_test, y_pred_test))

### Scaling

In [14]:
from sklearn.preprocessing import scale

In [15]:
X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

### KNN Model without scaling

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [17]:
for k in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors = k)
    model1 = clf.fit(X_train, y_train)
    y_pred = model1.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    print(k)
    print('1 predictions: ', cm[0][1] + cm[1][1])
    print('Precision: %.3f' % precision_score(y_test, y_pred))


1
1 predictions:  359
Precision: 0.379
2
1 predictions:  119
Precision: 0.412
3
1 predictions:  290
Precision: 0.376
4
1 predictions:  125
Precision: 0.392
5
1 predictions:  225
Precision: 0.418
6
1 predictions:  108
Precision: 0.407
7
1 predictions:  202
Precision: 0.396
8
1 predictions:  115
Precision: 0.391
9
1 predictions:  181
Precision: 0.381
10
1 predictions:  109
Precision: 0.385


### KNN after scaling

In [18]:
model2 = clf.fit(X_train_scaled, y_train)

In [19]:
performance(model2, X_train_scaled, X_test_scaled, y_train, y_test)

Model train accuracy: 0.692
Model test accuracy: 0.642
Confusion Matrix:
[[620  65]
 [312  57]]

Accuracy score:  0.642314990512334
 
Recall score:  0.15447154471544716
 
Precision score:  0.4672131147540984


In [20]:
for k in range(1, 11):
    clf = KNeighborsClassifier(n_neighbors = k)
    model1 = clf.fit(X_train_scaled, y_train)
    y_pred = model1.predict(X_test_scaled)
    cm = confusion_matrix(y_test, y_pred)
    print(k)
    print('1 predictions: ', cm[0][1] + cm[1][1])
    print('Precision: %.3f' % precision_score(y_test, y_pred))

1
1 predictions:  385
Precision: 0.392
2
1 predictions:  150
Precision: 0.440
3
1 predictions:  303
Precision: 0.432
4
1 predictions:  155
Precision: 0.452
5
1 predictions:  260
Precision: 0.446
6
1 predictions:  149
Precision: 0.470
7
1 predictions:  231
Precision: 0.450
8
1 predictions:  137
Precision: 0.474
9
1 predictions:  200
Precision: 0.425
10
1 predictions:  122
Precision: 0.467


### Dummy Classifier

In [21]:
from sklearn.dummy import DummyClassifier

In [22]:
strategies = ['most_frequent', 'stratified', 'uniform'] 

In [23]:
for s in strategies:
    print(s)
    if s == 'constant':
        dclf = DummyClassifier(strategy = s, random_state = 0, constant = 'M').fit(X_train, y_train)
    else:
        dclf = DummyClassifier(strategy = s, random_state = 0).fit(X_train, y_train)
    performance(dclf, X_train, X_test, y_train, y_test)

most_frequent
Model train accuracy: 0.638
Model test accuracy: 0.650
Confusion Matrix:
[[685   0]
 [369   0]]

Accuracy score:  0.6499051233396584
 
Recall score:  0.0
 
Precision score:  0.0
stratified
Model train accuracy: 0.531
Model test accuracy: 0.581
Confusion Matrix:
[[464 221]
 [221 148]]

Accuracy score:  0.5806451612903226
 
Recall score:  0.4010840108401084
 
Precision score:  0.4010840108401084
uniform
Model train accuracy: 0.510
Model test accuracy: 0.509
Confusion Matrix:
[[348 337]
 [180 189]]

Accuracy score:  0.5094876660341556
 
Recall score:  0.5121951219512195
 
Precision score:  0.3593155893536122


  _warn_prf(average, modifier, msg_start, len(result))
