In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import quandl
import numpy as np
import pandas as pd
import pickle

In [34]:
# Load dictionary of stock prices
with open('stock_prices_dict.pickle', 'rb') as f:
    stock_prices_dict = pickle.load(f)

In [28]:
# Extracting stock prices for Wipro
historical_stock_price = stock_prices_dict["WIPRO"]
historical_stock_price.head()
historical_stock_price.shape


Unnamed: 0_level_0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1998-03-20,651.0,682.0,651.0,,,,
1998-03-23,690.0,693.95,690.0,691.5,691.5,400.0,2.76
1998-03-24,710.0,715.0,693.05,706.0,702.85,8500.0,59.83
1998-03-25,722.0,740.95,722.0,727.0,730.1,4450.0,32.52
1998-03-26,734.95,738.0,718.55,723.0,723.0,2800.0,20.4


(4929, 7)

In [59]:
# Extract data for last 1 year
historical_stock_price["date"] = pd.to_datetime(historical_stock_price.index)

In [60]:
stock_price_sub = historical_stock_price[historical_stock_price["date"] >= pd.to_datetime("2010-11-01")]

In [61]:
stock_price_sub.head(3)
stock_price_sub.shape

Unnamed: 0_level_0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs),date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-11-01,428.0,428.0,422.05,425.0,423.7,984965.0,4186.94,2010-11-01
2010-11-02,425.05,436.6,422.5,434.95,433.85,1442069.0,6200.08,2010-11-02
2010-11-03,436.15,438.45,431.5,436.4,434.7,1031618.0,4494.58,2010-11-03


(1774, 8)

In [62]:
# Get closing day prices
closing_prices = stock_price_sub["Close"].values
closing_prices[0:10]

array([ 423.7 ,  433.85,  434.7 ,  436.75,  438.05,  431.2 ,  431.65,
        436.55,  428.65,  425.2 ])

In [63]:
# Previous day prices as features
# consider last 6 day prices

time_lag = 5
lag_prices = []
for i in range(time_lag,len(closing_prices)):
    lag_prices.append(closing_prices[(i-(time_lag)):i])


In [64]:
lag_prices[0]
lag_prices[1]

array([ 423.7 ,  433.85,  434.7 ,  436.75,  438.05])

array([ 433.85,  434.7 ,  436.75,  438.05,  431.2 ])

In [65]:
# Lag prices as features
features = pd.DataFrame(lag_prices)
features.reset_index(drop=True,inplace=True)
features.columns = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag+1))))

In [66]:
# Create Training data
train_df = pd.DataFrame({"target_var": closing_prices[(time_lag):]})
train_df.reset_index(drop=True,inplace=True)
train_df = pd.concat([features,train_df], axis=1)
train_df.head(12)

Unnamed: 0,t-1,t-2,t-3,t-4,t-5,target_var
0,423.7,433.85,434.7,436.75,438.05,431.2
1,433.85,434.7,436.75,438.05,431.2,431.65
2,434.7,436.75,438.05,431.2,431.65,436.55
3,436.75,438.05,431.2,431.65,436.55,428.65
4,438.05,431.2,431.65,436.55,428.65,425.2
5,431.2,431.65,436.55,428.65,425.2,433.2
6,431.65,436.55,428.65,425.2,433.2,421.75
7,436.55,428.65,425.2,433.2,421.75,418.4
8,428.65,425.2,433.2,421.75,418.4,400.85
9,425.2,433.2,421.75,418.4,400.85,418.3


In [67]:
# Convert raw scores to percentages
perct_changes = train_df.apply(lambda x: [100.0 * a1 / a2 - 100 for a1, a2 in zip(x[1:], x)], axis = 1)
perct_changes = list(map(lambda x: list(x), perct_changes))
perct_changes = pd.DataFrame(perct_changes)
col_names = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag))))
col_names.append("target_var")
perct_changes.columns = col_names

perct_changes.head(3)

Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,2.395563,0.19592,0.47159,0.297653,-1.563748
1,0.19592,0.47159,0.297653,-1.563748,0.10436
2,0.47159,0.297653,-1.563748,0.10436,1.135179


In [68]:
# Convert percent change to category
# compute percentile change quantiles
daily_perct_change = [100.0 * a1 / a2 - 100 for a1, a2 in zip(closing_prices[1:], closing_prices)]

percentiles = list(map(lambda x:np.nanpercentile(np.array(daily_perct_change),q = x),
                       [20,40,60,80]))

percentiles

[-1.0445217980848072,
 -0.23325565347734309,
 0.38151286983717236,
 1.1188100583758032]

In [69]:
# Function to convert %change to category
def change_perct_change_to_cat(raw_perct_change,percentiles):
    category = np.where(raw_perct_change < percentiles[0], "Cat1",
                        np.where(raw_perct_change < percentiles[1],"Cat2",
                                 np.where(raw_perct_change < percentiles[2], "Cat3",
                                          np.where(raw_perct_change < percentiles[3], "Cat4","Cat5"))))
    return category


In [70]:
# Check function
change_perct_change_to_cat(1,percentiles)

array('Cat4',
      dtype='<U4')

In [71]:
# Apply function on all columns
perct_changes_cat = perct_changes.apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
perct_changes_cat.head(3)
perct_changes.head(3)

Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,Cat5,Cat3,Cat4,Cat3,Cat1
1,Cat3,Cat4,Cat3,Cat1,Cat3
2,Cat4,Cat3,Cat1,Cat3,Cat5


Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,2.395563,0.19592,0.47159,0.297653,-1.563748
1,0.19592,0.47159,0.297653,-1.563748,0.10436
2,0.47159,0.297653,-1.563748,0.10436,1.135179


In [73]:
# 1-hot-encoding of categorical predictors

X = pd.get_dummies(perct_changes_cat.drop(labels=["target_var"],axis=1))
X.head(3)
X.shape
y = perct_changes_cat["target_var"]
y.shape

Unnamed: 0,t-1_Cat1,t-1_Cat2,t-1_Cat3,t-1_Cat4,t-1_Cat5,t-2_Cat1,t-2_Cat2,t-2_Cat3,t-2_Cat4,t-2_Cat5,t-3_Cat1,t-3_Cat2,t-3_Cat3,t-3_Cat4,t-3_Cat5,t-4_Cat1,t-4_Cat2,t-4_Cat3,t-4_Cat4,t-4_Cat5
0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0
2,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0


(1769, 20)

(1769,)

In [74]:
# Train/Test splitting

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

In [75]:
# training a DescisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth = 8).fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

In [76]:
# creating a confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, dt_pred)
print("Accuracy score ", accuracy_score(y_test,dt_pred))
cm

Accuracy score  0.243792325056


array([[25, 19, 13, 10, 13],
       [19, 20, 27, 11, 10],
       [19, 21, 26, 12,  9],
       [23, 16, 23, 13, 19],
       [22, 21, 21,  7, 24]])

In [77]:
# training a linear SVM classifier
from sklearn.svm import SVC
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predict = svm_model_linear.predict(X_test)
 
# model accuracy for X_test  
accuracy = svm_model_linear.score(X_test, y_test)
 
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predict)
print(accuracy)
print(cm)

0.214446952596
[[21 15 13  8 23]
 [26 24 12  9 16]
 [22 20 13 11 21]
 [21 23 17  9 24]
 [29 14 17  7 28]]


In [98]:
# Generic function for any stock
def stock_predictor(comp_symb,time_lag,target_var_binary=False):
    historical_stock_price = stock_prices_dict[comp_symb]
    
    # Extract data for last t years
    historical_stock_price["date"] = pd.to_datetime(historical_stock_price.index)
    stock_price_sub = historical_stock_price[historical_stock_price["date"] >= pd.to_datetime("2010-11-01")]
    
    # Get closing day prices
    closing_prices = stock_price_sub["Close"].values
    
    # Previous day prices as features
    lag_prices = []
    for i in range(time_lag,len(closing_prices)):
        lag_prices.append(closing_prices[(i-(time_lag)):i])
        
    # Lag prices as features
    features = pd.DataFrame(lag_prices)
    features.reset_index(drop=True,inplace=True)
    features.columns = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag+1))))
    
    # Create Training data
    train_df = pd.DataFrame({"target_var": closing_prices[(time_lag):]})
    train_df.reset_index(drop=True,inplace=True)
    train_df = pd.concat([features,train_df], axis=1)
    

    # Convert raw scores to percentages
    perct_changes = train_df.apply(lambda x: [100.0 * a1 / a2 - 100 for a1, a2 in zip(x[1:], x)], axis = 1)
    perct_changes = list(map(lambda x: list(x), perct_changes))
    perct_changes = pd.DataFrame(perct_changes)
    col_names = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag))))
    col_names.append("target_var")
    perct_changes.columns = col_names
    
    # Convert percent change to category
    # compute percentile change quantiles
    daily_perct_change = [100.0 * a1 / a2 - 100 for a1, a2 in zip(closing_prices[1:], closing_prices)]

    percentiles = list(map(lambda x:np.nanpercentile(np.array(daily_perct_change),q = x),
                           [20,40,60,80]))
    
    # Convert %change to category
    if (target_var_binary):
        perct_changes_cat = perct_changes.drop(labels=["target_var"],axis=1).apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
        perct_changes_cat["target_var"] = [1 if x > 0 else 0 for x in perct_changes["target_var"]]
    else:
        perct_changes_cat = perct_changes.apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
    
    # 1-hot-encoding of categorical predictors
    X = pd.get_dummies(perct_changes_cat.drop(labels=["target_var"],axis=1))
    y = perct_changes_cat["target_var"]

    # Train/Test splitting
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)
    
    # training a linear SVM classifier
    from sklearn.svm import SVC
    svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
    svm_predict = svm_model_linear.predict(X_test)

    # model accuracy for X_test  
    accuracy = svm_model_linear.score(X_test, y_test)

    # creating a confusion matrix
    cm = confusion_matrix(y_test, svm_predict)
    res = {"comp_symb":comp_symb, "accuracy": accuracy, "cm": cm,
          "percentiles": percentiles}
    return res



In [100]:
stock_predictor("WIPRO",time_lag=10,target_var_binary=True)

{'accuracy': 0.50793650793650791, 'cm': array([[ 73, 123],
        [ 94, 151]]), 'comp_symb': 'WIPRO', 'percentiles': [-1.0445217980848072,
  -0.23325565347734309,
  0.38151286983717236,
  1.1188100583758032]}

In [101]:
stock_predictor("3MINDIA",time_lag=10)

{'accuracy': 0.2471655328798186, 'cm': array([[18, 21, 16, 16, 23],
        [19, 25, 13, 10,  8],
        [11, 22, 21, 21, 10],
        [12, 23, 19, 21, 20],
        [21, 16, 14, 17, 24]]), 'comp_symb': '3MINDIA', 'percentiles': [-1.0711845122428083,
  -0.33731828170748468,
  0.22898870798161375,
  1.1171879668124494]}

In [56]:
# Run function for all NSE 50 Stocks
nse_50 = pd.read_csv("nse_50.csv")
nse_50 = nse_50["Symbol"].values
nse_50[0:5]

array(['ABB', 'ACC', 'ASHOKLEY', 'DMART', 'BAJAJFINSV'], dtype=object)

In [57]:
from tqdm import tqdm
res = []
for i in tqdm(range(0,len(nse_50))):
    try:
        res.append(stock_predictor(nse_50[i],time_lag=10))
    except Exception:
        pass


100%|██████████| 50/50 [01:36<00:00,  1.92s/it]


In [58]:
accuracy_out = pd.DataFrame({"comp": [x["comp_symb"] for x in res],
                             "accuracy":[x["accuracy"] for x in res]})
accuracy_out

Unnamed: 0,accuracy,comp
0,0.231293,ABB
1,0.21542,ACC
2,0.21542,ASHOKLEY
3,0.195652,DMART
4,0.229025,BAJAJFINSV
5,0.23356,BANKBARODA
6,0.213152,BEL
7,0.21542,BHEL
8,0.213152,BRITANNIA
9,0.213152,CADILAHC


In [102]:
from tqdm import tqdm
res = []
for i in tqdm(range(0,len(nse_50))):
    try:
        res.append(stock_predictor(nse_50[i],time_lag=10,target_var_binary=True))
    except Exception:
        pass

100%|██████████| 50/50 [01:29<00:00,  1.80s/it]


In [103]:
accuracy_out = pd.DataFrame({"comp": [x["comp_symb"] for x in res],
                             "accuracy":[x["accuracy"] for x in res]})
accuracy_out

Unnamed: 0,accuracy,comp
0,0.496599,ABB
1,0.462585,ACC
2,0.528345,ASHOKLEY
3,0.565217,DMART
4,0.496599,BAJAJFINSV
5,0.530612,BANKBARODA
6,0.517007,BEL
7,0.498866,BHEL
8,0.501134,BRITANNIA
9,0.469388,CADILAHC
