In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
import numpy as np
import pandas as pd
import pickle
from stockstats import StockDataFrame

In [4]:
# Load dictionary of stock prices
with open('../nse_50_stock_data.pickle', 'rb') as f:
    stock_prices_dict = pickle.load(f)

In [95]:
# Check all keys
[k for k,v in stock_prices_dict.items()]

['ABB',
 'ACC',
 'ASHOKLEY',
 'BAJAJFINSV',
 'BANKBARODA',
 'BEL',
 'BHEL',
 'BRITANNIA',
 'CADILAHC',
 'COLPAL',
 'CONCOR',
 'CUMMINSIND',
 'DABUR',
 'DLF',
 'DMART',
 'EMAMILTD',
 'GLAXO',
 'GLENMARK',
 'GODREJCP',
 'GSKCONS',
 'HAVELLS',
 'HINDZINC',
 'ICICIPRULI',
 'IDEA',
 'INDIGO',
 'JSWSTEEL',
 'LICHSGFIN',
 'MARICO',
 'MOTHERSUMI',
 'MRF',
 'NHPC',
 'NMDC',
 'OFSS',
 'OIL',
 'PEL',
 'PETRONET',
 'PFC',
 'PGHH',
 'PIDILITIND',
 'PNB',
 'RECLTD',
 'SAIL',
 'SHREECEM',
 'SIEMENS',
 'SRTRANSFIN',
 'SUNTV',
 'TATAPOWER',
 'TITAN',
 'TORNTPHARM']

In [139]:
# Extracting stock prices for Wipro
historical_stock_price = stock_prices_dict["ABB"]
historical_stock_price.head()
historical_stock_price.shape


Unnamed: 0_level_0,open,high,low,last,close,total trade quantity,turnover (lacs),rsv_9,kdjk_9,kdjk,...,mdm_14_ema,mdm_14,mdi_14,mdi,dx_14,dx,dx_6_ema,adx,adx_6_ema,adxr
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-03-20,455.0,474.0,453.05,,,,,0.0,33.333333,33.333333,...,0.0,0.0,,,,,,,,
1998-03-23,474.9,504.0,473.9,504.0,504.0,61350.0,301.84,100.0,55.555556,55.555556,...,0.0,0.0,,,,,,,,
1998-03-24,525.0,554.4,510.0,554.4,554.4,151300.0,818.72,100.0,70.37037,70.37037,...,0.0,0.0,0.0,0.0,100.0,100.0,100.0,100.0,100.0,100.0
1998-03-25,554.0,569.0,551.0,558.0,558.95,152000.0,849.79,91.332471,77.357737,77.357737,...,0.0,0.0,0.0,0.0,100.0,100.0,100.0,100.0,100.0,100.0
1998-03-26,559.95,568.0,540.0,542.65,543.0,174900.0,961.71,77.576542,77.430672,77.430672,...,2.869883,2.869883,9.083801,9.083801,72.852317,72.852317,87.795996,87.795996,94.513796,94.513796


(4930, 52)

In [140]:
# Add techniical indiactors

historical_stock_price = StockDataFrame.retype(historical_stock_price)
tech_indicators = ["kdjk","macd","rsi_6","rsi_12",
                   "wr_10","wr_6","cci","adx","mdi"]
tech_indicators_df = list(map(lambda x: historical_stock_price[x], tech_indicators))
tech_indicators_df = pd.DataFrame(tech_indicators_df).transpose()

stock_df = pd.concat([historical_stock_price[["close"]],tech_indicators_df],axis=1)
stock_df.head(3)

# temp = pd.concat([historical_stock_price,tech_indicators_df],axis=1)
# temp.to_clipboard()

Unnamed: 0_level_0,close,kdjk,macd,rsi_6,rsi_12,wr_10,wr_6,cci,adx,mdi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1998-03-20,,33.333333,,,,,,,,
1998-03-23,504.0,55.555556,0.0,,,0.0,0.0,,,
1998-03-24,554.4,70.37037,1.130769,100.0,100.0,0.0,0.0,,100.0,0.0


In [141]:
# Extract data for last 1 year
stock_df["date"] = pd.to_datetime(stock_df.index)

In [142]:
stock_price_sub = stock_df[stock_df["date"] >= pd.to_datetime("2010-11-01")]

In [143]:
stock_price_sub.head(10)
stock_price_sub.shape

Unnamed: 0_level_0,close,kdjk,macd,rsi_6,rsi_12,wr_10,wr_6,cci,adx,mdi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2010-11-01,842.65,22.166395,-4.48073,29.671261,36.050354,72.723475,70.291174,-187.760622,51.00707,40.864892
2010-11-02,864.1,29.843038,-5.440132,43.450163,43.833379,54.803676,48.500714,-86.620515,44.25729,34.652133
2010-11-03,860.25,33.888675,-6.436927,41.690873,42.813123,58.02005,47.105263,-62.715654,39.436017,30.537273
2010-11-04,862.35,38.470521,-6.977015,43.196309,43.594345,56.265664,33.312102,-54.974301,35.992252,27.433323
2010-11-05,861.8,42.083632,-7.364526,42.848623,43.424853,52.866242,26.0,-43.128914,32.344375,24.556013
2010-11-08,869.3,48.862772,-6.985914,49.500005,46.518081,43.55069,16.997617,-35.170178,31.712416,24.926326
2010-11-09,863.4,55.250341,-7.080325,44.600046,44.433237,43.789474,41.5,-35.191082,31.261016,22.270391
2010-11-10,875.45,68.000227,-6.112352,55.416355,49.47816,16.624204,9.739369,-23.661811,34.417482,25.051059
2010-11-11,857.5,61.847059,-6.716221,41.07951,43.116775,46.448703,73.374889,3.561756,26.781957,20.784625
2010-11-12,851.8,47.932111,-7.567501,37.393137,41.278296,57.440294,83.526269,-68.228804,24.476937,22.91076


(1774, 10)

In [144]:
# Get closing day prices
closing_prices = stock_price_sub["close"].values
closing_prices[0:10]

array([ 842.65,  864.1 ,  860.25,  862.35,  861.8 ,  869.3 ,  863.4 ,
        875.45,  857.5 ,  851.8 ])

In [145]:
# Previous day prices as features
# consider last 6 day prices

time_lag = 5
lag_prices = []
for i in range(time_lag,len(closing_prices)):
    lag_prices.append(closing_prices[(i-(time_lag)):i])


In [146]:
lag_prices[0]
lag_prices[1]

array([ 842.65,  864.1 ,  860.25,  862.35,  861.8 ])

array([ 864.1 ,  860.25,  862.35,  861.8 ,  869.3 ])

In [147]:
# Lag prices as features
features = pd.DataFrame(lag_prices)
features.reset_index(drop=True,inplace=True)
features.columns = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag+1))))
features.head(3)

Unnamed: 0,t-1,t-2,t-3,t-4,t-5
0,842.65,864.1,860.25,862.35,861.8
1,864.1,860.25,862.35,861.8,869.3
2,860.25,862.35,861.8,869.3,863.4


In [148]:
# Create Training data
train_df = pd.DataFrame({"target_var": closing_prices[(time_lag):]})
train_df.reset_index(drop=True,inplace=True)

train_df = pd.concat([features,train_df], axis=1)

train_df.head(12)

Unnamed: 0,t-1,t-2,t-3,t-4,t-5,target_var
0,842.65,864.1,860.25,862.35,861.8,869.3
1,864.1,860.25,862.35,861.8,869.3,863.4
2,860.25,862.35,861.8,869.3,863.4,875.45
3,862.35,861.8,869.3,863.4,875.45,857.5
4,861.8,869.3,863.4,875.45,857.5,851.8
5,869.3,863.4,875.45,857.5,851.8,857.05
6,863.4,875.45,857.5,851.8,857.05,839.4
7,875.45,857.5,851.8,857.05,839.4,853.6
8,857.5,851.8,857.05,839.4,853.6,835.15
9,851.8,857.05,839.4,853.6,835.15,851.55


In [149]:
# Convert raw scores to percentages
perct_changes = train_df.apply(lambda x: [100.0 * a1 / a2 - 100 for a1, a2 in zip(x[1:], x)], axis = 1)
perct_changes = list(map(lambda x: list(x), perct_changes))
perct_changes = pd.DataFrame(perct_changes)
col_names = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag))))
col_names.append("target_var")
perct_changes.columns = col_names

perct_changes.head(3)

Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,2.545541,-0.44555,0.244115,-0.063779,0.870272
1,-0.44555,0.244115,-0.063779,0.870272,-0.678707
2,0.244115,-0.063779,0.870272,-0.678707,1.395645


In [150]:
# Convert percent change to category
# compute percentile change quantiles
daily_perct_change = [100.0 * a1 / a2 - 100 for a1, a2 in zip(closing_prices[1:], closing_prices)]

percentiles = list(map(lambda x:np.nanpercentile(np.array(daily_perct_change),q = x),
                       [20,40,60,80]))

percentiles

[-1.2716365865539014,
 -0.48194066047091716,
 0.21766274587002105,
 1.2648706677379182]

In [7]:
# Function to convert %change to category
def change_perct_change_to_cat(raw_perct_change,percentiles):
    category = np.where(raw_perct_change < percentiles[0], "Cat1",
                        np.where(raw_perct_change < percentiles[1],"Cat2",
                                 np.where(raw_perct_change < percentiles[2], "Cat3",
                                          np.where(raw_perct_change < percentiles[3], "Cat4","Cat5"))))
    return category


In [152]:
# Check function
change_perct_change_to_cat(1,percentiles)

array('Cat4',
      dtype='<U4')

In [153]:
# Apply function on all columns
perct_changes_cat = perct_changes.apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
perct_changes_cat.head(3)
perct_changes.head(3)

Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,Cat5,Cat3,Cat4,Cat3,Cat4
1,Cat3,Cat4,Cat3,Cat4,Cat2
2,Cat4,Cat3,Cat4,Cat2,Cat5


Unnamed: 0,t-1,t-2,t-3,t-4,target_var
0,2.545541,-0.44555,0.244115,-0.063779,0.870272
1,-0.44555,0.244115,-0.063779,0.870272,-0.678707
2,0.244115,-0.063779,0.870272,-0.678707,1.395645


In [154]:
# 1-hot-encoding of categorical predictors

X = pd.get_dummies(perct_changes_cat.drop(labels=["target_var"],axis=1))
X.head(3)
X.shape
y = perct_changes_cat["target_var"]
y.shape

Unnamed: 0,t-1_Cat1,t-1_Cat2,t-1_Cat3,t-1_Cat4,t-1_Cat5,t-2_Cat1,t-2_Cat2,t-2_Cat3,t-2_Cat4,t-2_Cat5,t-3_Cat1,t-3_Cat2,t-3_Cat3,t-3_Cat4,t-3_Cat5,t-4_Cat1,t-4_Cat2,t-4_Cat3,t-4_Cat4,t-4_Cat5
0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0
1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


(1769, 20)

(1769,)

In [155]:
# Append technical indiactors as features
stock_price_sub.reset_index(drop=True,inplace=True)
tech_features = stock_price_sub.iloc[(time_lag - 1):len(stock_price_sub) - 1,1:]
tech_features.shape

tech_features.reset_index(drop=True, inplace=True)
X = pd.concat([X,tech_features],axis=1)
X.head(3)

(1769, 9)

Unnamed: 0,t-1_Cat1,t-1_Cat2,t-1_Cat3,t-1_Cat4,t-1_Cat5,t-2_Cat1,t-2_Cat2,t-2_Cat3,t-2_Cat4,t-2_Cat5,...,t-4_Cat5,kdjk,macd,rsi_6,rsi_12,wr_10,wr_6,cci,adx,mdi
0,0,0,0,0,1,0,0,1,0,0,...,0,42.083632,-7.364526,42.848623,43.424853,52.866242,26.0,-43.128914,32.344375,24.556013
1,0,0,1,0,0,0,0,0,1,0,...,0,48.862772,-6.985914,49.500005,46.518081,43.55069,16.997617,-35.170178,31.712416,24.926326
2,0,0,0,1,0,0,0,1,0,0,...,0,55.250341,-7.080325,44.600046,44.433237,43.789474,41.5,-35.191082,31.261016,22.270391


In [156]:
# Train/Test splitting

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)

In [157]:
# training a DescisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(max_depth = 8).fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)

In [158]:
# creating a confusion matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, dt_pred)
print("Accuracy score ", accuracy_score(y_test,dt_pred))
cm

Accuracy score  0.223476297968


array([[21,  9, 26, 20, 12],
       [18, 14, 37, 19, 11],
       [13,  4, 26, 19, 11],
       [13, 11, 34, 21, 10],
       [10, 10, 39, 18, 17]], dtype=int64)

In [159]:
# training a linear SVM classifier
from sklearn.svm import SVC
svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
svm_predict = svm_model_linear.predict(X_test)
 
# model accuracy for X_test  
accuracy = svm_model_linear.score(X_test, y_test)
 
# creating a confusion matrix
cm = confusion_matrix(y_test, svm_predict)
print(accuracy)
print(cm)

0.189616252822
[[23 19 21 13 12]
 [23  9 39 14 14]
 [24 12 17  8 12]
 [19 15 25 16 14]
 [25 14 25 11 19]]


In [162]:
# Train RF
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=180,min_samples_leaf=20,random_state=123,max_depth=14)
train_model_ngram= rf_model.fit(X_train,y_train)
var_importance = rf_model.feature_importances_
var_imp_df = pd.DataFrame({"Var": X_train.columns,"Imp": var_importance})
var_imp_df = var_imp_df.sort_values(by="Imp", ascending=False)
var_imp_df

Unnamed: 0,Imp,Var
22,0.097208,rsi_6
28,0.094554,mdi
25,0.092425,wr_6
24,0.092259,wr_10
21,0.090994,macd
23,0.088496,rsi_12
27,0.086432,adx
26,0.084088,cci
20,0.079241,kdjk
19,0.024856,t-4_Cat5


In [164]:
from sklearn.model_selection import cross_val_score
import numpy as np
scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validated AUC is: ",np.mean(scores))
print("AUC for each fold:", scores)

Cross-Validated AUC is:  0.254148127609
AUC for each fold: [ 0.27443609  0.23308271  0.24528302  0.26415094  0.25378788]


In [169]:
rf_predict = rf_model.predict(X_test)
 
# model accuracy for X_test  
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, rf_predict)
 
# creating a confusion matrix
cm = confusion_matrix(y_test, rf_predict)
print(accuracy)
print(cm)

0.198645598194
[[20  9 27 14 18]
 [15  8 42 12 22]
 [13 11 22 15 12]
 [17  7 35 13 17]
 [19 11 26 13 25]]


In [10]:
# Generic function for any stock
def stock_predictor(comp_symb,time_lag,target_var_binary=False):
    from sklearn.metrics import accuracy_score, confusion_matrix
    historical_stock_price = stock_prices_dict[comp_symb]
    
    # Extract data for last t years
    historical_stock_price["date"] = pd.to_datetime(historical_stock_price.index)
    
    # Add techniical indiactors
    historical_stock_price = StockDataFrame.retype(historical_stock_price)
    tech_indicators = ["kdjk","macd","rsi_6","rsi_12",
                       "wr_10","wr_6","cci","adx","mdi"]
    tech_indicators_df = list(map(lambda x: historical_stock_price[x], tech_indicators))
    tech_indicators_df = pd.DataFrame(tech_indicators_df).transpose()

    stock_df = pd.concat([historical_stock_price[["close"]],tech_indicators_df],axis=1)
    stock_df["date"] = pd.to_datetime(stock_df.index)
    stock_price_sub = stock_df[stock_df["date"] >= pd.to_datetime("2010-11-01")]
    
    # Get closing day prices
    closing_prices = stock_price_sub["close"].values
    
    # Previous day prices as features
    lag_prices = []
    for i in range(time_lag,len(closing_prices)):
        lag_prices.append(closing_prices[(i-(time_lag)):i])
        
    # Lag prices as features
    features = pd.DataFrame(lag_prices)
    features.reset_index(drop=True,inplace=True)
    features.columns = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag+1))))
    
    # Create Training data
    train_df = pd.DataFrame({"target_var": closing_prices[(time_lag):]})
    train_df.reset_index(drop=True,inplace=True)
    train_df = pd.concat([features,train_df], axis=1)
    

    # Convert raw scores to percentages
    perct_changes = train_df.apply(lambda x: [100.0 * a1 / a2 - 100 for a1, a2 in zip(x[1:], x)], axis = 1)
    perct_changes = list(map(lambda x: list(x), perct_changes))
    perct_changes = pd.DataFrame(perct_changes)
    col_names = list(map(lambda x:'t-'+ str(x),list(range(1,time_lag))))
    col_names.append("target_var")
    perct_changes.columns = col_names
    
    # Convert percent change to category
    # compute percentile change quantiles
    daily_perct_change = [100.0 * a1 / a2 - 100 for a1, a2 in zip(closing_prices[1:], closing_prices)]

    percentiles = list(map(lambda x:np.nanpercentile(np.array(daily_perct_change),q = x),
                           [20,40,60,80]))
    
    # Convert %change to category
    if (target_var_binary):
        perct_changes_cat = perct_changes.drop(labels=["target_var"],axis=1).apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
        perct_changes_cat["target_var"] = [1 if x > 0 else 0 for x in perct_changes["target_var"]]
    else:
        perct_changes_cat = perct_changes.apply(lambda x: change_perct_change_to_cat(x,percentiles), axis=1)
    
    # 1-hot-encoding of categorical predictors
    X = pd.get_dummies(perct_changes_cat.drop(labels=["target_var"],axis=1))
    y = perct_changes_cat["target_var"]

    # Append technical indicators as features
    stock_price_sub.reset_index(drop=True,inplace=True)
    tech_features = stock_price_sub.iloc[(time_lag-1):len(stock_price_sub)-1,1:]
    tech_features.shape

    tech_features.reset_index(drop=True, inplace=True)
    X = pd.concat([X,tech_features],axis=1)
    
    # Train/Test splitting
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 123)
    
 
    # model accuracy for X_test 
    from sklearn.ensemble import RandomForestClassifier
    rf_model = RandomForestClassifier(n_estimators=180,min_samples_leaf=20,random_state=123,max_depth=14)
    rf_model.fit(X_train,y_train)
    
    rf_predict = rf_model.predict(X_test)
    from sklearn.metrics import accuracy_score
    accuracy = accuracy_score(y_test, rf_predict)

    # creating a confusion matrix
    cm = confusion_matrix(y_test, rf_predict)
    # training a linear SVM classifier
#     from sklearn.svm import SVC
#     svm_model_linear = SVC(kernel = 'linear', C = 1).fit(X_train, y_train)
#     svm_predict = svm_model_linear.predict(X_test)

#     # model accuracy for X_test  
#     accuracy = svm_model_linear.score(X_test, y_test)

    # creating a confusion matrix
    res = {"comp_symb":comp_symb, "accuracy": accuracy, "cm": cm,
          "percentiles": percentiles}
    return res



In [12]:
stock_predictor("ACC",time_lag=5,target_var_binary=False)

{'accuracy': 0.21218961625282168, 'cm': array([[16, 22, 20, 23, 13],
        [11, 25, 18, 28,  5],
        [13, 19, 15, 27, 17],
        [18,  9, 14, 26, 11],
        [20, 22, 24, 15, 12]], dtype=int64), 'comp_symb': 'ACC', 'percentiles': [-1.121880868668603,
  -0.30916110247614198,
  0.28351324638237557,
  1.2307485002717848]}

In [14]:
# Run function for all NSE 50 Stocks
nse_50 = pd.read_csv("nse_50.csv")
nse_50 = nse_50["Symbol"].values
nse_50[0:5]

array(['ABB', 'ACC', 'ASHOKLEY', 'DMART', 'BAJAJFINSV'], dtype=object)

In [176]:
from tqdm import tqdm
res = []
for i in tqdm(range(0,len(nse_50))):
    try:
        res.append(stock_predictor(nse_50[i],time_lag=10))
    except Exception:
        pass


100%|██████████| 50/50 [03:26<00:00,  4.14s/it]


In [177]:
accuracy_out = pd.DataFrame({"comp": [x["comp_symb"] for x in res],
                             "accuracy":[x["accuracy"] for x in res]})
accuracy_out

Unnamed: 0,accuracy,comp
0,0.235828,ABB
1,0.219955,ACC
2,0.179138,ASHOKLEY
3,0.217391,DMART
4,0.238095,BAJAJFINSV
5,0.219955,BANKBARODA
6,0.226757,BEL
7,0.22449,BHEL
8,0.22449,BRITANNIA
9,0.21542,CADILAHC


In [15]:
from tqdm import tqdm
res = []
for i in tqdm(range(0,len(nse_50))):
    try:
        res.append(stock_predictor(nse_50[i],time_lag=10,target_var_binary=True))
    except Exception:
        pass

100%|██████████| 50/50 [05:04<00:00,  6.08s/it]


In [16]:
accuracy_out = pd.DataFrame({"comp": [x["comp_symb"] for x in res],
                             "accuracy":[x["accuracy"] for x in res]})
accuracy_out

Unnamed: 0,accuracy,comp
0,0.54195,ABB
1,0.519274,ACC
2,0.530612,ASHOKLEY
3,0.478261,DMART
4,0.485261,BAJAJFINSV
5,0.482993,BANKBARODA
6,0.514739,BEL
7,0.52381,BHEL
8,0.535147,BRITANNIA
9,0.489796,CADILAHC
