In [1]:
import talib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score as score
from sklearn.model_selection import train_test_split 

In [2]:
df = pd.read_csv('dataupdated.csv')
df.head()
df.dtypes

Date         object
Symbol       object
LTP          object
% Change    float64
High         object
Low          object
Open         object
Qty.         object
Turnover     object
Status        int64
dtype: object

In [3]:
df.head()

Unnamed: 0,Date,Symbol,LTP,% Change,High,Low,Open,Qty.,Turnover,Status
0,1/2/2011,ADBL,136,-4.9,145,134,143,10768,1516505.0,1
1,1/3/2011,ADBL,137,0.74,140,135,136,4292,588806.0,-1
2,1/4/2011,ADBL,139,1.46,139,133,137,5048,696161.0,1
3,1/5/2011,ADBL,132,-5.04,140,132,139,5588,763980.0,1
4,1/6/2011,ADBL,140,6.06,140,132,132,590,80520.0,-1


In [4]:
df.drop(columns=['Symbol', 'LTP', '% Change','Qty.','Turnover'],axis=1, inplace=True)

In [5]:
# Features for Time Series
df['DateTime'] = pd.to_datetime(df.Date)
df['year'] = df['DateTime'].dt.year
df['month'] = df['DateTime'].dt.month
df['Open'] = df.Open.astype(str)
df['High'] = df.Open.astype(str)
df['Low'] = df.Open.astype(str)
df['Open'] = df['Open'].str.replace(',', '').astype(float)
df['High'] = df['High'].str.replace(',', '').astype(float)
df['Low'] = df['Low'].str.replace(',', '').astype(float)
df['day'] = df['DateTime'].dt.day
df['30d_avg'] = df['Open'].rolling(window=30, center=False).mean()
df['Std'] = df['Open'].rolling(window=30, center=False).std()

In [6]:
# Features for Time Series
df['RSI'] = talib.RSI(df['Open'].values, timeperiod = 14) #relative strength index
df['Williams %R'] = talib.WILLR(df['High'].values, df['Low'].values, df['Open'].values, 7) 
df.tail()

Unnamed: 0,Date,High,Low,Open,Status,DateTime,year,month,day,30d_avg,Std,RSI,Williams %R
1579,12/24/2017,420.0,420.0,420.0,1,2017-12-24,2017,12,24,427.166667,7.358239,44.612225,-81.25
1580,12/26/2017,421.0,421.0,421.0,1,2017-12-26,2017,12,26,427.333333,7.150275,45.366171,-69.230769
1581,12/27/2017,413.0,413.0,413.0,-1,2017-12-27,2017,12,27,427.266667,7.277425,40.604345,-100.0
1582,12/28/2017,408.0,408.0,408.0,-1,2017-12-28,2017,12,28,426.666667,8.082904,37.924983,-100.0
1583,12/31/2017,409.0,409.0,409.0,1,2017-12-31,2017,12,31,425.966667,8.672125,38.794867,-92.307692


In [7]:
X = df.iloc[31:,6:]
Y = df.iloc[31:,4]

In [8]:
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.05, random_state=2)

In [9]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)
y_test = np.array(y_test)

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [245]:
from sklearn.ensemble import RandomForestClassifier as rfc
rf = rfc(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=18, verbose=0,
            warm_start=False)
rf.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=18, verbose=0, warm_start=False)

In [246]:
rfc_prediction = rf.predict(x_test)
rfc_prediction

array([ 1,  1,  1, -1, -1, -1, -1,  0,  1,  1, -1, -1, -1,  1, -1,  0,  0,
       -1,  0, -1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1,  1,  1, -1,
       -1, -1, -1, -1, -1, -1,  1,  1, -1,  1,  1, -1,  1, -1,  1,  1,  1,
       -1,  0,  1,  1, -1,  1, -1, -1,  1,  1, -1,  1,  1, -1, -1, -1,  1,
        1, -1,  0, -1, -1, -1,  1,  1,  1, -1])

In [249]:
rfc_accuracy = score(rfc_prediction, y_test) * 100
rfc_accuracy

58.97435897435898

Average calculation of Random Forest Classifier with different n_estimators value which is number of decision trees used.

In [255]:
n_estimators = []
for i in range(60,160,10):
    n_estimators.append(i)
n_estimators    

[60, 70, 80, 90, 100, 110, 120, 130, 140, 150]

In [257]:
rfc_accuracies = []
for i in n_estimators:
    rf = rfc(bootstrap=True, class_weight=None, criterion='gini',
                max_depth=None, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=i, n_jobs=None,
                oob_score=False, random_state=18, verbose=0,
                warm_start=False)
    rf.fit(x_train, y_train)
    rfc_prediction = rf.predict(x_test)
    rfc_accuracy = score(rfc_prediction, y_test) * 100
    rfc_accuracies.append(rfc_accuracy)

In [258]:
rfc_accuracies

[60.256410256410255,
 57.692307692307686,
 57.692307692307686,
 57.692307692307686,
 58.97435897435898,
 58.97435897435898,
 60.256410256410255,
 58.97435897435898,
 58.97435897435898,
 58.97435897435898]

In [260]:
average = pd.Series(rfc_accuracies).mean()
average

58.846153846153854

So, the Random Forest Classifier has average accuracy rate of around 59 percent.