# Project For QI

In [None]:
#Importing libraries and classes
import pandas as pd
import numpy as np
import yfinance as yf
from sklearn.model_selection import train_test_split
import talib as ta

In [None]:
# Date range and ticker used for the Yahoo Finance download below
start_date = '2018-1-1' # YYYY-MM-DD
# NOTE(review): yfinance documents `end` as exclusive - confirm whether 2020-12-31 itself should be included
end_date = '2020-12-31'
ticker = 'HDFCBANK.NS'

In [None]:
# Download the price history for the configured ticker and date range from Yahoo Finance.
# auto_adjust=False is passed explicitly because later cells read the 'Adj Close'
# column, which is only present when prices are not auto-adjusted (newer yfinance
# releases default to auto_adjust=True).
df = yf.download(ticker, start=start_date, end=end_date, auto_adjust=False)

# Newer yfinance releases can return two-level (field, ticker) columns even for a
# single ticker; keep only the field level so df['Adj Close'] is a plain Series.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

In [None]:
# Preview the first rows of the downloaded data to confirm the download looks sane
df.head()

In [None]:
# Summary statistics per column (rounded to 2 decimals), used to eyeball outliers
df.describe().round(2)

In [None]:
# Column dtypes and non-null counts, used here to check for missing values
df.info()

In [None]:
# Plot the adjusted close price over the downloaded date range
# NOTE(review): a label is passed but this cell never draws a legend, so the label is likely not displayed - confirm
df['Adj Close'].plot(label='Closing Price')


In [None]:
# Work on a copy so that the indicator columns added below do not modify the raw download in df
features = df.copy()

In [None]:
# Preview the first rows of the copy
features.head()

In [None]:
#Creating functions for all the indicators

def SMA(data, period=21, column='Adj Close'):
    """
    Return the simple moving average of one column.

    Input :
    first arg = dataframe
    second arg = number of rows in the rolling window
    third arg = column name on which the average is calculated

    Output :
    Series of rolling means; the first ``period - 1`` entries are NaN.
    The dataframe itself is not modified.
    """
    rolling_window = data[column].rolling(window=period)
    return rolling_window.mean()


def Pct_change(data, column='Adj Close'):
    """
    Calculate the period-over-period percentage change of one column.

    Input :
    first arg = dataframe (modified in place)
    second arg = column name on which the change is calculated

    Output :
    Returns the percentage change (in percent, i.e. multiplied by 100) and
    also stores it in a 'pct_change' column of the dataframe.
    """
    # Compute once and reuse for both the stored column and the return value.
    returns = data[column].pct_change() * 100
    data['pct_change'] = returns
    return returns


# Instead of a standard technical indicator, the 14-period rolling standard deviation of returns is used as the volatility indicator
def Volatility(data, period=14, column='pct_change'):
    """
    Calculate the rolling standard deviation of one column.

    Input :
    first arg = dataframe (modified in place)
    second arg = number of rows in the rolling window
    third arg = column name; the default 'pct_change' column must already
                exist in the dataframe (it is created by Pct_change)

    Output :
    Returns the rolling standard deviation and also stores it in a
    'Volatility' column of the dataframe.
    """
    # Compute once and reuse for both the stored column and the return value.
    rolling_std = data[column].rolling(window=period).std()
    data['Volatility'] = rolling_std
    return rolling_std
    

def Volume_SMA(data, period=10, column='Volume'):
    """
    Calculate the simple moving average of the volume column.

    Input :
    first arg = dataframe (modified in place)
    second arg = number of rows in the rolling window
    third arg = column name on which the average is calculated

    Output :
    Returns the rolling mean rounded to 2 decimals and also stores it in a
    'Volume_SMA' column of the dataframe.
    """
    # Compute once and reuse for both the stored column and the return value.
    volume_average = data[column].rolling(window=period).mean().round(2)
    data['Volume_SMA'] = volume_average
    return volume_average


def RSI(data, period=14, column='Adj Close'):
    """
    This function calculates the RSI indicator from simple moving averages
    of the gains and losses of one column.

       Input :
       first arg = dataframe (modified in place)
       second arg = RSI period (rows in the rolling window)
       third arg = column name on which RSI is to be calculated

       Output :
       Returns the same dataframe with three columns added:
       'rs_up' (gains, losses set to 0), 'rs_down' (losses, gains set to 0)
       and 'RSI'.

       Notes :
       A window with gains but no losses gives RSI = 100; a window with
       neither gains nor losses gives NaN (0 / 0).
    """
    # Use the dataframe and column that were passed in, not a module-level global.
    delta = data[column].diff(1)

    # Split the differences into gains and losses; the first row stays NaN.
    data['rs_up'] = delta.clip(lower=0)
    data['rs_down'] = delta.clip(upper=0)

    average_gain = data['rs_up'].rolling(window=period).mean()
    average_loss = data['rs_down'].rolling(window=period).mean().abs()

    relative_strength = average_gain / average_loss
    data['RSI'] = 100.0 - (100.0 / (1.0 + relative_strength))
    return data

In [None]:
# Calculate all the indicators; each helper adds its column(s) to `features` in place
RSI(features)
Pct_change(features)
# Volatility reads the 'pct_change' column by default, so it has to run after Pct_change
Volatility(features)
Volume_SMA(features)
# The 14-period rate-of-change (ROC) indicator comes from talib instead of a hand-written function
features['ROC'] = ta.ROC(features['Adj Close'], timeperiod = 14)

In [None]:
# Display the dataframe with the indicator columns added
features

In [None]:
# Create the dependent variable: 1 if the next day's return is positive, otherwise -1
next_day_return = features['pct_change'].shift(-1)
features['Dependent_Variable'] = np.where(next_day_return > 0, 1, -1)
# Rows without a known next-day return (the final row) would otherwise be labelled -1,
# because NaN > 0 is False; drop them so the model is not trained on a made-up label.
features = features.loc[next_day_return.notna()].copy()

In [None]:
# Inspect the first 10 rows, including the new Dependent_Variable column
features.head(10)

In [None]:
# Remove every row that has a NaN in any column (e.g. the leading rows where the rolling indicators are not yet defined)
features = features.dropna() 

In [None]:
# Preview the first rows after the NaN rows were removed
features.head()

In [None]:
# Create a new dataframe containing only the four indicators and, as the last column, the dependent variable
features_new = features.filter(['ROC','RSI', 'Volatility', 'Volume_SMA', 'Dependent_Variable'])

In [None]:
# Display the modelling dataframe
features_new

In [None]:
# Split the dataframe into NumPy arrays: every column except the last is a feature, the last column is the label
X = features_new.iloc[:, :-1].values # Independent variables
y = features_new.iloc[:, -1].values   # Dependent variable

In [None]:
# Train/test split: the first 80% of rows are used for training, the last 20% for testing.
# shuffle=False keeps the rows in their existing order (assumed chronological, as downloaded).
# A shuffled split would put neighbouring days - whose rolling-window indicators overlap -
# on both sides of the split and leak future information into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

In [None]:
# Feature scaling: standardise the features using statistics learned from the training set only
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X_train)
# Apply the training-set mean and standard deviation to both splits
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Reason for Selecting XGBoost

Five different models were trained on the same data, and their performance was compared using the weighted-average F1 score. They are listed below in descending order of performance. XGBoost outperformed all the other models.

XGBoost - 0.52, KNN - 0.51, Random Forest - 0.50, Decision Tree - 0.47, Naive Bayes - 0.41

In [None]:
#Training
from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

In [None]:
#Predicting the Test set results
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

In [None]:
# Confusion matrix and accuracy of the test-set predictions
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
# NOTE(review): the bare `sklearn` name is not used anywhere in this cell - confirm whether this import is still needed
import sklearn
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Last expression of the cell, so the accuracy is shown as the cell output
accuracy_score(y_test, y_pred)

In [None]:
# F1 score with f1_score's defaults, i.e. for the positive class (label 1) only;
# the weighted-average F1 is part of the classification report printed below
f1_score(y_test, y_pred)

In [None]:
# Print the per-class precision, recall and F1 score together with the overall accuracy and averages
from sklearn.metrics import classification_report
print (classification_report(y_test, y_pred))

# How can we improve the performance?

1) Feature Engineering
To boost the model's performance, some proprietary indicators can be created based on research.

2) Feature Selection
Including more indicators from different classes might help improve the performance of the model. For example, including data from the derivatives market, such as open interest (OI), can help the model capture market sentiment. We can also include data from social media, macroeconomic indicators, bond yields, etc.

3) Optimizing Indicator Parameters
Short-term and long-term trends can be captured by using faster and slower parameters, respectively, for the indicators.

4) Tuning Hyperparameters of the Model
Many parameters of a model are not learned during training. Tuning these hyperparameters might therefore give a performance boost.

In [None]:
# Naive Bayes (Gaussian) baseline
from sklearn.naive_bayes import GaussianNB

# Train on the scaled training split
classifier = GaussianNB().fit(X_train, y_train)

# Predict on the test split and report precision / recall / F1
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Decision tree baseline (entropy split criterion, fixed seed)
from sklearn.tree import DecisionTreeClassifier

# Train on the scaled training split
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0).fit(X_train, y_train)

# Predict on the test split and report precision / recall / F1
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Random forest baseline (10 trees, entropy split criterion, fixed seed)
from sklearn.ensemble import RandomForestClassifier

# Train on the scaled training split
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)

# Predict on the test split and report precision / recall / F1
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# K-nearest-neighbours baseline (5 neighbours, Minkowski metric with p=2)
from sklearn.neighbors import KNeighborsClassifier

# Train on the scaled training split
classifier = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
).fit(X_train, y_train)

# Predict on the test split and report precision / recall / F1
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
#XGBoost
#Training
from xgboost import XGBClassifier
classifier = XGBClassifier()
# Recent XGBoost releases require class labels 0..n_classes-1, so the model is trained
# on 0/1 (0 = next day not up, 1 = next day up) instead of the -1/1 labels in y_train.
classifier.fit(X_train, (y_train > 0).astype(int))
#Prediction
# Map the 0/1 predictions back to -1/1 so the report compares like with like
y_pred = np.where(classifier.predict(X_test) == 1, 1, -1)
print (classification_report(y_test, y_pred))