In [74]:
from pandas.core.frame import DataFrame
import yfinance as yf
import pandas as pd
import numpy as np
import talib as ta
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, mean_squared_error, accuracy_score

In [75]:
data = pd.read_csv("output/TECHM.csv")
data = data.iloc[-1500:]
data.set_index('Date')
data.Date = pd.to_datetime(data.Date)
model = LinearRegression()

In [76]:
"""
	Parameters
	----------
	df -> Dataframe of stock's historical data
	indicator -> mathematical formula to make an indicator on
		Supported indicators:
			rsi -> Relative Strength Indicator
			macd -> Moving Average Convergence Divergence
			ema -> Exponential Moving Average
			volume -> It's not actually an indicator but we are using it to analyse
			price volume data by correlating price movements with volume.
	entry_type -> The view of underlying stock.
		long -> We take long entry for bullishness
		short -> We take short entry for bearishness
	"""
def get_score(df: DataFrame, indicator: str, entry_type='long'):
	indicator = indicator.upper()

	if indicator == 'RSI' and entry_type == 'long':
		"""
		General Strategy with RSI:
			If RSI Value is >= 70:
				It is considered that the stock is overpriced as
				the stock is overbought in the last few trading sessions.
				
				Increasing RSI is generally considered a negative sign,
				and the stock is expected to be bearish.
			If RSI Value is <= 30:
				It is considered that the stock is underpriced as
				the stock is oversold in the last few trading sessions.

				Decreasing RSI is generally considered as a positive sign,
				and the stock is expected to be bullish in upcoming trading sessions.

		Our strategy with RSI:
			If RSI Value in range of 60-70
				We consider this the best range of RSI as if the stock has
				RSI value of >=60 and <= 70. It is not overbought but it actually
				deserves to be a performing stock.

				We expect the stock to perform more in upcoming trading sessions.

			If RSI Value in range of 70-80
				We consider this the next best range of RSI. If the stock is in 
				ragne of this we can expect bullishness but now the upmove is
				considered to be weak as the move may have came to an end as 
				RSI is already this much high.
			
			If RSI Value in range of >= 80
				We consider this as a weakening score and a potential threat, 
				as the stock might be actually overperforming. 

				This is just like a typical overbought zone.
			
			If RSI Value in range 50-60
				We consider this as a moderate range, and the stock might
				start performing from here. So we return the score with value of 2.

			If any given condition is not matched we simply return the score value of 0.
		"""
		try:
			rsiValue = df.rsi.head(1).values[0]
			if rsiValue in range(60,70):
				return 5
			elif rsiValue in range(70,80):
				return 4
			elif rsiValue >= 80:
				return 3
			elif rsiValue in range(50,60):
				return 2
			else:
				return 0
		except IndexError:
			return 0
	"""
		General Strategy with MACD:
			In MACD there are two components which are calculated.
			MACD Signal and MACD.
			If value of MACD is greaten than MACD Signal, 
				it is considered a bullish crossover and the stock might
				start performing from here.
			If value of MACD is less than MACD Signal,
				it is considered a bearish crossover and the stock might
				start falling from here.

		Our Strategy with MACD:
			We go along with the general strategy but with some modifications.
			Instead of simply checking the condition of MACD > Macd Signal, 
			we check when the crossover happened.

			The nearer the crossover with today's date the more strength the
			buy signal has.
	"""
	if indicator == 'MACD' and entry_type == 'long':
		macd = df.macd_crossover
		try:
			date = macd.iloc[list(np.where(df["macd_crossover"] == 1)[0])].index.values[0]
			date = pd.to_datetime(date)
			dates = df.index.values
			for i in range(0,len(dates)):
				if pd.to_datetime(dates[i]).date() == date:
					return 5 - i
			return 0
		except IndexError:
			return 0
	if indicator == 'MACD_BUY' and entry_type == 'long':
		macd = df.macd_buy
		try:
			date = macd.iloc[list(np.where(df["macd_buy"] == 1)[0])].index.values[0]
			date = pd.to_datetime(date)
			dates = df.index.values
			for i in range(0,len(dates)):
				if pd.to_datetime(dates[i]).date() == date:
					return 5 - i
			return 0
		except IndexError:
			return 0
	if indicator == 'EMA_BUY' and entry_type == 'long':
		try:
			date = df.ema_buy.iloc[list(np.where(df["ema_buy"] == 1)[0])].index.values[0]
			date = pd.to_datetime(date)
			dates = df.index.values
			for i in range(0,len(dates)):
				if pd.to_datetime(dates[i]).date() == date:
					return 5 - i
			return 0
		except IndexError:
			return 0
	if indicator == 'EMA' and entry_type == 'long':
		try:
			date = df.ema_crossover.iloc[list(np.where(df["ema_crossover"] == 1)[0])].index.values[0]
			date = pd.to_datetime(date)
			dates = df.index.values
			for i in range(0,len(dates)):
				if pd.to_datetime(dates[i]).date() == date:
					return 5 - i
			return 0
		except IndexError:
			return 0
	if indicator == 'VOLUME' and entry_type == 'long':
		try:
			date = df.volume_buy.iloc[list(np.where(df["volume_buy"] == 1)[0])].index.values[0]
			date = pd.to_datetime(date)
			dates = df.index.values
			for i in range(0,len(dates)):
				if pd.to_datetime(dates[i]).date() == date:
					return 5 - i
			return 0
		except IndexError:
			return 0
	return None

In [77]:
data['5EMA'] = pd.Series.ewm(data['Close'], span=5).mean()
data['26EMA'] = pd.Series.ewm(data['Close'], span=26).mean()
data['rsi'] = ta.RSI(data['Close'].values, timeperiod=14)

data['macd'], data['macdSignal'], data['macdHist'] = ta.MACD(data.Close.values, fastperiod=12, slowperiod=26, signalperiod=9)

data['macd_crossover'] = np.where(((data.macd > data.macdSignal) & (data.macd.shift(1) < data.macdSignal.shift(1))), 1, 0)
data['macd_crossunder'] = np.where(((data.macd < data.macdSignal) & (data.macd.shift(1) > data.macdSignal.shift(1))), 1, 0)
data['ema_crossover'] = np.where(((data['5EMA'].shift(1) <= data['26EMA'].shift(1)) & (data['5EMA'] > data['26EMA'] )), 1, 0)
data['ema_crossunder'] = np.where(((data['5EMA'].shift(1) >= data['26EMA'].shift(1)) & (data['5EMA'] < data['26EMA'] )), 1, 0)
data['macd_buy'] = np.where((data.macd > data.macdSignal), 1, 0)
data['macd_sell'] = np.where((data.macd < data.macdSignal), 1, 0)
data['ema_buy'] = np.where((data['5EMA'] > data['26EMA']), 1, 0)
data['ema_sell'] = np.where((data['5EMA'] < data['26EMA']), 1, 0)
data['rsi_buy'] = np.where(data.rsi >= 60, 1, 0)
data['rsi_sell'] = np.where(data.rsi <= 40, 1, 0)

data['volume_buy'] = np.where((data.Volume > data.Volume.ewm(span=5).mean()) & (data.Close > data.Close.shift(1)), 1, 0)
data['volume_sell'] = np.where((data.Volume > data.Volume.ewm(span=5).mean()) & (data.Close < data.Close.shift(1)), 1, 0)
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,5EMA,26EMA,rsi,macd,...,ema_crossunder,macd_buy,macd_sell,ema_buy,ema_sell,rsi_buy,rsi_sell,volume_buy,volume_sell,scores
2284,2015-12-07,458.542258,467.524895,457.815113,465.942261,788606,465.942261,465.942261,,,...,0,0,0,0,0,0,0,0,0,8.640660
2285,2015-12-08,466.198901,467.097154,452.211620,454.008179,1061492,458.781812,459.745718,,,...,1,0,0,0,1,0,0,0,1,10.093773
2286,2015-12-09,454.264830,459.825510,449.217422,451.142273,1223560,455.163083,456.654584,,,...,0,0,0,0,1,0,0,0,1,11.729182
2287,2015-12-10,453.751522,462.391943,448.105262,460.894836,702931,457.543965,457.839973,,,...,0,0,0,0,1,0,0,0,0,11.152788
2288,2015-12-11,461.964182,465.300611,457.772264,459.354950,926348,458.239177,458.191303,,,...,0,0,0,1,0,0,0,0,1,10.768525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3779,2021-12-27,1724.500000,1792.000000,1703.650024,1785.000000,4512000,1712.952697,1624.712030,72.784959,43.660604,...,0,1,0,1,0,1,0,1,0,13.325695
3780,2021-12-28,1785.000000,1823.000000,1776.099976,1806.099976,3605483,1744.001790,1638.148174,74.302972,51.413282,...,0,1,0,1,0,1,0,1,0,13.883796
3781,2021-12-29,1800.500000,1821.000000,1782.750000,1786.849976,1902921,1758.284518,1649.163123,70.442549,55.365798,...,0,1,0,1,0,1,0,0,0,15.255864
3782,2021-12-30,1787.000000,1838.000000,1786.000000,1799.949951,4650594,1772.172996,1660.332517,71.526702,58.876562,...,0,1,0,1,0,1,0,1,0,14.837243


In [78]:
totalScoreL = [0,0,0,0,0]
for i in range(len(data.index.values)-5):
	df = data[i:i+5]
	rsiScore = get_score(df, indicator='rsi')
	macdScore = get_score(df, indicator='macd')
	emaScore = get_score(df, indicator='ema')
	volumeScore = get_score(df, indicator='volume')
	macdBuyScore = get_score(df, indicator='macd_buy', entry_type='long')
	emaBuyScore = get_score(df, indicator='ema_buy', entry_type='long')
	totalScore = rsiScore + macdScore + emaScore + volumeScore + macdBuyScore + emaBuyScore
	totalScoreL.append(totalScore)

In [79]:
len(data), len(totalScoreL)

(1500, 1500)

In [80]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,5EMA,26EMA,rsi,macd,...,ema_crossunder,macd_buy,macd_sell,ema_buy,ema_sell,rsi_buy,rsi_sell,volume_buy,volume_sell,scores
2284,2015-12-07,458.542258,467.524895,457.815113,465.942261,788606,465.942261,465.942261,,,...,0,0,0,0,0,0,0,0,0,8.640660
2285,2015-12-08,466.198901,467.097154,452.211620,454.008179,1061492,458.781812,459.745718,,,...,1,0,0,0,1,0,0,0,1,10.093773
2286,2015-12-09,454.264830,459.825510,449.217422,451.142273,1223560,455.163083,456.654584,,,...,0,0,0,0,1,0,0,0,1,11.729182
2287,2015-12-10,453.751522,462.391943,448.105262,460.894836,702931,457.543965,457.839973,,,...,0,0,0,0,1,0,0,0,0,11.152788
2288,2015-12-11,461.964182,465.300611,457.772264,459.354950,926348,458.239177,458.191303,,,...,0,0,0,1,0,0,0,0,1,10.768525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3779,2021-12-27,1724.500000,1792.000000,1703.650024,1785.000000,4512000,1712.952697,1624.712030,72.784959,43.660604,...,0,1,0,1,0,1,0,1,0,13.325695
3780,2021-12-28,1785.000000,1823.000000,1776.099976,1806.099976,3605483,1744.001790,1638.148174,74.302972,51.413282,...,0,1,0,1,0,1,0,1,0,13.883796
3781,2021-12-29,1800.500000,1821.000000,1782.750000,1786.849976,1902921,1758.284518,1649.163123,70.442549,55.365798,...,0,1,0,1,0,1,0,0,0,15.255864
3782,2021-12-30,1787.000000,1838.000000,1786.000000,1799.949951,4650594,1772.172996,1660.332517,71.526702,58.876562,...,0,1,0,1,0,1,0,1,0,14.837243


In [84]:
data['totalScore'] = totalScoreL
data['totalScoreEma'] = data.totalScore.ewm(span=5).mean()
data = data.dropna()
data["prevClose"] = data.Close.shift(1)
data['changeClose'] = data[['Close','prevClose']].pct_change(axis=1)['Close']
data["prevScore"] = data.totalScoreEma.shift(1)
data['changeEma'] = data[['totalScoreEma','prevScore']].pct_change(axis=1)['totalScoreEma']
features = ['changeEma']
y = data[['changeClose']]
X = data[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 7 * len(X) // 10,shuffle=False)

ValueError: train_size=0 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range

In [82]:
data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Volume,5EMA,26EMA,rsi,macd,...,rsi_sell,volume_buy,volume_sell,scores,totalScore,totalScoreEma,prevClose,changeClose,prevScore,changeEma
3779,2021-12-27,1724.5,1792.0,1703.650024,1785.0,4512000,1712.952697,1624.71203,72.784959,43.660604,...,0,1,0,13.325695,20,17.615216,1723.800049,,1723.800049,
3780,2021-12-28,1785.0,1823.0,1776.099976,1806.099976,3605483,1744.00179,1638.148174,74.302972,51.413282,...,0,1,0,13.883796,20,18.410144,1785.0,,1785.0,
3781,2021-12-29,1800.5,1821.0,1782.75,1786.849976,1902921,1758.284518,1649.163123,70.442549,55.365798,...,0,0,0,15.255864,20,18.940096,1806.099976,,1806.099976,
3782,2021-12-30,1787.0,1838.0,1786.0,1799.949951,4650594,1772.172996,1660.332517,71.526702,58.876562,...,0,1,0,14.837243,15,17.626731,1786.849976,,1786.849976,
3783,2021-12-31,1800.199951,1813.0,1783.449951,1790.550049,2378048,1778.29868,1669.97826,69.555228,60.206353,...,0,0,0,14.891495,15,16.751154,1799.949951,,1799.949951,


In [72]:
X = X.dropna()
y = y.dropna()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size= 7 * len(X) // 10,shuffle=False)

ValueError: train_size=0 should be either positive and smaller than the number of samples 0 or a float in the (0, 1) range

In [61]:
model.fit(X_train, y_train)
preds = model.predict(X_test)
X.head(), y.head()

(      changeEma
 2318   0.333371
 2319   0.166681
 2320   0.095245
 2321   0.057975
 2322  -0.148401,
       changeClose
 2318     0.000583
 2319    -0.022350
 2320    -0.005963
 2321     0.002700
 2322    -0.007579)