<a href="https://colab.research.google.com/github/BillWENZE/BillWENZE/blob/main/1_GenerativeModel_GridSearchCV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Outline:
- Load data from Yahoo Finance
- Get some technical indicators
- Implement Gaussian Naive Bayes
- Introduce GridSearchCV on Random Forest


In [None]:
! pip install yfinance
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.1/115.1 kB[0m [31m606.8 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandas_ta
  Building wheel for pandas_ta (setup.py) ... [?25l[?25hdone
  Created wheel for pandas_ta: filename=pandas_ta-0.3.14b0-py3-none-any.whl size=218907 sha256=a62fe57ea0c3769d119fffde3516e864d95a3f1c9b438df6ea5ba8957e90df5a
  Stored in directory: /root/.cache/pip/wheels/69/00/ac/f7fa862c34b0e2ef320175100c233377b4c558944f12474cf0
Successfully built pandas_ta
Installing collected packages: pandas_ta
Successfully installed pandas_ta-0.3.14b0


In [None]:
import yfinance as yf
import pandas_ta as ta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df= pd.DataFrame()
df=df.ta.ticker('pfe')

In [None]:
# Exponential Moving Averages
df['ema10']=ta.ema(df['Close'],length=10)
df['ema30']=ta.ema(df['Close'],length=30)

# Average True Range- Measures Volatility Caused by Price Gaps or Limit Moves
df['atr'] = ta.atr(df['High'],df['Low'],df['Close'])

# Average Directional Movement Index - to  quantify trend strength by measuring
# the amount of movement in a single direction

adx= ta.adx(df['High'],df['Low'],df['Close'])
df['adx'] = adx['ADX_14']

# Moving Average Convergence/ Divergence
#   Used to identify aspects of a security's overall trend
#   MACD Line: (12-day EMA - 26-day EMA)
#   Signal Line: 9-day EMA of MACD Line
#   MACD Histogram: MACD Line - Signal Line

macd = ta.macd(df['Close'], fast=12, slow=26, signal=9)
df['macd']=macd['MACD_12_26_9']
df['macds']=macd['MACDs_12_26_9']


# Relative Strength Index
#   momentum oscillator used to measure the
#   velocity as well as the magnitude of directional price movements

df['rsi'] =ta.rsi(df['Close'],length=14)

df['Cgtema10'] = np.where(df['Close'] > df['ema10'], 1, -1)
df['ema10gtema30'] = np.where(df['ema10'] > df['ema30'], 1, -1)
df['macdsgtmacd'] = np.where(df['macds'] > df['macd'], 1, -1)
df['Return_1'] = df['Close'].pct_change(1).shift(-1)


In [None]:
df['target'] = 0
df.loc[df['Return_1']<-0.0164,'target']=-1
df.loc[df['Return_1']>0.0177,'target']=1

df.dropna(inplace=True)

# Features
predictors_list = ['atr', 'adx','rsi', 'Cgtema10', 'ema10gtema30', 'macdsgtmacd']
X = df[predictors_list]

# Target Variable
y = df.target

In [None]:
X_train=X.loc['1981-01-01':'2020-12-31']
X_valid=X.loc['2021-01-01':'2022-12-31']
X_test=X.loc['2023-01-01':]
y_train=y.loc['1981-01-01':'2020-12-31']
y_valid=y.loc['2021-01-01':'2022-12-31']
y_test=y.loc['2023-01-01':]

In [None]:
print (X_train.shape)
X_train.head()

(10087, 6)


Unnamed: 0_level_0,atr,adx,rsi,Cgtema10,ema10gtema30,macdsgtmacd
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1981-01-02 00:00:00-05:00,0.006522,25.968685,72.84897,1,1,-1
1981-01-05 00:00:00-05:00,0.006428,27.632896,69.144385,1,1,-1
1981-01-06 00:00:00-05:00,0.007038,29.528662,72.190312,1,1,-1
1981-01-07 00:00:00-05:00,0.007047,31.289016,68.546804,1,1,-1
1981-01-08 00:00:00-05:00,0.007055,33.097445,62.593083,1,1,-1


NB doc:
https://scikit-learn.org/stable/modules/naive_bayes.html

GaussianNB doc:
https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB

**priors:** array-like of shape (n_classes,) \
Prior probabilities of the classes. If specified the priors are not adjusted according to the data.

**var_smoothing:** float, default=1e-9 \
Portion of the largest variance of all features that is added to variances for calculation stability.



In [None]:
# Return numbers spaced evenly on a log scale.
np.logspace(0, -12, num =13)

array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 1.e-05, 1.e-06, 1.e-07,
       1.e-08, 1.e-09, 1.e-10, 1.e-11, 1.e-12])

In [None]:
from sklearn.naive_bayes import GaussianNB

for vs in np.logspace(0, -12, num =13):
    # Initialize Gaussian Naive Bayes
    gnb = GaussianNB(var_smoothing = vs)
    # Train the classifier
    gnb.fit(X_train, y_train)
    # Make predictions on test data
    y_pred = gnb.predict(X_test)
    y_train_pred = gnb.predict(X_train)

    print ('vs = ' + str(vs))
    print ('Training accuracy = ' + str(np.sum(y_train_pred == y_train)/len(y_train)))
    print ('Test accuracy = ' + str(np.sum(y_pred == y_test)/len(y_test)))

vs = 1.0
Training accuracy = 0.7554277783285417
Test accuracy = 0.8157099697885196
vs = 0.1
Training accuracy = 0.7554277783285417
Test accuracy = 0.8157099697885196
vs = 0.01
Training accuracy = 0.7554277783285417
Test accuracy = 0.8157099697885196
vs = 0.001
Training accuracy = 0.7554277783285417
Test accuracy = 0.8157099697885196
vs = 0.0001
Training accuracy = 0.7528502032318827
Test accuracy = 0.7190332326283988
vs = 1e-05
Training accuracy = 0.7495786656092
Test accuracy = 0.6344410876132931
vs = 1e-06
Training accuracy = 0.7487855655794587
Test accuracy = 0.622356495468278
vs = 1e-07
Training accuracy = 0.7487855655794587
Test accuracy = 0.6102719033232629
vs = 1e-08
Training accuracy = 0.7487855655794587
Test accuracy = 0.6102719033232629
vs = 1e-09
Training accuracy = 0.7487855655794587
Test accuracy = 0.6102719033232629
vs = 1e-10
Training accuracy = 0.7487855655794587
Test accuracy = 0.6102719033232629
vs = 1e-11
Training accuracy = 0.7487855655794587
Test accuracy = 0.61027

In [None]:
print (gnb.class_prior_)

[0.1213443  0.75542778 0.12322792]


In [None]:
print (gnb.predict_proba(X_test))

[[0.41542424 0.18644381 0.39813195]
 [0.4348528  0.12080872 0.44433848]
 [0.43650802 0.10673671 0.45675527]
 [0.44847408 0.09266614 0.45885978]
 [0.46245475 0.02523276 0.51231249]
 [0.46455743 0.01907949 0.51636307]
 [0.4623349  0.02326503 0.51440007]
 [0.46358079 0.02520912 0.51121009]
 [0.46008541 0.03528074 0.50463385]
 [0.4614927  0.01643543 0.52207187]
 [0.46071802 0.01609387 0.5231881 ]
 [0.46068901 0.02288767 0.51642332]
 [0.46041541 0.03117775 0.50840684]
 [0.45664087 0.04385447 0.49950466]
 [0.45632611 0.04904172 0.49463217]
 [0.45133816 0.07079294 0.4778689 ]
 [0.45464737 0.06693082 0.47842181]
 [0.44987831 0.07991384 0.47020785]
 [0.44884226 0.08755782 0.46359992]
 [0.46345328 0.07681713 0.45972959]
 [0.4633989  0.08450299 0.45209811]
 [0.46474514 0.0937502  0.44150466]
 [0.4649352  0.1213251  0.41373971]
 [0.45751191 0.13803459 0.4044535 ]
 [0.45342221 0.15136918 0.39520861]
 [0.43905454 0.1887499  0.37219556]
 [0.45278382 0.16249925 0.38471693]
 [0.44256005 0.1949331  0.36

# GridSearchCV and accuracy_score using Random Forest

RandomForestClassifier doc:
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

para_grid = {'n_estimators': [20, 50], # Number of trees in random forest
               'max_features': ['sqrt'], # Number of features to consider at every split
               'max_depth': [10, 20, 30], # Maximum number of levels in tree
               'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
               'min_samples_leaf': [1, 2, 4]} # Minimum number of samples required at each leaf node

rf = RandomForestClassifier()
clf = GridSearchCV(rf, para_grid, cv = 3)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# optimal parameters
print (clf.best_params_)

accuracy = accuracy_score(y_pred, y_test)
train_acc = accuracy_score(clf.predict(X_train), y_train)
print ('Test accuracy = ' + str(accuracy))
print ('Train accuracy = ' + str(train_acc))

{'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Test accuracy = 0.7885196374622356
Train accuracy = 0.7604837910181421
