In [4]:
from yahoofinancials import YahooFinancials as yf
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
from splinter import Browser

# models 
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier

import pandas_datareader.data as reader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# %matplotlib inline
# dependencies 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from mlens.ensemble import SuperLearner
import datetime as dt


def get_stock_data(tickers, start_date=None, end_date=None, freq='daily'):
    """Download historical prices for ``tickers`` through YahooFinancials.

    Each ticker is requested in turn. The ``'prices'`` records of the response
    are loaded into a DataFrame and the raw ``'date'`` column is dropped
    (``'formatted_date'`` is kept instead).

    Parameters
    ----------
    tickers : iterable of str
        Symbols to request.
    start_date, end_date : str or None
        Passed straight through to ``get_historical_price_data``.
    freq : str
        Sampling interval passed to ``get_historical_price_data``.

    Returns
    -------
    ticker_data : pandas.DataFrame
        Price frame of the LAST ticker that was parsed successfully. When no
        ticker yields usable data an empty DataFrame is returned, so callers
        can test ``ticker_data.empty``.
    ticker_not_found : list of str
        Tickers whose response had no usable ``'prices'`` payload.

    Raises
    ------
    Whatever the YahooFinancials request itself raises; only failures while
    parsing the response are treated as "ticker not found".
    """
    # Defined up front so the return statement is always valid, even when
    # every ticker fails.
    ticker_data = pd.DataFrame()
    ticker_not_found = []

    for ticker in tickers:
        yf_engine = yf(ticker)
        price = yf_engine.get_historical_price_data(start_date, end_date, freq)
        try:
            # Parse into a scratch variable so a half-built frame from a failed
            # ticker can never leak into the return value.
            frame = pd.DataFrame(price[ticker]['prices'])
            frame = frame.drop('date', axis=1)  # formatted_date is used instead
        except Exception:
            # Missing key, None payload, empty price list, ...: all mean "no
            # usable data". `Exception` (not a bare except) keeps
            # KeyboardInterrupt / SystemExit propagating.
            ticker_not_found.append(ticker)
            continue
        ticker_data = frame

    return ticker_data, ticker_not_found

# Run configuration: the ticker list and the date window handed to get_stock_data below.
ticker = ['A']
start_date = '2010-09-01'
# Today's date formatted as YYYY-MM-DD.
end_date = dt.datetime.today().strftime("%Y-%m-%d")


# NOTE(review): pandas_datareader.data is already imported as `reader` at the top
# of this cell; this second import is redundant.
import pandas_datareader.data as reader
# Called without start/end arguments, so the date range is whatever
# pandas-datareader defaults to - TODO confirm it matches start_date/end_date.
data = reader.get_data_yahoo('A')
df = data.reset_index()
# df1 keeps a copy of the pandas-datareader frame, because `df` is reassigned below.
df1=df.copy()

# Rebinds `df` to the YahooFinancials frame for `ticker`; from here on the
# pandas-datareader download is only reachable through `data` and `df1`.
df, ticker_not_found = get_stock_data(ticker, start_date, end_date)


# Scrape the list of stock symbols from stockanalysis.com with headless Chrome.
executable_path = {'executable_path': ChromeDriverManager().install()}
browser = Browser('chrome', **executable_path, headless=True)

url = 'https://stockanalysis.com/stocks/'
try:
    browser.visit(url)
    # Ask the page to show (up to) 10000 rows so one page holds every symbol.
    element = browser.find_by_name('perpage').first
    element.select('10000')
    html = browser.html
finally:
    # Always shut the browser down, even when navigation or the element lookup
    # fails; otherwise a headless Chrome process is left running.
    browser.quit()

soup = BeautifulSoup(html, 'html.parser')

table = soup.find('table', {'class': 'symbol-table index'})
if table is None:
    # Fail with an explicit message instead of an AttributeError on None below.
    raise RuntimeError(f"Symbol table not found on {url}; the page layout may have changed")

symbol = []
for row in table.find_all('tr')[1:]:  # first <tr> is skipped, as it is the header row
    cells = row.find_all('td')
    if cells:  # ignore rows without data cells rather than raising IndexError
        symbol.append(cells[0].text.strip())

# Column layout of the results table. The leading "" column is kept so a CSV
# written from accuracy_df keeps the header it has always had.
ACCURACY_COLUMNS = ["", "KNN", "Logistic Regression", "Random Forest",
                    "SGD Classifier", "SVM", "Average Accuracy", "Symbol"]
# Yahoo Finance symbol of the CBOE Volatility Index, used as a market-wide feature.
VIX_TICKER = '^VIX'


def get_clean_data(df, start_date, end_date, vix_prices=None):
    """Turn one symbol's price history into a model-ready feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history with the columns ``formatted_date``, ``open``, ``high``,
        ``low``, ``close``, ``adjclose`` and ``volume``.
    start_date, end_date : str
        Date window used to download the VIX when ``vix_prices`` is omitted.
    vix_prices : pandas.DataFrame, optional
        Pre-fetched VIX history (``formatted_date`` and ``adjclose`` columns).
        Pass it when processing many symbols so it is downloaded only once.

    Returns
    -------
    pandas.DataFrame
        One row per trading day with lagged features, one-hot encoded
        ``ON_returns_signal`` columns and the target ``stock_move``
        ("Buy" when the next adjusted close is higher, otherwise "Sell").

    Raises
    ------
    ValueError
        If the VIX history has to be downloaded here and no data comes back.
    """
    if vix_prices is None:
        vix_prices, vix_missing = get_stock_data([VIX_TICKER], start_date, end_date)
        if vix_missing or vix_prices.empty:
            raise ValueError(f"no price history returned for {VIX_TICKER}")

    features = df.copy()
    # Every predictor below is lagged by one session so a row only holds
    # values that were known before that day's close.
    features['volume'] = features['volume'].shift(1)
    features['SMA'] = features['adjclose'].rolling(window=20).mean().shift(1)
    features['Std_20'] = features['adjclose'].rolling(window=20).std().shift(1)
    features['Band_1'] = features['SMA'] - features['Std_20']
    features['Band_2'] = features['SMA'] + features['Std_20']
    # Previous close minus today's open: negative means the stock opened
    # above the prior close, which is labelled 'up'.
    features['ON_returns'] = features['close'].shift(1) - features['open']
    features['ON_returns_signal'] = np.where(features['ON_returns'] < 0, 'up', 'down')
    features['dist_from_mean'] = features['adjclose'].shift(1) - features['SMA']

    # Previous session's VIX close, joined on the calendar date. A positional
    # concat would pair unrelated days whenever the two histories differ in
    # length (e.g. a recently listed stock).
    vix_lagged = vix_prices[['formatted_date', 'adjclose']].copy()
    vix_lagged['vix_data'] = vix_lagged['adjclose'].shift(1)
    comb_features = features.merge(vix_lagged[['formatted_date', 'vix_data']],
                                   on='formatted_date', how='left')
    comb_features = comb_features.drop('formatted_date', axis=1)
    comb_features = comb_features.dropna()  # warm-up rows and days with no VIX quote
    comb_features = pd.get_dummies(comb_features, columns=['ON_returns_signal'])
    # ON_returns is already encoded by the dummies; close is superseded by adjclose.
    comb_features = comb_features.drop(['ON_returns', 'close'], axis=1)

    # Target: does the adjusted close rise on the following row?
    comb_features['stock_move'] = np.where(
        comb_features['adjclose'] - comb_features['adjclose'].shift(-1) < 0, "Buy", "Sell")
    features_clean = comb_features.dropna()
    # The last row has no following day, so its label is meaningless. The copy
    # lets callers add or overwrite columns without SettingWithCopyWarning.
    return features_clean[:-1].copy()


def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fits and evaluates given machine learning models.
    models : a dict of different Scikit-Learn machine learning models
    X_train : training data
    X_test : testing data
    y_train : labels associated with training data
    y_test : labels associated with test data

    Returns a dict mapping each model name to its accuracy on the test data.
    """
    # Random seed for reproducible results
    np.random.seed(42)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        model_scores[name] = model.score(X_test, y_test)
    return model_scores


accuracy_df = pd.DataFrame(columns=ACCURACY_COLUMNS)
accuracy_rows = []  # one single-row frame per symbol, combined once at the end

# The VIX series is identical for every symbol, so download it once up front.
vix_prices, vix_not_found = get_stock_data([VIX_TICKER], start_date, end_date)
if vix_not_found or vix_prices.empty:
    raise RuntimeError(f"Could not download {VIX_TICKER} history; cannot build features")

try:
    for i in symbol:
        try:
            # Train on this symbol's own history, so the row reported under
            # "Symbol" really describes that symbol.
            symbol_prices, missing = get_stock_data([i], start_date, end_date)
            if missing or symbol_prices.empty:
                raise ValueError("no price history returned")

            features = get_clean_data(symbol_prices, start_date, end_date,
                                      vix_prices=vix_prices)
            # convert stock_move to binary (Sell -> 0, Buy -> 1)
            features['stock_move'] = np.where(features['stock_move'] == 'Sell', 0, 1)

            X = features.drop(['high', 'low', 'stock_move'], axis=1)
            y = features['stock_move']

            # NOTE(review): train_test_split shuffles by default, so test days
            # are interleaved with training days; consider shuffle=False for a
            # chronological hold-out.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=42)

            # Fit the scaler on the training split only, then apply it to both.
            scaler = MinMaxScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Fresh estimators for every symbol.
            models = {"KNN": KNeighborsClassifier(),
                      "Logistic Regression": LogisticRegression(),
                      "Random Forest": RandomForestClassifier(),
                      "SGD Classifier": SGDClassifier(),
                      "SVM": svm.SVC(kernel='rbf')}

            model_scores = fit_and_score(models=models,
                                         X_train=X_train,
                                         X_test=X_test,
                                         y_train=y_train,
                                         y_test=y_test)

            model_compare = pd.DataFrame(model_scores, index=[""])
            model_compare['Average Accuracy'] = np.array(list(model_scores.values())).mean()
            model_compare["Symbol"] = i
            accuracy_rows.append(model_compare)

            print(f"{i} added")
        except Exception as exc:
            # Best effort: one bad symbol must not abort the sweep. Catching
            # Exception rather than everything keeps Ctrl-C working, and the
            # reason is printed so real bugs are not mistaken for missing data.
            print(f"No Data for {i} ({type(exc).__name__}: {exc})")
finally:
    # Combine once (DataFrame.append no longer exists in pandas 2.x). Doing it
    # in `finally` keeps partial results if the sweep is interrupted.
    if accuracy_rows:
        accuracy_df = pd.concat(accuracy_rows).reindex(columns=ACCURACY_COLUMNS)


[WDM] - 

[WDM] - Current google-chrome version is 96.0.4664
[WDM] - Get LATEST driver version for 96.0.4664
[WDM] - Driver [C:\Users\bmontague\.wdm\drivers\chromedriver\win32\96.0.4664.45\chromedriver.exe] found in cache


A added
AA added
AAC added
AACG added
No Data for AACI
AADI added
AAIC added
AAL added
AAMC added
AAME added
AAN added
AAOI added
AAON added
AAP added
AAPL added
AAQC added
AAT added
AATC added
AAU added
AAWW added
AB added
ABB added
ABBV added
ABC added
ABCB added
ABCL added
ABCM added
ABEO added
ABEV added
ABG added
ABGI added
ABIO added
ABM added
ABMD added
ABNB added
ABOS added
ABR added
ABSI added
ABST added
ABT added
ABTX added
ABUS added
ABVC added
AC added
ACA added
ACAD added
ACAH added
ACB added
ACBA added
ACBI added
ACC added
ACCD added
ACCO added
ACEL added
ACER added
ACET added
ACEV added
ACGL added
ACH added
ACHC added
ACHL added
ACHR added
ACHV added
ACI added
ACII added
ACIU added
ACIW added
ACKIT added
ACLS added
ACM added
ACMR added
ACN added
ACNB added
ACOR added
ACQR added
ACR added
ACRE added
ACRO added
ACRS added
ACRX added
ACST added
ACT added
ACTD added
ACTG added
ACU added
ACVA added
ACXP added
ACY added
ADAG added
ADAP added
ADBE added
ADC added
ADCT added
ADE

In [3]:
# Persist the per-symbol accuracy table. The target directory is created first
# because to_csv does not create missing parent directories.
from pathlib import Path

output_path = Path('Output') / 'accuracy.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
accuracy_df.to_csv(output_path)