<a href="https://colab.research.google.com/github/DesiPilla/MachineLearning/blob/master/Project/CISC_684_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import requests
import time as t
from pandas_datareader import data as wb

In [0]:
def getTickers(step):
    """Scrape ticker symbols from the stockpup data listing page.

    The page's anchors are taken from position 22 up to (but excluding) the
    last one, and only every ``step``-th anchor of that window is kept.
    Anchors whose ``href`` does not end in ``csv`` are discarded after the
    sampling, so the result can hold fewer than ``1/step`` of the CSV links.

    Parameters
    ----------
    step : int
        Stride used when sampling the anchors (1 keeps all of them).

    Returns
    -------
    list of str
        For each kept link, the part of its ``href`` after the first six
        characters and before the first underscore.
    """
    listingUrl = 'http://www.stockpup.com/data/'
    pageHtml = requests.get(listingUrl).text
    soup = BeautifulSoup(pageHtml, "html.parser")

    # Offsets 22 and -1 are fixed by the original author; presumably they skip
    # navigation links around the file list -- TODO confirm against the page.
    sampledLinks = soup.findAll('a')[22:-1][::step]

    # href[6:] presumably drops a leading '/data/' prefix (six characters).
    return [
        link['href'][6:].split('_')[0]
        for link in sampledLinks
        if link['href'].endswith('csv')
    ]

In [0]:
def getPriceData(ticker, startDate='1980-01-01', dataSource='yahoo', daysIntoFuture=2):
    """Download daily prices for one ticker and derive features and the label.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``wb.DataReader``.
    startDate : str, default '1980-01-01'
        First date requested from the data source.
    dataSource : str, default 'yahoo'
        ``data_source`` argument forwarded to ``wb.DataReader``.
    daysIntoFuture : int, default 2
        Label horizon in rows: ``Class`` is True when the close
        ``daysIntoFuture`` rows later is strictly higher than the current one.

    Returns
    -------
    pandas.DataFrame
        The reader's index moved into a column by ``reset_index`` plus
        'Close', 'Volume', 'Ticker', '50 Day Moving Average' and 'Class'.
        'Class' is missing (NaN) for rows whose future close is not known
        yet (the last ``daysIntoFuture`` rows), so it is an object column
        of True/False/NaN. If anything fails, a warning is issued and an
        empty frame is returned (best effort, never raises for ordinary
        errors).
    """
    import warnings

    pData = pd.DataFrame()

    try:
        ticker_data = wb.DataReader(ticker, data_source=dataSource, start=startDate)
        # Copy so the column assignments below do not write into a slice.
        prices = pd.DataFrame(ticker_data)[['Close', 'Volume']].copy()
        prices['Ticker'] = ticker
        prices['50 Day Moving Average'] = prices['Close'].rolling(window=50).mean()

        # Compare against the close `daysIntoFuture` rows ahead directly; the
        # previous difference-of-rolling-sums form reduced to the same value
        # but accumulated floating point error.
        close = prices['Close']
        futureClose = close.shift(-daysIntoFuture)
        # A comparison with NaN is False, which used to label the trailing rows
        # as "did not rise". Leave the label missing where it is unknown.
        known = futureClose.notna() & close.notna()
        prices['Class'] = (futureClose > close).where(known)

        pData = prices
    except Exception as err:
        # Best effort: report and fall through to the empty frame. Unlike a
        # bare `except`, this lets KeyboardInterrupt/SystemExit propagate.
        warnings.warn('Could not build price data for %s: %r' % (ticker, err))

    return pData.reset_index()

In [0]:
def getAllData(tickers):
    """Build one feature table from fundamentals and prices for many tickers.

    For every ticker, the quarterly fundamentals CSV is read from stockpup,
    daily prices come from ``getPriceData``, and each price row is joined to
    the most recent strictly earlier quarter (``merge_asof`` with
    ``allow_exact_matches=False``). Valuation ratios are then derived and a
    sector is looked up in the S&P 500 constituents table.

    Tickers that fail at any step are reported and skipped. Rows with any
    missing value are dropped from the final result.

    Parameters
    ----------
    tickers : iterable of str
        Symbols to fetch.

    Returns
    -------
    pandas.DataFrame
        Columns 'Ticker', 'Sector', 'Date', 'Close', '50 Day Moving Average',
        'Volume', 'Market / Book Ratio', 'P/E', 'Debt / Equity Ratio',
        'Free Cash Flow Yield' and 'Class'. Empty (but with these columns)
        when no ticker could be processed.
    """
    url = 'http://www.stockpup.com/data/'
    filePath = '_quarterly_financial_data.csv'
    fundamentalCols = ['Quarter end',
                       'Cash at end of period',
                       'Shares split adjusted',
                       'Cash from operating activities',
                       'Capital expenditures',
                       'Assets',
                       'Liabilities',
                       'EPS basic']
    outputCols = ['Ticker',
                  'Sector',
                  'Date',
                  'Close',
                  '50 Day Moving Average',
                  'Volume',
                  'Market / Book Ratio',
                  'P/E',
                  'Debt / Equity Ratio',
                  'Free Cash Flow Yield',
                  'Class']

    print('Fetching data... (this may take up to 60 seconds or more)')
    t1 = t.time()

    # Get GICS Sector info
    sectorInfo = pd.read_csv('https://datahub.io/core/s-and-p-500-companies/r/constituents.csv').set_index('Symbol')

    # Collect per-ticker frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all prior rows every time.
    frames = []
    for tick in tickers:
        try:
            # Get fundamentals data for stock (copy: columns are assigned below)
            fData = pd.read_csv(url + tick + filePath)[fundamentalCols].copy()
            fData['Ticker'] = tick
            # 'coerce' turns non-numeric entries into NaN. The previous
            # misspelling ('corece') is rejected by pandas with a ValueError,
            # which silently skipped every ticker.
            for col in ('Cash from operating activities', 'EPS basic'):
                fData[col] = pd.to_numeric(fData[col], errors='coerce')
            fData['Quarter end'] = pd.to_datetime(fData['Quarter end'])
            # merge_asof requires the right frame to be sorted by its key.
            fData = fData.reset_index(drop=True).sort_values(by='Quarter end', ascending=True)

            # Get price data for stock
            pData = getPriceData(tick)

            # Merge price and fundamentals data and build attributes
            stockData = pd.merge_asof(pData, fData,
                                      left_on='Date', right_on='Quarter end',
                                      by='Ticker', direction='backward',
                                      allow_exact_matches=False)
            equity = stockData['Assets'] - stockData['Liabilities']
            stockData['Market / Book Ratio'] = stockData['Close'] / equity * stockData['Shares split adjusted']
            stockData['P/E'] = stockData['Close'] / stockData['EPS basic']
            stockData['Debt / Equity Ratio'] = stockData['Liabilities'] / equity
            freeCashFlow = stockData['Cash from operating activities'] - stockData['Capital expenditures']
            enterpriseValue = (stockData['Shares split adjusted'] * stockData['Close']
                               + stockData['Liabilities']
                               - stockData['Cash at end of period'])
            stockData['Free Cash Flow Yield'] = freeCashFlow / enterpriseValue

            try:
                stockData['Sector'] = sectorInfo['Sector'][tick]
            except Exception:
                # Spelling kept as-is: other cells may filter on this label.
                stockData['Sector'] = 'Unkown'

            frames.append(stockData[outputCols])
            print(tick)
        except Exception as err:
            # Best effort per ticker, but say what was dropped and why.
            print('Skipping %s (%s: %s)' % (tick, type(err).__name__, err))
            continue

    t2 = t.time()
    print('Done fetching! (%d seconds to complete)\n' % (t2 - t1))

    if not frames:
        # pd.concat([]) raises; return an empty table with the usual columns.
        return pd.DataFrame(columns=outputCols)
    return pd.concat(frames).dropna()

In [13]:
# Build the combined dataset from a sample of tickers. getTickers(n) keeps
# every n-th link of the listing page, so a larger n means fewer stocks and a
# faster fetch.
n = 250     # Sampling step: roughly 1/n of the listed tickers are fetched
allData = getAllData(getTickers(n))
allData.head()

Fetching data... (this may take up to 60 seconds or more)
A
EMC
IR
SLG
X
Done fetching! (6 seconds to complete)



Unnamed: 0,Ticker,Sector,Date,Close,50 Day Moving Average,Volume,Market / Book Ratio,P/E,Debt / Equity Ratio,Free Cash Flow Yield,Class
50,A,Health Care,2000-02-01,50.786839,40.462267,1404200.0,5.117176,169.289462,0.584262,0.014623,True
51,A,Health Care,2000-02-02,54.721031,40.979077,1945100.0,5.513577,182.403437,0.584262,0.013622,False
52,A,Health Care,2000-02-03,55.615166,41.46191,1779500.0,5.603668,185.383886,0.584262,0.013414,True
53,A,Health Care,2000-02-04,54.542202,41.980508,1145600.0,5.495558,181.80734,0.584262,0.013665,True
54,A,Health Care,2000-02-07,56.866951,42.5304,1274200.0,5.729795,189.556503,0.584262,0.013132,False


In [25]:
# Example of a subset for a single stock
# Picks the fourth distinct ticker in allData (index 3); this raises
# IndexError when fewer than four tickers were fetched successfully.
ticker = allData['Ticker'].unique()[3]
print(ticker)
# NOTE(review): despite its name, `sectorData` holds the rows of ONE ticker
# here; the same name is reassigned by the single-sector example cell.
sectorData = allData[allData['Ticker'] == ticker]
sectorData.head()

SLG


Unnamed: 0,Ticker,Sector,Date,Close,50 Day Moving Average,Volume,Market / Book Ratio,P/E,Debt / Equity Ratio,Free Cash Flow Yield,Class
157,SLG,Real Estate,1998-04-01,25.6875,25.9275,2700.0,1.509399,77.840909,1.6595,-0.243788,True
158,SLG,Real Estate,1998-04-02,26.25,25.93,17800.0,1.542452,79.545455,1.6595,-0.241231,True
159,SLG,Real Estate,1998-04-03,26.4375,25.92875,5800.0,1.553469,80.113636,1.6595,-0.24039,True
160,SLG,Real Estate,1998-04-06,26.5,25.92875,16500.0,1.557142,80.30303,1.6595,-0.240111,False
161,SLG,Real Estate,1998-04-07,26.5,25.92625,37600.0,1.557142,80.30303,1.6595,-0.240111,False


In [26]:
# Example of a subset for a single sector
# Picks the third distinct sector in allData (index 2); this raises
# IndexError when fewer than three sectors are present.
# NOTE(review): `ticker` holds a sector name in this cell, not a ticker
# symbol, and overwrites the value set by the single-stock example cell.
ticker = allData['Sector'].unique()[2]
print(ticker)
sectorData = allData[allData['Sector'] == ticker]
sectorData.head()

Industrials


Unnamed: 0,Ticker,Sector,Date,Close,50 Day Moving Average,Volume,Market / Book Ratio,P/E,Debt / Equity Ratio,Free Cash Flow Yield,Class
5564,IR,Industrials,2002-04-01,19.544729,18.848323,3436200.0,1.662277,40.718186,1.806317,-0.013295,False
5565,IR,Industrials,2002-04-02,19.548721,18.91901,3192800.0,1.662617,40.726503,1.806317,-0.013293,False
5566,IR,Industrials,2002-04-03,18.726038,18.973802,4383200.0,1.592648,39.012579,1.806317,-0.013569,True
5567,IR,Industrials,2002-04-04,19.269169,19.039057,5630900.0,1.638841,40.144102,1.806317,-0.013386,True
5568,IR,Industrials,2002-04-05,19.740416,19.110383,4146600.0,1.67892,41.125866,1.806317,-0.013231,True
