# Capstone Project - Data Collection Workbook
Cary Mosley, May 2020

## Import Libraries

In [1]:
import pandas as pd
import calendar, time
import datetime as dt
from datetime import date
import praw
import requests
from requests import get
import json
import quandl






## Collect Data

In [2]:
# Collection settings: date window, bar size and symbols.

# Date window, written as YYYY-MM-DD strings
start_date, end_date = '2004-12-31', '2020-04-30'

# Bar size requested for the stock data
interval = '1d'

# Symbols to download; '%5EVIX' is the percent-encoded form of '^VIX'
tickers = [
    'SPY',
    '%5EVIX',
    'VXX',
    'IWM',
    'QQQ',
]

### Collect historical stock data from Yahoo Finance

In [45]:
#Get Historical StockData From Yahoo Finance
def StockData(ticker, start_date, end_date=None, interval='1d'):
    """Download price history for one ticker from Yahoo Finance as a DataFrame.

    Parameters
    ----------
    ticker : str
        Symbol placed verbatim in the URL path, so any special character
        must already be percent-encoded (e.g. '%5EVIX').
    start_date : str or datetime.date
        Start of the window; ``str(start_date)`` must read YYYY-MM-DD.
    end_date : str or datetime.date, optional
        End of the window, same format. Defaults to today's date, resolved
        on every call (a ``date.today()`` default in the signature would be
        frozen at the moment the function was defined).
    interval : str
        Passed through as the ``interval`` query parameter.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date', newest row first, with the Open, High, Low,
        Close and Volume columns removed.
    """
    if end_date is None:
        end_date = date.today()

    # Both bounds are sent as epoch seconds, with the date read as UTC midnight.
    start_date_epoch = calendar.timegm(time.strptime(str(start_date), '%Y-%m-%d'))
    end_date_epoch = calendar.timegm(time.strptime(str(end_date), '%Y-%m-%d'))

    url = (
        'https://query1.finance.yahoo.com/v7/finance/download/' + ticker
        + '?period1=' + str(start_date_epoch) + '&period2=' + str(end_date_epoch)
        + '&interval=' + interval + '&events=history'
    )

    columns_to_drop = ['Open', 'High', 'Low', 'Close', 'Volume']
    # Chain instead of inplace=True so no intermediate frame is mutated.
    return (
        pd.read_csv(url)
        .sort_values(by=['Date'], ascending=False)
        .set_index('Date')
        .drop(columns=columns_to_drop)
    )

In [46]:
#Create and export SPY prices
ticker=tickers[0]

spy_df = StockData(ticker, start_date, end_date, interval)
spy_df.to_csv('spy.csv')
#Preview goes last: only a cell's final expression is rendered
spy_df.head()


In [47]:
#Create and export VIX prices
ticker=tickers[1]

vix_df = StockData(ticker, start_date, end_date, interval)
vix_df.to_csv('vix.csv')
#Preview goes last: only a cell's final expression is rendered
vix_df.head()

In [48]:
#Create and export VXX prices
ticker=tickers[2]

vxx_df = StockData(ticker, start_date, end_date, interval)
vxx_df.to_csv('vxx.csv')
#Preview goes last: only a cell's final expression is rendered
vxx_df.head()

In [49]:
#Create and export IWM prices
ticker=tickers[3]

iwm_df = StockData(ticker, start_date, end_date, interval)
iwm_df.to_csv('iwm.csv')
#Preview goes last: only a cell's final expression is rendered
iwm_df.head()

In [50]:
#Create and export QQQ prices
ticker=tickers[4]

qqq_df = StockData(ticker, start_date, end_date, interval)
qqq_df.to_csv('qqq.csv')
#Preview goes last: only a cell's final expression is rendered
qqq_df.head()

### Scrape reddit submissions
This section was not used for the final project

In [29]:
#Use PRAW to perform API pull
# Credentials are read from a local JSON file rather than written in the notebook.
with open('reddit.json') as creds_file:
    reddit_keys = json.load(creds_file)

# The five settings the client is constructed from, each taken from the file.
credential_fields = ('client_id', 'client_secret', 'password', 'user_agent', 'username')
reddit = praw.Reddit(**{field: reddit_keys[field] for field in credential_fields})


In [30]:
#Choose subreddit
subreddit = 'WallStreetBets'
#Use praw to collect data
top_posts = reddit.subreddit(subreddit).top(limit=500)

# Output column -> attribute read from each submission, in column order.
post_fields = {
    "title": "title",
    "score": "score",
    "id": "id",
    "url": "url",
    "comms_num": "num_comments",
    "created": "created",
    "body": "selftext",
}

#create dictionary of elements that are returned
topics_dict = {column: [] for column in post_fields}

#create dataframe from the praw data
for post in top_posts:
    for column, attribute in post_fields.items():
        topics_dict[column].append(getattr(post, attribute))
topics_data = pd.DataFrame(topics_dict)

In [31]:
topics_data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body
0,"Elon has transcended time, space, and county r...",69256,ghw12u,https://i.redd.it/frcjfc2967y41.jpg,4941,1589258000.0,
1,How to get oil back up,58582,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,548,1587461000.0,
2,All In The War Machine,55123,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,784,1578440000.0,
3,Type yy into google.,52812,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,17263,1561875000.0,That’s all I’m saying.
4,Oil is now expenzive,49777,d51f4o,https://i.redd.it/2j386s5iuym31.png,1018,1568673000.0,


In [32]:
#Convert created to date-time
# fromtimestamp interprets each epoch value in the local timezone of the machine running the cell.
created_as_datetime = topics_data["created"].apply(dt.datetime.fromtimestamp)
topics_data['created'] = created_as_datetime

In [33]:
topics_data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body
0,"Elon has transcended time, space, and county r...",69256,ghw12u,https://i.redd.it/frcjfc2967y41.jpg,4941,2020-05-12 00:41:17,
1,How to get oil back up,58582,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,548,2020-04-21 05:18:21,
2,All In The War Machine,55123,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,784,2020-01-07 18:40:30,
3,Type yy into google.,52812,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,17263,2019-06-30 02:14:57,That’s all I’m saying.
4,Oil is now expenzive,49777,d51f4o,https://i.redd.it/2j386s5iuym31.png,1018,2019-09-16 18:28:12,


### Use Quandl API to scrape data

In [34]:
#Import quandl API key
# The key is kept in a local JSON file and registered once for all later quandl.get calls.
with open('quandl.json') as key_file:
    quandl_keys = json.load(key_file)

quandl.ApiConfig.api_key = quandl_keys['api_key']

#### American Association of Individual Investor’s sentiment data

In [37]:
#AAII Data API Pull from Quandl
AAII_data = 'AAII/AAII_SENTIMENT'
# Same collection window as the rest of the notebook
aaii_window = {'start_date': start_date, 'end_date': end_date}
AAII_sentiment_df = quandl.get(AAII_data, **aaii_window)

In [39]:

AAII_sentiment_df.head()


Unnamed: 0_level_0,Bullish,Neutral,Bearish,Total,Bullish 8-Week Mov Avg,Bull-Bear Spread,Bullish Average,Bullish Average + St. Dev,Bullish Average - St. Dev,S&P 500 Weekly High,S&P 500 Weekly Low,S&P 500 Weekly Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2005-01-06,0.381,0.2738,0.3452,1.0,0.537763,0.0358,0.384907,0.488293,0.281522,1217.8,1183.72,1183.74
2005-01-13,0.3399,0.2614,0.3987,1.0,0.500125,-0.0588,0.384907,0.488293,0.281522,1194.78,1175.64,1184.52
2005-01-20,0.3366,0.3267,0.3366,0.9999,0.48035,0.0,0.384907,0.488293,0.281522,1195.98,1180.1,1184.52
2005-01-27,0.2643,0.3714,0.3643,1.0,0.442437,-0.1,0.384907,0.488293,0.281522,1175.96,1163.75,1174.07
2005-02-03,0.4167,0.3333,0.25,1.0,0.430337,0.1667,0.384907,0.488293,0.281522,1195.25,1171.36,1193.19


In [40]:
AAII_sentiment_df.columns

Index(['Bullish', 'Neutral', 'Bearish', 'Total', 'Bullish 8-Week Mov Avg',
       'Bull-Bear Spread', 'Bullish Average', 'Bullish Average + St. Dev',
       'Bullish Average - St. Dev', 'S&P 500 Weekly High',
       'S&P 500 Weekly Low', 'S&P 500 Weekly Close'],
      dtype='object')

In [41]:
#Drop Columns
# AAII columns that are not kept: totals, derived averages/bands and the S&P 500 weekly prices
drop_columns = [
    'Total',
    'Bullish Average',
    'Bullish 8-Week Mov Avg',
    'Bullish Average + St. Dev',
    'Bullish Average - St. Dev',
    'S&P 500 Weekly High',
    'S&P 500 Weekly Low',
    'S&P 500 Weekly Close',
]

In [42]:
# Removes the listed columns from the frame in place (re-running raises once they are gone)
AAII_sentiment_df.drop(drop_columns, axis='columns', inplace=True)

In [51]:
AAII_sentiment_df.head()

Unnamed: 0_level_0,Bullish,Neutral,Bearish,Bull-Bear Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-01-06,0.381,0.2738,0.3452,0.0358
2005-01-13,0.3399,0.2614,0.3987,-0.0588
2005-01-20,0.3366,0.3267,0.3366,0.0
2005-01-27,0.2643,0.3714,0.3643,-0.1
2005-02-03,0.4167,0.3333,0.25,0.1667


In [52]:
#Export to CSV
AAII_sentiment_df.to_csv('AAII.csv')

#### FINRA Short Interest By Security 
This was not used for the final project

In [71]:
#Get Finra Data 
def finra_data(ticker, start_date, end_date=None, exchange='NYSE'):
    """Fetch the FINRA short-volume dataset for one ticker through Quandl.

    The dataset code is 'FINRA/FN' + venue code + '_' + ticker; it is
    printed before the request is made.

    Parameters
    ----------
    ticker : str
        Symbol appended to the dataset code.
    start_date, end_date
        Window forwarded unchanged to ``quandl.get``. ``end_date`` defaults
        to today's date, resolved on every call (a ``date.today()`` default
        in the signature would be frozen at definition time).
    exchange : str
        'NYSE' or 'NASDAQ' (exact, case-sensitive match).

    Returns
    -------
    The result of ``quandl.get`` for the dataset code, or the string
    'error' (after printing 'error') when ``exchange`` is not recognised.
    """
    finra_short = 'FINRA/FN'
    # Venue code inserted between the 'FINRA/FN' prefix and the ticker.
    finra_dict = {'NYSE': 'YX', 'NASDAQ': 'SQ'}

    if exchange not in finra_dict:
        # Kept as a sentinel return (not an exception) for existing callers.
        print('error')
        return 'error'

    if end_date is None:
        end_date = date.today()

    finra_call = finra_short + finra_dict[exchange] + '_' + ticker
    print(finra_call)
    return quandl.get(finra_call, start_date=start_date, end_date=end_date)

In [72]:
exchange = 'NYSE'
#Pick the symbol explicitly (SPY) instead of reusing whatever `ticker` the last download cell left behind
ticker = tickers[0]
short_df = finra_data(ticker,start_date,end_date,exchange)

FINRA/FNYX_SPY


In [73]:
short_df

Unnamed: 0_level_0,ShortVolume,ShortExemptVolume,TotalVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-05-12,2111990.0,0.0,2681572.0
2015-05-13,1278698.0,0.0,1830248.0
2015-05-14,1235013.0,0.0,1652625.0
2015-05-15,1269385.0,0.0,1967962.0
2015-05-18,846827.0,0.0,1309836.0
...,...,...,...
2020-05-05,2980150.0,10.0,5148326.0
2020-05-06,3298070.0,255.0,4846833.0
2020-05-07,3652096.0,0.0,5451842.0
2020-05-08,2872024.0,0.0,4007267.0


#### NAAIM Exposure Index

In [60]:
#NAAIM Data API Pull from Quandl
exposure = 'NAAIM/NAAIM'
# Same collection window as the rest of the notebook
naaim_window = {'start_date': start_date, 'end_date': end_date}
NAAIM_exposure_df = quandl.get(exposure, **naaim_window)


In [61]:
NAAIM_exposure_df.head()

Unnamed: 0_level_0,Mean/Average,Most Bearish Response,Quart 1 (25% at/below),Quart 2 (median),Quart 3 (25% at/above),Most Bullish Response,Standard Deviation,NAAIM Number,S&P 500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2006-06-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2006-06-30,56.33,-125.0,40.0,65.0,100.0,125.0,53.51,56.33,1326.05
2006-07-05,19.44,-100.0,0.0,20.0,50.0,100.0,55.55,19.44,1265.48
2006-07-12,31.2,-50.0,0.0,25.0,50.0,150.0,47.84,31.2,1236.2
2006-07-19,18.76,-100.0,0.0,25.0,50.0,100.0,38.17,18.76,1240.29


In [62]:
NAAIM_exposure_df.columns

Index(['Mean/Average', 'Most Bearish Response', 'Quart 1 (25% at/below)',
       'Quart 2 (median)', 'Quart 3 (25% at/above)', 'Most Bullish Response',
       'Standard Deviation', 'NAAIM Number', 'S&P 500'],
      dtype='object')

In [63]:
#drop columns
# NAAIM columns that are not kept; note this rebinds the same `drop_columns` name used for the AAII frame
drop_columns = [
    'Quart 1 (25% at/below)',
    'Quart 3 (25% at/above)',
    'Standard Deviation',
    'NAAIM Number',
    'S&P 500',
]

In [64]:
# Removes the listed columns from the frame in place (re-running raises once they are gone)
NAAIM_exposure_df.drop(drop_columns, axis='columns', inplace=True)

In [65]:
NAAIM_exposure_df.head()

Unnamed: 0_level_0,Mean/Average,Most Bearish Response,Quart 2 (median),Most Bullish Response
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006-06-23,0.0,0.0,0.0,0.0
2006-06-30,56.33,-125.0,65.0,125.0
2006-07-05,19.44,-100.0,20.0,100.0
2006-07-12,31.2,-50.0,25.0,150.0
2006-07-19,18.76,-100.0,25.0,100.0


In [66]:
#Export to CSV
NAAIM_exposure_df.to_csv('NAAIM.csv')

#### University of Michigan Consumer Sentiment

In [67]:
# Quandl dataset code for the University of Michigan consumer sentiment series
mich_sent = 'UMICH/SOC1'
# Same collection window as the rest of the notebook
mich_window = {'start_date': start_date, 'end_date': end_date}
mich_sent_df = quandl.get(mich_sent, **mich_window)

In [68]:
mich_sent_df.head()

Unnamed: 0_level_0,Index
Date,Unnamed: 1_level_1
2004-12-31,97.1
2005-01-31,95.5
2005-02-28,94.1
2005-03-31,92.6
2005-04-30,87.7


In [69]:
#Export to CSV
mich_sent_df.to_csv('UMICH.csv')

#### NYTimes Articles

In [9]:
#Import NYTimes API key
# The key is kept in a local JSON file; the archive loop reads nyt_keys['api_key'].
with open('nytimes.json') as key_file:
    nyt_keys = json.load(key_file)


I had to break the API pulls into chunks because there were 1–2 million articles and appending them all to one giant dataframe took too long. I'll merge and filter the articles in the Data Clean and Process workbook and combine them into one dataframe there.

In [16]:
#Set year and month ranges
# range() stops before its second argument, so this is 2010 through 2013
years = range(2010, 2014)
# Each year is pulled in two halves: January-June, then July-December
months1, months2 = range(1, 7), range(7, 13)

The loop below pulls the publication date, headline and snippet of every NY Times article in the specified time frame and exports them to CSV files, two per year (one for each half of the year).

In [None]:
def _fetch_nyt_records(year, months):
    """Return one {'Date', 'Headline', 'Snippet'} dict per usable article.

    Requests the NY Times archive endpoint once per month in `months` for
    the given `year`, printing "year month" after each response. Articles
    that lack one of the three fields are reported as 'bad entry' and
    skipped.
    """
    records = []
    for month in months:
        #set the url for the api pull
        url = 'https://api.nytimes.com/svc/archive/v1/'
        url = url + str(year) + '/' + str(month) + '.json?api-key=' + nyt_keys['api_key']

        #pull data from NY Times
        responses = requests.get(url)
        # Surface a failed request as an HTTP error instead of a KeyError on the payload
        responses.raise_for_status()
        articles = responses.json()
        print(year, month)

        #loop through each article and grab the relevant fields
        for article in articles['response']['docs']:
            try:
                records.append({'Date': article['pub_date'],
                                'Headline': article['headline']['main'],
                                'Snippet': article['snippet']})
            except (KeyError, TypeError):
                # A field is missing, or 'headline' is not a mapping
                print('bad entry')
                continue
    return records


#loop through each year
for year in years:
    # One export per half-year; the file suffix is 1 for months1 and 2 for months2
    for half, months in enumerate((months1, months2), start=1):
        # Build the frame once from the collected rows; appending row by row
        # copies the whole frame each time and DataFrame.append no longer exists in pandas 2.x
        nyt_df = pd.DataFrame(_fetch_nyt_records(year, months),
                              columns=['Date', 'Headline', 'Snippet'])
        #set path to export
        path = 'nyt' + str(year) + str(half) + '.csv'
        nyt_df.to_csv(path)

2010 1
2010 2
2010 3
bad entry
2010 4
bad entry
2010 5
2010 6
