# Capstone Project- Data Collection Workbook
Cary Mosley, May 2020

## Import Libraries

In [136]:
import pandas as pd
import calendar, time
import datetime as dt
from datetime import date
from dateutil.relativedelta import relativedelta
import praw
import requests
from requests import get
import json
import quandl
from nytimesarticle import articleAPI
import pyjq





## Collect Data

In [66]:
#Set date window to collect data
#length in years
length=5

#Stock data interval
interval='1d'

#Set dates in YYYY-MM-DD format
end_date = date.today()
start_date = (end_date - relativedelta(years=length))
start_date_epoch = calendar.timegm(time.strptime(str(start_date), '%Y-%m-%d'))

#Stock Ticker
ticker = 'SPY'

### Collect historical stock data from Yahoo Finance

In [40]:
#Get Historical StockData From Yahoo Finance
def StockData(ticker, start_date, end_date = date.today(), interval = '1d'):
    end_date_epoch = calendar.timegm(time.strptime(str(end_date), '%Y-%m-%d'))
    url = 'https://query1.finance.yahoo.com/v7/finance/download/' + ticker
    url = url + '?period1=' + str(start_date_epoch) + '&period2=' + str(end_date_epoch)
    url = url + '&interval=' + interval + '&events=history'
    df = pd.read_csv(url)
    df.sort_values(by=['Date'],ascending=False,inplace=True)
    df.set_index('Date', inplace=True)
    columns_to_drop = ['Open','High','Low','Close']
    df.drop(columns=columns_to_drop,inplace=True)
    return df

In [41]:


df = StockData(ticker, start_date, end_date, interval)
df.head()

Unnamed: 0_level_0,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-05-11,19.4,13069600
2020-05-08,20.18,23698500
2020-05-07,19.84,24927600
2020-05-06,18.5,23781300
2020-05-05,16.620001,7678400


### Scrape reddit submissions

In [8]:
with open('reddit.json') as f:
    reddit_keys= json.load(f)

reddit = praw.Reddit(client_id = reddit_keys['client_id'], client_secret = reddit_keys['client_secret'], 
                     password = reddit_keys['password'], user_agent = reddit_keys['user_agent'],
                     username = reddit_keys['username'])


In [27]:
#Choose subreddit
subreddit = 'WallStreetBets'
#Use praw to collect data
top_posts = reddit.subreddit(subreddit).top(limit=500)
#create dictionary of elements that are returned
topics_dict = { "title":[], 
                "score":[], 
                "id":[], "url":[],  
                "comms_num": [], 
                "created": [], 
                "body":[]}

#create dataframe from the praw data
for post in top_posts:
    topics_dict["title"].append(post.title)
    topics_dict["score"].append(post.score)
    topics_dict["id"].append(post.id)
    topics_dict["url"].append(post.url)
    topics_dict["comms_num"].append(post.num_comments)
    topics_dict["created"].append(post.created)
    topics_dict["body"].append(post.selftext)
topics_data = pd.DataFrame(topics_dict)

In [28]:
topics_data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body
0,"Elon has transcended time, space, and county r...",67401,ghw12u,https://i.redd.it/frcjfc2967y41.jpg,4766,1589258000.0,
1,How to get oil back up,58383,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,549,1587461000.0,
2,All In The War Machine,55047,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,784,1578440000.0,
3,Type yy into google.,52806,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,17263,1561875000.0,That’s all I’m saying.
4,Oil is now expenzive,49769,d51f4o,https://i.redd.it/2j386s5iuym31.png,1019,1568673000.0,


In [29]:
#Convert created to date-time
topics_data['created'] = topics_data["created"].apply(lambda x: dt.datetime.fromtimestamp(x))

In [30]:
topics_data.head()

Unnamed: 0,title,score,id,url,comms_num,created,body
0,"Elon has transcended time, space, and county r...",67401,ghw12u,https://i.redd.it/frcjfc2967y41.jpg,4766,2020-05-12 00:41:17,
1,How to get oil back up,58383,g55or2,https://i.redd.it/w5iqqihjo2u41.jpg,549,2020-04-21 05:18:21,
2,All In The War Machine,55047,eld4i6,https://i.redd.it/cj2vw26nmd941.jpg,784,2020-01-07 18:40:30,
3,Type yy into google.,52806,c75d5x,https://www.reddit.com/r/wallstreetbets/commen...,17263,2019-06-30 02:14:57,That’s all I’m saying.
4,Oil is now expenzive,49769,d51f4o,https://i.redd.it/2j386s5iuym31.png,1019,2019-09-16 18:28:12,


### Use Quandl API to scrape data

In [11]:
#Import quandl API key
with open('quandl.json') as f:
    quandl_keys= json.load(f)
quandl.ApiConfig.api_key = quandl_keys['api_key']

{'api_key': 'rs4CXL9RLiwyCbU7Lx-6'}

#### American Association of Individual Investor’s sentiment data

In [42]:
#AAII Table Name
AAII_data = 'AAII/AAII_SENTIMENT'
sentiment_df = quandl.get(AAII_data, start_date=start_date, end_date=end_date)

In [43]:

sentiment_df.head()


Unnamed: 0_level_0,Bullish,Neutral,Bearish,Total,Bullish 8-Week Mov Avg,Bull-Bear Spread,Bullish Average,Bullish Average + St. Dev,Bullish Average - St. Dev,S&P 500 Weekly High,S&P 500 Weekly Low,S&P 500 Weekly Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-05-14,0.267442,0.468992,0.263566,1.0,0.313376,0.003876,0.384734,0.488168,0.281301,2117.69,2067.93,2098.48
2015-05-21,0.252119,0.497881,0.25,1.0,0.296854,0.002119,0.384685,0.488102,0.281268,2134.72,2085.57,2125.85
2015-05-28,0.270019,0.478585,0.251397,1.000001,0.286362,0.018622,0.384675,0.488058,0.281291,2134.72,2099.18,2123.48
2015-06-04,0.273399,0.480296,0.246305,1.0,0.28466,0.027094,0.384655,0.488007,0.281302,2126.22,2099.14,2114.07
2015-06-11,0.200375,0.473783,0.325843,1.000001,0.269622,-0.125468,0.384607,0.487942,0.281272,2121.92,2072.14,2105.2


In [44]:
sentiment_df.columns

Index(['Bullish', 'Neutral', 'Bearish', 'Total', 'Bullish 8-Week Mov Avg',
       'Bull-Bear Spread', 'Bullish Average', 'Bullish Average + St. Dev',
       'Bullish Average - St. Dev', 'S&P 500 Weekly High',
       'S&P 500 Weekly Low', 'S&P 500 Weekly Close'],
      dtype='object')

In [45]:
drop_columns = ['Total', 'Bullish Average','Bullish 8-Week Mov Avg', 'Bullish Average + St. Dev',
       'Bullish Average - St. Dev','S&P 500 Weekly High','S&P 500 Weekly Low','S&P 500 Weekly Close']

In [46]:
sentiment_df.drop(columns=drop_columns,inplace=True)

In [47]:
sentiment_df.head()

Unnamed: 0_level_0,Bullish,Neutral,Bearish,Bull-Bear Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-14,0.267442,0.468992,0.263566,0.003876
2015-05-21,0.252119,0.497881,0.25,0.002119
2015-05-28,0.270019,0.478585,0.251397,0.018622
2015-06-04,0.273399,0.480296,0.246305,0.027094
2015-06-11,0.200375,0.473783,0.325843,-0.125468


#### FINRA Short Interest By Security 

In [71]:
#Get Finra Data 
def finra_data(ticker,start_date, end_date = date.today(), exchange='NYSE'):
    finra_short = 'FINRA/FN'
    finra_dict = {'NYSE': 'YX','NASDAQ':'SQ'}
    if exchange == 'NYSE':
        finra_call = finra_short + 'YX_' + ticker
        print(finra_call)
        return quandl.get(finra_call, start_date=start_date, end_date=end_date)
    elif exchange == 'NASDAQ':
        finra_call = finra_short + 'SQ_' + ticker
        print(finra_call)
        return quandl.get(finra_call, start_date=start_date, end_date=end_date)
    else:
        print('error')
        return 'error'
    return

In [72]:
exchange = 'NYSE'
short_df = finra_data(ticker,start_date,end_date,exchange)

FINRA/FNYX_SPY


In [73]:
short_df

Unnamed: 0_level_0,ShortVolume,ShortExemptVolume,TotalVolume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-05-12,2111990.0,0.0,2681572.0
2015-05-13,1278698.0,0.0,1830248.0
2015-05-14,1235013.0,0.0,1652625.0
2015-05-15,1269385.0,0.0,1967962.0
2015-05-18,846827.0,0.0,1309836.0
...,...,...,...
2020-05-05,2980150.0,10.0,5148326.0
2020-05-06,3298070.0,255.0,4846833.0
2020-05-07,3652096.0,0.0,5451842.0
2020-05-08,2872024.0,0.0,4007267.0


#### NAAIM Exposure Index

In [74]:
exposure = 'NAAIM/NAAIM'
exposure_df = quandl.get(exposure, start_date=start_date, end_date=end_date)


In [75]:
exposure_df.head()

Unnamed: 0_level_0,Mean/Average,Most Bearish Response,Quart 1 (25% at/below),Quart 2 (median),Quart 3 (25% at/above),Most Bullish Response,Standard Deviation,NAAIM Number,S&P 500
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-05-13,60.38,-150.0,50.0,82.98,97.0,100.0,54.76,60.38,2098.48
2015-05-20,67.61,-150.0,50.0,80.0,100.0,200.0,55.76,67.61,2125.85
2015-05-27,65.5,-150.0,50.0,80.0,100.0,200.0,59.14,65.5,2123.48
2015-06-03,71.07,-150.0,63.65,81.0,100.0,200.0,58.57,71.07,2114.07
2015-06-10,68.7,-150.0,50.0,86.0,100.0,200.0,58.29,68.7,2105.2


In [76]:
exposure_df.columns

Index(['Mean/Average', 'Most Bearish Response', 'Quart 1 (25% at/below)',
       'Quart 2 (median)', 'Quart 3 (25% at/above)', 'Most Bullish Response',
       'Standard Deviation', 'NAAIM Number', 'S&P 500'],
      dtype='object')

In [77]:
drop_columns = ['Quart 1 (25% at/below)','Quart 3 (25% at/above)', 'Standard Deviation', 'NAAIM Number', 'S&P 500']

In [78]:
exposure_df.drop(columns=drop_columns,inplace=True)

In [81]:
exposure_df.head()

Unnamed: 0_level_0,Mean/Average,Most Bearish Response,Quart 2 (median),Most Bullish Response
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-13,60.38,-150.0,82.98,100.0
2015-05-20,67.61,-150.0,80.0,200.0
2015-05-27,65.5,-150.0,80.0,200.0
2015-06-03,71.07,-150.0,81.0,200.0
2015-06-10,68.7,-150.0,86.0,200.0


In [84]:
mich_sent = 'UMICH/SOC1'
mich_sent_df = quandl.get(mich_sent, start_date=start_date, end_date=end_date)

In [85]:
mich_sent_df.head()

Unnamed: 0_level_0,Index
Date,Unnamed: 1_level_1
2015-05-31,90.7
2015-06-30,96.1
2015-07-31,93.1
2015-08-31,91.9
2015-09-30,87.2


#### NYTimes Articles

In [130]:
#Import NYTimes API key
with open('nytimes.json') as f:
    nyt_keys= json.load(f)
nyt = articleAPI(nyt_keys['api_key'])

In [106]:
start_date

datetime.date(2015, 5, 12)

In [114]:
nyt

'1QKrhcJ2IMx7WPpuodhSE1yaa5UoFFL6'

In [179]:
subject ='Stocks'
start_date

datetime.date(2015, 5, 12)

In [190]:
articles = nyt.archive(q=subject,begin_date=start_date)

AttributeError: 'articleAPI' object has no attribute 'archive'

In [181]:
articles

{'status': 'OK',
 'copyright': 'Copyright (c) 2020 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'abstract': 'Stocks edged lower Thursday. Healthcare stocks offset gains in technology stocks, and the splashy debuts of Square and Match. Bobbi Rebell reports.',
    'web_url': 'https://www.nytimes.com/video/multimedia/100000004048340/healthcare-weakness-weighs-on-wall-street.html',
    'snippet': 'Stocks edged lower Thursday. Healthcare stocks offset gains in technology stocks, and the splashy debuts of Square and Match. Bobbi Rebell reports.',
    'lead_paragraph': 'Stocks edged lower Thursday. Healthcare stocks offset gains in technology stocks, and the splashy debuts of Square and Match. Bobbi Rebell reports.',
    'source': 'Reuters',
    'multimedia': [{'rank': 0,
      'subtype': 'watch308',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2015/11/19/multimedia/video-markets-close/video-markets-close-watch308.jpg',

In [188]:
#this goes through the return from the query
#['response']['docs'] returns a list of articles
#[Number] is each individual article
#['Final Field']- returns headline/snippet/data about each article
headlines=[]
for article in articles['response']['docs']:
    headlines.append(article['headline']['main'])

In [189]:
headlines

['Healthcare weakness weighs on Wall Street',
 'Stocks edge higher on Monday',
 'Markets rally after U.S. holiday',
 'Six losing days in a row for Dow',
 'Global stocks find support',
 'Stocks in China Continue to Fall',
 'Wall St. slips on Wal-Mart miss',
 'Hot Stocks Can Make You Rich. But They Probably Won’t.',
 'The Ease of Index Funds Comes With Risk ',
 'Tumultuous Week on Wall Street Ends With a Small Rally']