In [1]:
import sys, csv, json
import urllib
import pandas as pd
import time
import numpy as np

In [4]:
# Directory holding one JSON file of NY Times headlines per calendar day.
headlines_path = './data/days_headlines/'

# First and last calendar day (YYYYMMDD) covered by the daily index.
start_date = '20100629'
end_date = '20170911'

# One entry per calendar day, both endpoints included.
days = pd.date_range(start=start_date, end=end_date)

In [3]:
## Preparing TESLA stock data

# Read the price history and keep, by position, the date (column 0) and the
# close price (column 4). The first CSV row is treated as the header.
tsla_raw = pd.read_csv('./data/TSLA.csv')
df = pd.DataFrame({'close': tsla_raw.iloc[:, 4].values},
                  index=pd.DatetimeIndex(tsla_raw.iloc[:, 0].values),
                  dtype='float64')

# Adding missing dates to the dataframe
# Re-index onto every calendar day in `days`; days that are absent from the
# CSV get NaN (the default fill of `reindex`).
df = df.reindex(days)

# `count()` reports non-NaN rows, i.e. only the days that had a price in the CSV.
print('Without Interpolation: ', df.count())

# Fill the gaps by interpolating between the surrounding known prices
# (pandas' default method is linear).
df = df.interpolate()
print('With Interpolation: ', df.count())

('Without Interpolation: ', close    1814
dtype: int64)
('With Interpolation: ', close    2632
dtype: int64)


In [4]:
# Save the current `df` to disk as a pickle file.
df.to_pickle('./data/TSLA_stock_information.pkl')

In [5]:
# Bare expression: the notebook renders `df` as this cell's output.
df

Unnamed: 0,close
2010-06-29,23.889999
2010-06-30,23.830000
2010-07-01,21.959999
2010-07-02,19.200001
2010-07-03,18.427501
2010-07-04,17.655001
2010-07-05,16.882501
2010-07-06,16.110001
2010-07-07,15.800000
2010-07-08,17.459999


In [5]:
## Downloading NY Times headlines about Tesla Motors / Elon Musk
# Downloading and processing are two separate steps: this cell only fetches the
# raw API responses and stores one JSON file per day under `headlines_path`.
# NY Times API https://developer.nytimes.com/article_search_v2.json#/Console
# NOTE: uses the Python 2 `urllib` API (`urlencode` / `urlopen`), like the rest
# of this notebook.
from datetime import datetime, timedelta
import os.path

art_start_date = '20150426'
art_end_date = '20170911'
art_days = pd.date_range(art_start_date, art_end_date)

url = 'https://api.nytimes.com/svc/search/v2/articlesearch.json?%s'

# NOTE(review): an API key is committed in this notebook. Prefer exporting
# NYT_API_KEY in the environment; the literal is kept only as a fallback so the
# cell keeps working unchanged. Rotate the key and drop the fallback.
values = {'api-key': os.environ.get('NYT_API_KEY', '0ba6dc04a8cb44e0a890c00df88c393a'),
          'q' : 'Elon Musk',
          'fq': 'news_desk:(\"Business\",\"Tech\",\"Science\",\"Automobiles\",\"Energy\",\"U.S.\")',
          'sort':'oldest',
         }

RATE_LIMIT_PAUSE_S = 1200   # wait after a response that carries a 'message' key
EMPTY_RETRY_PAUSE_S = 5     # short pause so an empty reply is not retried in a tight loop

for day in art_days:
    # The query window and target file are fixed for a given day, so build them
    # once instead of on every retry.
    values['begin_date'] = day.strftime('%Y%m%d')
    values['end_date'] = (day + timedelta(days=1)).strftime('%Y%m%d')
    param = urllib.urlencode(values)
    file_str = headlines_path + day.strftime('%Y%m%d') + '.json'

    ok_req = False
    while not ok_req:
        f = urllib.urlopen(url % param)
        try:
            raw = f.read()
        finally:
            f.close()   # always release the connection, even if read() fails

        if not raw:
            print('EMPTY')
            time.sleep(EMPTY_RETRY_PAUSE_S)
            continue

        articles = json.loads(raw)
        # A top-level 'message' key is treated as "request refused": wait and retry.
        ok_req = 'message' not in articles
        if not ok_req:
            print('LIMIT ', day)
            time.sleep(RATE_LIMIT_PAUSE_S)
            continue

        print('DOWNOLOAD DAY ', day)

        # Merge with what was already stored for this day. The new docs are
        # added individually (extend, not append, which would nest a list
        # inside `docs`). The stored file is kept as the base even when the new
        # response is empty, so earlier results are never wiped.
        if os.path.isfile(file_str):
            with open(file_str, 'r') as fin:
                day_headers = json.load(fin)
            if 'OK' in day_headers['status']:
                day_headers['response']['docs'].extend(articles['response']['docs'])
                articles = day_headers

        # Mode 'w' truncates the file, which is the intended overwrite.
        with open(file_str, 'w') as fout:
            json.dump(articles, fout)

('DOWNOLOAD DAY ', Timestamp('2015-04-26 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-04-27 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-04-28 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-04-29 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-04-30 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-01 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-02 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-03 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-04 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-05 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-06 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-07 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-08 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-09 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-10 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-05-11 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2015-09-03 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-04 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-05 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-06 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-07 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-08 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-09 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-10 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-11 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-12 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-13 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-14 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-15 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-16 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-17 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2015-09-18 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2016-01-11 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-12 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-13 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-14 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-15 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-16 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-17 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-18 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-19 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-20 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-21 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-22 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-23 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-24 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-25 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-01-26 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2016-05-20 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-21 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-22 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-23 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-24 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-25 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-26 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-27 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-28 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-29 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-30 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-05-31 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-06-01 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-06-02 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-06-03 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-06-04 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2016-09-28 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-09-29 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-09-30 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-01 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-02 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-03 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-04 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-05 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-06 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-07 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-08 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-09 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-10 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-11 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-12 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2016-10-13 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2017-02-05 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-06 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-07 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-08 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-09 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-10 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-11 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-12 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-13 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-14 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-15 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-16 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-17 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-18 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-19 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-02-20 00:00:00', fre

('DOWNOLOAD DAY ', Timestamp('2017-06-16 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-17 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-18 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-19 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-20 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-21 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-22 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-23 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-24 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-25 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-26 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-27 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-28 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-29 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-06-30 00:00:00', freq='D'))
('DOWNOLOAD DAY ', Timestamp('2017-07-01 00:00:00', fre

In [29]:
## Preparing headlines
# For every day in `df`, load that day's stored API response and write its
# headlines into the 'headlines' column as a single string joined with '/_/'.
# A day whose response holds no articles reuses the most recent non-empty
# headline string; a day with no file on disk keeps the empty default.
df['headlines'] = ''
error_days = []       # days whose stored response did not report an OK status
last_headers = ''     # last non-empty headline string, carried forward
for day in df.index:
    file_str = headlines_path + day.strftime('%Y%m%d') + '.json'
    try:
        with open(file_str, 'r') as fin:
            day_headers = json.load(fin)
    except IOError:
        # Nothing was downloaded for this day: leave the default empty string.
        continue

    if 'OK' not in day_headers['status']:
        error_days.append(day)
        continue

    list_entries = day_headers['response']['docs']
    if len(list_entries) > 0:
        headers = ''
        for entry in list_entries:
            # Files merged by the download step may contain a whole list of
            # articles stored as one element of `docs`; flatten that level.
            docs = entry if isinstance(entry, list) else [entry]
            for header in docs:
                # Read the headline of *this* article inside the loop.
                hline = header['headline']['main'].lstrip()
                if not headers:
                    headers = hline
                else:
                    headers = headers + '/_/' + hline
        last_headers = headers
    else:
        headers = last_headers
    df.at[day, 'headlines'] = headers
        

In [30]:
# Bare expression: the notebook renders `df` as this cell's output.
df

Unnamed: 0,close,headlines
2010-06-29,23.889999,Tesla Increases Its Share Price to $17/_/Share...
2010-06-30,23.830000,Shares of Tesla Increase 40% on First Day of T...
2010-07-01,21.959999,Shares of Tesla Increase 40% on First Day of T...
2010-07-02,19.200001,Venture-Backed I.P.O.s Show Rebound as M&#038;...
2010-07-03,18.427501,Venture-Backed I.P.O.s Show Rebound as M&#038;...
2010-07-04,17.655001,Venture-Backed I.P.O.s Show Rebound as M&#038;...
2010-07-05,16.882501,Venture-Backed I.P.O.s Show Rebound as M&#038;...
2010-07-06,16.110001,Venture-Backed I.P.O.s Show Rebound as M&#038;...
2010-07-07,15.800000,Tesla Motors Shares Drop Below Initial Sale Pr...
2010-07-08,17.459999,New Ventures Win Old Financiers /_/Big Compani...


In [31]:
# Save the current `df` to disk as a pickle file. An earlier cell writes to the
# same path, so this call overwrites that file.
df.to_pickle('./data/TSLA_stock_information.pkl')