### Collect historical price data from Yahoo Finance

In [32]:
import yfinance as yf
import pandas as pd
## Getting stock price data
#
def prepPrices(price_history, n_days,  mov_avg=0, target_col='Close'):
    """
    Clean a raw price table and re-index it by plain calendar date.

    Steps, in order: forward-fill missing values, optionally add an ``MA``
    column holding the ``mov_avg``-row rolling mean of ``target_col``
    (rounded to 2 decimals), drop every row that still contains a NaN, and
    replace the index with timezone-naive midnight timestamps so the frame
    can be merged with other date-keyed datasets.

    Parameters
    ----------
    price_history : pandas.DataFrame
        Raw price table. It is not modified; a new frame is returned.
    n_days : int
        Not used by this function; kept so existing callers keep working.
    mov_avg : int, default 0
        Rolling window size in rows. Values <= 0 skip the ``MA`` column.
    target_col : str, default 'Close'
        Column the moving average is computed from.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy indexed by a timezone-naive ``DatetimeIndex``. May be
        empty (for example when the input is empty or has fewer rows than
        ``mov_avg``).
    """
    # DataFrame.ffill() is the supported spelling; fillna(method='ffill')
    # is deprecated and emits a FutureWarning.
    prices = price_history.ffill()

    # Getting the N Day Moving Average, rounded for some light preprocessing
    if mov_avg > 0:
        prices['MA'] = prices[target_col].rolling(window=mov_avg).mean().round(2)

    # Dropping the NaNs (leading MA warm-up rows and anything ffill could not fill)
    prices = prices.dropna()

    # Ignore time in index to merge with other datasets later.
    if isinstance(prices.index, pd.DatetimeIndex):
        dates = prices.index
        if dates.tz is not None:
            # Strip the timezone while keeping the local wall-clock time.
            # Going through `.values` would convert to UTC first and shift
            # the calendar date for timezones ahead of UTC.
            dates = dates.tz_localize(None)
        prices.index = dates.normalize()
    else:
        # Non-datetime labels: keep the text before the first whitespace
        # and parse it as a date. Wrapping in DatetimeIndex keeps the
        # result well-typed even when there are no rows.
        prices.index = pd.DatetimeIndex(
            [pd.to_datetime(str(label).split()[0]) for label in prices.index.values]
        )

    return prices

def getStockPrices(stock, n_days, mov_avg, target_col='Close'):
    """
    Download price history for one ticker and return it raw and prepared.

    The history is requested from yfinance with ``period=f"{n_days}d"`` and
    then passed through ``prepPrices`` together with ``n_days``, ``mov_avg``
    and ``target_col``.

    Returns a 2-tuple ``(price_history, prices)``: the frame exactly as
    yfinance returned it, followed by the output of ``prepPrices``.
    """
    # Fetch the requested period for the designated ticker
    raw_history = yf.Ticker(stock).history(period=f"{n_days}d")

    # Clean-up, optional moving average and date-only index
    prepared = prepPrices(raw_history, n_days, mov_avg, target_col)

    return raw_history, prepared


In [65]:
# Ticker and look-back period used by the rest of the notebook
ticker = 'TSLA'
n_days = 700

# Raw yfinance history plus the prepared frame with a 5-row moving average
orig_stock, stock_prices = getStockPrices(
    stock=ticker,
    n_days=n_days,
    mov_avg=5,
    target_col='Close',
)

  prices = price_history.fillna(method='ffill')


In [66]:
# Persist the prepared prices (index included) under ../data, named after the ticker
stock_prices.to_csv(path_or_buf=f'../data/{ticker}.csv')

In [67]:
# Earliest label in the prepared price index, shown as the text before the first space
str(stock_prices.index.min()).split()[0]

'2021-09-09'

In [68]:
# Scratch arithmetic — presumably the number of days in two years (TODO confirm intent)
365*2

730

### Collect historical news from the GDELT Project

In [93]:
from gdeltdoc import GdeltDoc, Filters
import numpy as np
import datetime


# Download GDELT articles and article-volume timelines for the period covered
# by `stock_prices`, one fixed-size date window at a time.
#
# NOTE(review): the keyword below is hard-coded and does not follow `ticker`.
# NOTE(review): the concat cell further down reports 8500 rows for the 34
# windows printed here (250 per window), which suggests every query is hitting
# a per-request record cap — confirm against the gdeltdoc documentation and
# consider shorter windows if complete coverage matters.
WINDOW_DAYS = 30  # length of each query window, in days

news_data=[]   # one articles DataFrame per window
news_vol=[]    # one "timelinevol" DataFrame per window

# The client is loop-invariant, so build it once instead of per window
gd = GdeltDoc()

start=stock_prices.index.min()
while start.date()<datetime.datetime.today().date():
    print(start)
    # Clamp the window so it never extends past the current moment.
    # Builtin min() compares the two datetime-like objects directly; no
    # need to route them through a NumPy object array.
    end=min(start+datetime.timedelta(days=WINDOW_DAYS), datetime.datetime.today())
    f = Filters(
        keyword = "Tesla tsla",
        start_date = str(start).split()[0],
        end_date = str(end).split()[0]
    )

    # Search for articles matching the filters
    articles = gd.article_search(f)

    # Get a timeline of the number of articles matching the filters
    timeline = gd.timeline_search("timelinevol", f)
    news_data.append(articles)
    news_vol.append(timeline)

    # The next window starts where this one ended
    start=end


2021-09-09 00:00:00
2021-10-09 00:00:00
2021-11-08 00:00:00
2021-12-08 00:00:00
2022-01-07 00:00:00
2022-02-06 00:00:00
2022-03-08 00:00:00
2022-04-07 00:00:00
2022-05-07 00:00:00
2022-06-06 00:00:00
2022-07-06 00:00:00
2022-08-05 00:00:00
2022-09-04 00:00:00
2022-10-04 00:00:00
2022-11-03 00:00:00
2022-12-03 00:00:00
2023-01-02 00:00:00
2023-02-01 00:00:00
2023-03-03 00:00:00
2023-04-02 00:00:00
2023-05-02 00:00:00
2023-06-01 00:00:00
2023-07-01 00:00:00
2023-07-31 00:00:00
2023-08-30 00:00:00
2023-09-29 00:00:00
2023-10-29 00:00:00
2023-11-28 00:00:00
2023-12-28 00:00:00
2024-01-27 00:00:00
2024-02-26 00:00:00
2024-03-27 00:00:00
2024-04-26 00:00:00
2024-05-26 00:00:00


In [94]:
# Stack the per-window article frames into one table
news_data_df = pd.concat(news_data)

# Parse the `seendate` strings (layout YYYYMMDDTHHMMSSZ) into a datetime column
news_data_df['dt'] = [
    datetime.datetime.strptime(seen, "%Y%m%dT%H%M%SZ")
    for seen in news_data_df['seendate']
]

news_data_df.shape

(8500, 9)

In [97]:
# Keep only timestamp and headline, drop exact repeats, order chronologically
news_data_df_processed = (
    news_data_df.loc[:, ['dt', 'title']]
    .drop_duplicates()
    .sort_values(by='dt')
)


In [98]:
# Persist the de-duplicated headlines (index included) next to the price file
news_data_df_processed.to_csv(path_or_buf=f'../data/{ticker}_news.csv')