### Collect historical data from Yahoo Finance

In [4]:
import yfinance as yf
import pandas as pd
## Getting stock price data
#
def prepPrices(price_history, mov_avg=0, target_col='Close'):
    """
    Clean a raw price table and optionally add a rounded moving average.

    Parameters
    ----------
    price_history : pandas.DataFrame
        Price table indexed by date/time. It is not modified; a new frame
        is returned.
    mov_avg : int, default 0
        Window length (in rows) of the moving average. When greater than 0,
        an 'MA' column holding the rolling mean of ``target_col`` rounded to
        2 decimals is added.
    target_col : str, default 'Close'
        Column the moving average is computed from.

    Returns
    -------
    pandas.DataFrame
        Forward-filled copy of the input with every row that still contains
        a NaN removed, indexed by an unnamed, timezone-naive DatetimeIndex
        (time of day stripped) so it can be merged with other daily datasets.
    """
    # Fill gaps with the most recent known value. DataFrame.ffill() replaces
    # the deprecated fillna(method='ffill') and returns a new frame.
    prices = price_history.ffill()

    # N-row moving average of the target column, lightly rounded.
    if mov_avg > 0:
        prices['MA'] = prices[target_col].rolling(window=mov_avg).mean().round(2)

    # Drop the rows that are still incomplete (leading gaps and the first
    # mov_avg - 1 rows, which have no moving average yet).
    prices = prices.dropna()

    # Ignore time in the index so the frame can be merged with other datasets.
    if isinstance(prices.index, pd.DatetimeIndex):
        dates = prices.index
        if dates.tz is not None:
            # Drop the timezone while keeping the local wall-clock date;
            # converting to UTC first would shift the trading day for
            # exchanges east of UTC.
            dates = dates.tz_localize(None)
        dates = dates.normalize()
    else:
        # Non-datetime labels: keep only the part before the first whitespace
        # and parse it. Building a DatetimeIndex explicitly keeps this working
        # when no rows are left.
        dates = pd.DatetimeIndex(
            [pd.to_datetime(str(label).split()[0]) for label in prices.index.values]
        )
        if dates.tz is not None:
            dates = dates.tz_localize(None)

    # The index is left unnamed, matching the frame this function has always
    # returned (this keeps the header of any CSV written from it unchanged).
    prices.index = dates.rename(None)

    return prices

def getStockPrices(stock, history_len, mov_avg, target_col='Close'):
    """
    Download the price history of one ticker from Yahoo Finance and prepare it.

    Parameters
    ----------
    stock : str
        Ticker symbol handed to ``yf.Ticker``.
    history_len : str
        Look-back window, forwarded unchanged as the ``period`` argument of
        ``Ticker.history`` (this notebook uses values such as '2y').
    mov_avg : int
        Moving-average window forwarded to ``prepPrices``.
    target_col : str, default 'Close'
        Column forwarded to ``prepPrices`` for the moving average.

    Returns
    -------
    tuple
        ``(raw_history, prepared)``: the frame exactly as returned by
        yfinance, followed by the output of ``prepPrices`` for that frame.
    """
    # Fetch the raw history for the requested period in one call.
    raw_history = yf.Ticker(stock).history(period=history_len)

    # Hand back both the untouched download and its cleaned counterpart.
    return raw_history, prepPrices(raw_history, mov_avg, target_col)


In [5]:
# Ticker to analyse; also used to name the CSV files written later.
ticker='TSLA'
# NOTE(review): n_days no longer feeds the download window (the '<n_days>d'
# period built from it was always overwritten by '2y' before use); it is kept
# in case other cells still reference it.
n_days=700
# Look-back window, passed to yfinance as the `period` of Ticker.history.
history_len='2y'
# orig_stock: raw download; stock_prices: cleaned frame with a 5-row 'MA' column.
orig_stock, stock_prices = getStockPrices(ticker, history_len, mov_avg=5, target_col='Close')

  prices = price_history.fillna(method='ffill')


In [6]:
# Display the prepared price table (price columns plus the 'MA' moving average).
stock_prices

Unnamed: 0,Open,High,Low,Close,Volume,Dividends,Stock Splits,MA
2022-06-27,249.366669,252.070007,242.566666,244.919998,89178300,0.0,0.0,239.76
2022-06-28,244.483337,249.970001,232.343338,232.663330,90391200,0.0,0.0,238.89
2022-06-29,230.500000,231.173340,222.273331,228.490005,82897200,0.0,0.0,237.37
2022-06-30,224.509995,229.456665,218.863327,224.473328,94600500,0.0,0.0,235.25
2022-07-01,227.000000,230.229996,222.119995,227.263336,74460300,0.0,0.0,231.56
...,...,...,...,...,...,...,...,...
2024-06-11,173.919998,174.750000,167.410004,170.660004,64761900,0.0,0.0,174.97
2024-06-12,171.119995,180.550003,169.800003,177.289993,90389400,0.0,0.0,175.43
2024-06-13,188.389999,191.080002,181.229996,182.470001,118984100,0.0,0.0,176.34
2024-06-14,185.800003,186.000000,176.919998,178.009995,81361700,0.0,0.0,176.44


In [7]:
# Persist the prepared prices as ../data/<ticker>.csv (path is relative to the
# notebook's working directory).
stock_prices.to_csv(f'../data/{ticker}.csv')

In [8]:
# Earliest date in the prepared price table, shown as the date part of its
# string form (text before the first space).
str(stock_prices.index.min()).split()[0]

'2022-06-27'

### Collect historical news from the GDELT Project

In [9]:
from gdeltdoc import GdeltDoc, Filters
import numpy as np
import datetime


# Per-window results: one article table and one volume timeline per request.
news_data=[]
news_vol=[]

# The client is loop-invariant, so it is created once and reused for every window.
gd = GdeltDoc()

# Walk from the first price date up to today in windows of at most 30 days.
start=stock_prices.index.min()
while start.date()<datetime.datetime.today().date():
    print(start)
    # End of the current window, clipped so it never runs past the present.
    end=min(start+datetime.timedelta(days=30), datetime.datetime.today())
    f = Filters(
        keyword = "Tesla tsla",
        start_date = str(start).split()[0],
        end_date = str(end).split()[0]
    )

    # Search for articles matching the filters
    # NOTE(review): gdeltdoc caps the number of articles returned per request
    # (250 by default, per its documentation; confirm); busy windows may be
    # truncated.
    articles = gd.article_search(f)

    # Get a timeline of the number of articles matching the filters
    timeline = gd.timeline_search("timelinevol", f)
    news_data.append(articles)
    news_vol.append(timeline)

    # The next window starts where this one ended, so consecutive requests
    # share their boundary date.
    start=end


2022-06-27 00:00:00
2022-07-27 00:00:00
2022-08-26 00:00:00
2022-09-25 00:00:00
2022-10-25 00:00:00
2022-11-24 00:00:00
2022-12-24 00:00:00
2023-01-23 00:00:00
2023-02-22 00:00:00
2023-03-24 00:00:00
2023-04-23 00:00:00
2023-05-23 00:00:00
2023-06-22 00:00:00
2023-07-22 00:00:00
2023-08-21 00:00:00
2023-09-20 00:00:00
2023-10-20 00:00:00
2023-11-19 00:00:00
2023-12-19 00:00:00
2024-01-18 00:00:00
2024-02-17 00:00:00
2024-03-18 00:00:00
2024-04-17 00:00:00
2024-05-17 00:00:00
2024-06-16 00:00:00


In [10]:
# Stack the per-window article tables into a single frame.
news_data_df = pd.concat(news_data)

# Parse the 'seendate' strings (e.g. 20220627T120000Z) into datetime values.
news_data_df['dt'] = news_data_df['seendate'].map(
    lambda seen: datetime.datetime.strptime(seen, "%Y%m%dT%H%M%SZ")
)

# (rows, columns) of the combined article table.
news_data_df.shape

(6012, 9)

In [11]:
# Keep only timestamp and headline, drop exact repeats, and order chronologically.
news_data_df_processed = (
    news_data_df.loc[:, ['dt', 'title']]
    .drop_duplicates()
    .sort_values(by='dt')
)


In [12]:
# Persist the de-duplicated headlines as ../data/<ticker>_news.csv (path is
# relative to the notebook's working directory).
news_data_df_processed.to_csv(f'../data/{ticker}_news.csv')