# Getting data for the machine learning model

In [None]:
import logging
import os
from pathlib import Path

import numpy as np
import pandas as pd
import yfinance as yf

In [None]:
# Executes the sibling notebook before the cells below. calculate_all_stocks_related_data,
# which is called later in this file, is not defined here and presumably comes from that
# notebook - verify.
%run "Moving Average + Bollinger bands.ipynb"

In [None]:
# Listing table; later cells read its "Symbol", "Name", "Sector" and "Volume" columns.
companies_csv_path = "../Data/Symbols_data/AmericanAndCanadianCompanies.csv"
all_companies = pd.read_csv(companies_csv_path)

In [None]:
# Rows without a sector are grouped under "Other"; the sector is later used as a
# directory name, so it must not be missing.
all_companies["Sector"] = all_companies["Sector"].fillna("Other")

# Keep only listings whose name mentions "Common Stock" and whose "Volume" value is
# at least one million. na=False makes rows with a missing name count as non-matching
# instead of leaving NaN in the boolean mask.
# All columns of all_companies are kept in necessary_data.
is_common_stock = all_companies["Name"].str.contains("Common Stock", na=False)
has_enough_volume = all_companies["Volume"] >= 1000000
necessary_data = all_companies[is_common_stock & has_enough_volume]
len(all_companies), len(necessary_data)

In [None]:
# Preview the first 20 selected companies.
necessary_data.head(20)

In [None]:
def save_downloaded_data(symbols, dir_path = None):
    '''
        Downloads the full price history of one ticker from Yahoo Finance, stores it
        as "<company name>.csv" and then fetches the ticker's earnings dates.

        The company name and sector are looked up in the global `necessary_data`
        frame. Errors raised while downloading or processing are logged, not raised.

        Arguments:
            symbols (str) - a single ticker symbol (one value of necessary_data["Symbol"])
            dir_path (str or path-like, optional) - directory for the CSV file; when
                omitted, "../Data/Stocks_data/<Sector>" is used

        Returns:
            None
    '''
    row = necessary_data.loc[necessary_data["Symbol"] == symbols]
    if row.empty:
        # Without a matching row there is no name or sector to build the path from.
        logging.warning("Symbol %s not found in necessary_data, skipping", symbols)
        return

    # Read scalars from the first match rather than Series.to_string(), which
    # returns formatted text (several lines when more than one row matches).
    first_match = row.iloc[0]
    label = str(first_match["Symbol"])
    directory_name = str(first_match["Sector"])
    file_name = str(first_match["Name"])

    if dir_path is None:
        directory_path = f"../Data/Stocks_data/{directory_name}"
    else:
        directory_path = dir_path
    Path(directory_path).mkdir(exist_ok=True, parents=True)

    full_path = os.path.join(directory_path, file_name).replace('\\', "/")

    try:
        download_result = yf.download(label, period = 'max', auto_adjust = True)
        if download_result is None or download_result.empty:
            # Nothing came back, so do not leave an empty CSV behind.
            logging.warning("No price data returned for %s, skipping", label)
            return

        download_result.to_csv(f'{full_path}.csv')
        print(f'{full_path} downloaded')

        # The file is read back so that the frame passed on has the on-disk layout
        # (presumably with the date index as a regular column - verify).
        prices = pd.read_csv(f'{full_path}.csv')
        # The return value is not used, so this helper presumably adds its columns
        # to `prices` in place; it is not defined in this part of the file.
        calculate_all_stocks_related_data(prices)
        prices.to_csv(f'{full_path}.csv', index = False)

        # Ask for more earnings dates when the price history has more than 1500 rows.
        cust_limit = 22 if len(prices.index) > 1500 else 15
        get_all_earnings_dates(label, cust_limit)
    except Exception:
        # Best effort: one failing ticker must not abort a bulk run over many symbols.
        logging.exception("An Exception was thrown! (symbol: %s)", label)
    

In [None]:
# Download the price history for every selected symbol after the first 1000 entries.
# The mapped result only holds the return values of save_downloaded_data.
symbols_to_download = necessary_data["Symbol"][1000:]
test = symbols_to_download.map(save_downloaded_data)

In [None]:
# necessary_data keeps the row labels of all_companies after filtering, so label 1 is
# not guaranteed to exist; pick the second selected symbol by position instead.
test_ticker = yf.Ticker(necessary_data["Symbol"].iloc[1])

In [None]:
def _write_earnings_dates(ticker, limit, full_path):
    '''
        Fetches up to `limit` earnings dates from `ticker` and writes them to
        "<full_path>_earnings.csv".
    '''
    result = ticker.get_earnings_dates(limit = limit)
    # The first four rows are dropped before saving (presumably the upcoming, not yet
    # reported dates - verify). The copy makes the column assignments below operate
    # on an independent frame instead of a slice.
    result = result.iloc[4:].copy()

    # The CSV is written without its index, so the index values are exposed as a
    # plain date column first.
    result["Earnings Date"] = result.index.values
    result["Earnings Date"] = pd.to_datetime(result["Earnings Date"])
    result["Earnings Date"] = result["Earnings Date"].dt.date
    result.to_csv(f'{full_path}_earnings.csv', index = False)


def get_all_earnings_dates(symbol, custom_limit = 21, directory_path = None):
    '''
        Saves the earnings dates of one ticker as "<company name>_earnings.csv".

        The company name is looked up in the global `necessary_data` frame. If the
        request with `custom_limit` fails, it is retried once with a limit of 8.
        Failures are printed, not raised.

        Arguments:
            symbol (str) - ticker symbol (one value of necessary_data["Symbol"])
            custom_limit (int) - number of earnings dates requested on the first attempt
            directory_path (str, optional) - output directory; defaults to
                "../Data/Earnings Dates"

        Returns:
            None
    '''
    row = necessary_data.loc[necessary_data["Symbol"] == symbol]
    if row.empty:
        # Without a matching row there is no company name to build the file name from.
        logging.warning("Symbol %s not found in necessary_data, skipping", symbol)
        return

    # Read scalars from the first match rather than Series.to_string(), which
    # returns formatted text (several lines when more than one row matches).
    first_match = row.iloc[0]
    label = str(first_match["Symbol"])
    file_name = str(first_match["Name"])

    base_path = "../Data/Earnings Dates" if directory_path is None else directory_path
    Path(base_path).mkdir(exist_ok=True, parents=True)
    full_path = f"{base_path}/{file_name}"

    try:
        _write_earnings_dates(yf.Ticker(label), custom_limit, full_path)
    except AttributeError:
        # Presumably get_earnings_dates returned None, so `.iloc` failed - verify.
        print("File was delisted, skip!")
    except Exception:
        # Any other failure: retry once with a smaller request before giving up.
        try:
            _write_earnings_dates(yf.Ticker(label), 8, full_path)
        except Exception as ex:
            print(f"{label} File was skipped")
            print(str(ex))

In [None]:
# Fetch the earnings dates for every selected symbol after the first 1000 entries,
# using the default limit and output directory of get_all_earnings_dates.
symbols_for_earnings = necessary_data["Symbol"][1000:]
test = symbols_for_earnings.map(get_all_earnings_dates)

In [None]:
def split_quarterly(stocks_prices, close_price, earning_dates, max_window = 61):
    '''
        Splits price data into windows delimited by consecutive earnings dates.

        The "Earnings Date" values are taken in reverse row order (presumably the
        file lists the newest date first - verify). For each run of three
        consecutive dates (d0, d1, d2):
            X gets the rows of `stocks_prices` with d0 <= Date <= d1
            y gets the "Close" values of `close_price` with d1 <= Date <= d2
        Both are cut to at most `max_window` rows.

        Arguments:
            stocks_prices (dataframe) - must have a "Date" column; it is converted
                to datetime in place, which also affects the caller's frame
            close_price (dataframe) - must have "Date" and "Close" columns
            earning_dates (dataframe) - must have an "Earnings Date" column; it is
                only read, never modified
            max_window (int) - maximum number of rows kept per window

        Returns:
            (X, y) as numpy arrays built from the collected windows.
            NOTE(review): if the windows do not all have the same length the input
            is ragged, which recent NumPy versions reject with a ValueError.
    '''
    # Work on a reversed local list so the caller's frame keeps its order; reversing
    # the column in place would flip it back on every second call.
    dates = earning_dates["Earnings Date"].tolist()[::-1]

    # Kept in place on purpose: the conversion is idempotent, and a caller passing
    # the same frame as `close_price` gets datetime comparisons there as well.
    stocks_prices["Date"] = pd.to_datetime(stocks_prices["Date"])

    X, y = [], []
    for start, middle, end in zip(dates, dates[1:], dates[2:]):
        in_feature_window = stocks_prices["Date"].between(start, middle)
        X.append(stocks_prices[in_feature_window].iloc[:max_window])

        in_target_window = close_price["Date"].between(middle, end)
        y.append(close_price["Close"][in_target_window].iloc[:max_window])

    return np.array(X), np.array(y)