In [14]:
# !pip install yfinance
# !pip install tqdm

In [15]:
# import dependencies
import yfinance as yf
import pandas as pd
import datetime
import os
from tqdm import tqdm

In [16]:
# scrape all stock data from Y FInance website
url = 'https://finance.yahoo.com/world-indices'
tables = pd.read_html(url)
world_stocks = tables[0]
world_stocks

Unnamed: 0,Symbol,Name,Last Price,Change,% Change,Volume,Intraday High/Low,52 Week Range,Day Chart
0,^GSPC,,,,,,,,
1,^DJI,,,,,,,,
2,^IXIC,,,,,,,,
3,^NYA,,,,,,,,
4,^XAX,,,,,,,,
5,^BUK100P,,,,,,,,
6,^RUT,,,,,,,,
7,^VIX,Vix,18.23,0.34,+1.90%,,,,
8,^FTSE,FTSE 100,7248.49,-61.72,-0.84%,,,,
9,^GDAXI,,,,,,,,


In [17]:
def fetch_data(ticker_symbol):
    try:
        ticker = yf.Ticker(ticker_symbol)
        today = datetime.date.today().strftime('%Y-%m-%d')
        data = ticker.history(start="1900-01-01", end=today)
        data.reset_index(inplace=True)
        data['ticker'] = ticker_symbol 
        return data
    except Exception as e:
        print(f"Error fetching data for {ticker_symbol}: {e}")
        return None

In [18]:
all_data = []
for symbol in tqdm(world_stocks['Symbol'], desc="Fetching data"):  # tqdm progress bar!
    single_data = fetch_data(symbol)
    if single_data is not None:
        all_data.append(single_data)
    
# Concatenate all the individual datasets into one
master_data_origin = pd.concat(all_data, ignore_index=True)
master_data_origin

Fetching data:   0%|          | 0/36 [00:00<?, ?it/s]

Fetching data:  94%|█████████▍| 34/36 [00:19<00:01,  1.71it/s]^CASE30: 1d data not available for startTime=-2208996300 and endTime=1692306000. Only 100 years worth of day granularity data are allowed to be fetched per request.
Fetching data: 100%|██████████| 36/36 [00:19<00:00,  1.80it/s]


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj Close
0,1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
1,1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0.0,0.0,0.0,^GSPC,
2,1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC,
3,1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC,
4,1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
...,...,...,...,...,...,...,...,...,...,...
283938,2023-08-11 00:00:00+02:00,4117.830078,4119.009766,4051.540039,4066.860107,0.0,0.0,0.0,^JN0U.JO,
283939,2023-08-14 00:00:00+02:00,4042.120117,4061.399902,3945.500000,3979.110107,0.0,0.0,0.0,^JN0U.JO,
283940,2023-08-15 00:00:00+02:00,3983.780029,3994.810059,3906.199951,3933.360107,0.0,0.0,0.0,^JN0U.JO,
283941,2023-08-16 00:00:00+02:00,3944.840088,3944.840088,3867.570068,3911.010010,0.0,0.0,0.0,^JN0U.JO,


In [19]:
# Checking any missing values per column and per rows
def data_checking(master_data_check):
    # Identify NaN or empty values
    missing_values = master_data_check.isna().sum()

    # Identify incorrect value types
    incorrect_types = master_data_check.apply(lambda x: pd.to_numeric(x, errors='coerce').isna().sum())

    # Combine the results into a DataFrame
    cleaning_report = pd.DataFrame({'Missing Values': missing_values, 'Incorrect Types': incorrect_types})

    return cleaning_report

In [20]:
data_checking(master_data_origin)

Unnamed: 0,Missing Values,Incorrect Types
Date,0,283943
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0
Dividends,0,0
Stock Splits,0,0
ticker,0,283943
Adj Close,283943,283943


In [21]:
# Copy the dataframe
master_data = master_data_origin.copy()

In [22]:
# Drop off the Adj Close column
master_data = master_data.drop(columns=['Adj Close'])

# Correct the Date column data type
master_data['Date'] = pd.to_datetime(master_data['Date'], utc = True)
master_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,1927-12-30 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC
1,1928-01-03 05:00:00+00:00,17.76,17.76,17.76,17.76,0.0,0.0,0.0,^GSPC
2,1928-01-04 05:00:00+00:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC
3,1928-01-05 05:00:00+00:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC
4,1928-01-06 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC


In [23]:
# rearrange the columns and remove the hour from the date
master_data = master_data[['ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
master_data['Date'] = pd.to_datetime((master_data['Date']).dt.date)

# remove '^' from the ticker
master_data['ticker'] = master_data['ticker'].str.replace('^', '').astype(str)
master_data

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.660000,17.660000,17.660000,17.660000,0.0
1,GSPC,1928-01-03,17.760000,17.760000,17.760000,17.760000,0.0
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.0
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.0
4,GSPC,1928-01-06,17.660000,17.660000,17.660000,17.660000,0.0
...,...,...,...,...,...,...,...
283938,JN0U.JO,2023-08-10,4117.830078,4119.009766,4051.540039,4066.860107,0.0
283939,JN0U.JO,2023-08-13,4042.120117,4061.399902,3945.500000,3979.110107,0.0
283940,JN0U.JO,2023-08-14,3983.780029,3994.810059,3906.199951,3933.360107,0.0
283941,JN0U.JO,2023-08-15,3944.840088,3944.840088,3867.570068,3911.010010,0.0


In [24]:
master_data.dtypes

ticker            object
Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object

In [25]:
data_checking(master_data)

Unnamed: 0,Missing Values,Incorrect Types
ticker,0,283943
Date,0,0
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0


In [27]:
# Save the master dataframe to a CSV file
master_data.to_csv('Data\master_stock_data.csv', index=False)