In [2]:
!pip install yfinance

Collecting yfinance
  Obtaining dependency information for yfinance from https://files.pythonhosted.org/packages/e9/43/93ea65227c938a0a4a8925d7f054b050ea743044b887170da9eca210635a/yfinance-0.2.28-py2.py3-none-any.whl.metadata
  Downloading yfinance-0.2.28-py2.py3-none-any.whl.metadata (11 kB)
Collecting pandas>=1.3.0 (from yfinance)
  Obtaining dependency information for pandas>=1.3.0 from https://files.pythonhosted.org/packages/9e/71/756a1be6bee0209d8c0d8c5e3b9fc72c00373f384a4017095ec404aec3ad/pandas-2.0.3-cp311-cp311-win_amd64.whl.metadata
  Downloading pandas-2.0.3-cp311-cp311-win_amd64.whl.metadata (18 kB)
Collecting numpy>=1.16.5 (from yfinance)
  Obtaining dependency information for numpy>=1.16.5 from https://files.pythonhosted.org/packages/72/b2/02770e60c4e2f7e158d923ab0dea4e9f146a2dbf267fec6d8dc61d475689/numpy-1.25.2-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-1.25.2-cp311-cp311-win_amd64.whl.metadata (5.7 kB)
Collecting requests>=2.31 (from yfinance)
  Obtaining dep

In [5]:
!pip install tqdm

Collecting tqdm
  Obtaining dependency information for tqdm from https://files.pythonhosted.org/packages/00/e5/f12a80907d0884e6dff9c16d0c0114d81b8cd07dc3ae54c5e962cc83037e/tqdm-4.66.1-py3-none-any.whl.metadata
  Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ----------------------------------- ---- 51.2/57.6 kB 2.6 MB/s eta 0:00:01
     ---------------------------------------- 57.6/57.6 kB 1.0 MB/s eta 0:00:00
Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
   ---------------------------------------- 0.0/78.3 kB ? eta -:--:--
   ------------------------------------ --- 71.7/78.3 kB ? eta -:--:--
   ---------------------------------------- 78.3/78.3 kB 1.5 MB/s eta 0:00:00
Installing collected packages: tqdm
Successfully installed tqdm-4.66.1


In [1]:
# import dependencies
import yfinance as yf
import pandas as pd
import datetime
import os
from tqdm import tqdm

In [2]:
# scrape all stock data from Y FInance website
url = 'https://finance.yahoo.com/world-indices'
tables = pd.read_html(url)
world_stocks = tables[0]
world_stocks

Unnamed: 0,Symbol,Name,Last Price,Change,% Change,Volume,Intraday High/Low,52 Week Range,Day Chart
0,^GSPC,,,,,,,,
1,^DJI,,,,,,,,
2,^IXIC,,,,,,,,
3,^NYA,,,,,,,,
4,^XAX,,,,,,,,
5,^BUK100P,,,,,,,,
6,^RUT,,,,,,,,
7,^VIX,Vix,17.89,0.0,0.00%,,,,
8,^FTSE,FTSE 100,7278.41,-31.8,-0.44%,,,,
9,^GDAXI,,,,,,,,


In [3]:
def fetch_data(ticker_symbol):
    try:
        ticker = yf.Ticker(ticker_symbol)
        today = datetime.date.today().strftime('%Y-%m-%d')
        data = ticker.history(start="1900-01-01", end=today)
        data.reset_index(inplace=True)
        data['ticker'] = ticker_symbol 
        return data
    except Exception as e:
        print(f"Error fetching data for {ticker_symbol}: {e}")
        return None

In [17]:
all_data = []
for symbol in tqdm(world_stocks['Symbol'], desc="Saving individual datasets"):  # tqdm progress bar!
    single_data = fetch_data(symbol)
    if single_data is not None:
        all_data.append(single_data)
    
# Concatenate all the individual datasets into one
master_data_origin = pd.concat(all_data, ignore_index=True)
master_data_origin

Saving individual datasets:  94%|█████████▍| 34/36 [00:19<00:00,  2.24it/s]^CASE30: 1d data not available for startTime=-2208996300 and endTime=1692306000. Only 100 years worth of day granularity data are allowed to be fetched per request.
Saving individual datasets: 100%|██████████| 36/36 [00:19<00:00,  1.83it/s]


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj Close
0,1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
1,1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0.0,0.0,0.0,^GSPC,
2,1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC,
3,1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC,
4,1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
...,...,...,...,...,...,...,...,...,...,...
283938,2023-08-11 00:00:00+02:00,4117.830078,4119.009766,4051.540039,4066.860107,0.0,0.0,0.0,^JN0U.JO,
283939,2023-08-14 00:00:00+02:00,4042.120117,4061.399902,3945.500000,3979.110107,0.0,0.0,0.0,^JN0U.JO,
283940,2023-08-15 00:00:00+02:00,3983.780029,3994.810059,3906.199951,3933.360107,0.0,0.0,0.0,^JN0U.JO,
283941,2023-08-16 00:00:00+02:00,3944.840088,3944.840088,3867.570068,3911.010010,0.0,0.0,0.0,^JN0U.JO,


In [5]:
# Checking any missing values per column and per rows
def data_checking(master_data_check):
    # Identify NaN or empty values
    missing_values = master_data_check.isna().sum()

    # Identify incorrect value types
    incorrect_types = master_data_check.apply(lambda x: pd.to_numeric(x, errors='coerce').isna().sum())

    # Combine the results into a DataFrame
    cleaning_report = pd.DataFrame({'Missing Values': missing_values, 'Incorrect Types': incorrect_types})

    return cleaning_report

In [6]:
data_checking(master_data_origin)

Unnamed: 0,Missing Values,Incorrect Types
Date,0,283943
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0
Dividends,0,0
Stock Splits,0,0
ticker,0,283943
Adj Close,283943,283943


In [27]:
# Copy the dataframe
master_data = master_data_origin.copy()

In [28]:
# Drop off the Adj Close column
master_data = master_data.drop(columns=['Adj Close'])

# Correct the Date column data type
master_data['Date'] = pd.to_datetime(master_data['Date'], utc = True)
master_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,1927-12-30 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC
1,1928-01-03 05:00:00+00:00,17.76,17.76,17.76,17.76,0.0,0.0,0.0,^GSPC
2,1928-01-04 05:00:00+00:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC
3,1928-01-05 05:00:00+00:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC
4,1928-01-06 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC


In [36]:
# rearrange the columns and remove the hour from the date
master_data = master_data[['ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
master_data['Date'] = pd.to_datetime((master_data['Date']).dt.date)

# remove '^' from the ticker
master_data['ticker'] = master_data['ticker'].str.replace('^', '').astype(str)
master_data

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.660000,17.660000,17.660000,17.660000,0.0
1,GSPC,1928-01-03,17.760000,17.760000,17.760000,17.760000,0.0
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.0
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.0
4,GSPC,1928-01-06,17.660000,17.660000,17.660000,17.660000,0.0
...,...,...,...,...,...,...,...
283938,JN0U.JO,2023-08-10,4117.830078,4119.009766,4051.540039,4066.860107,0.0
283939,JN0U.JO,2023-08-13,4042.120117,4061.399902,3945.500000,3979.110107,0.0
283940,JN0U.JO,2023-08-14,3983.780029,3994.810059,3906.199951,3933.360107,0.0
283941,JN0U.JO,2023-08-15,3944.840088,3944.840088,3867.570068,3911.010010,0.0


In [37]:
master_data.dtypes

ticker            object
Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object

In [38]:
data_checking(master_data)

Unnamed: 0,Missing Values,Incorrect Types
ticker,0,283943
Date,0,0
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0


In [None]:
# Save the master dataframe to a CSV file
master_data.to_csv('Data\master_stock_data.csv', index=False)