In [95]:
# !pip install yfinance
# !pip install tqdm
!pip install sqlalchemy
!pip install psycopg2-binary



In [96]:
# import dependencies
import yfinance as yf
import pandas as pd
import datetime
import os
from tqdm import tqdm

# import sqlalchemy for database connection
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.orm import sessionmaker

#Config should contain database username as username and database password as password
from config import password 

In [97]:
# scrape all stock data from Y FInance website
url = 'https://finance.yahoo.com/world-indices'
tables = pd.read_html(url)
world_stocks = tables[0]
world_stocks

Unnamed: 0,Symbol,Name,Last Price,Change,% Change,Volume,Intraday High/Low,52 Week Range,Day Chart
0,^GSPC,,,,,,,,
1,^DJI,,,,,,,,
2,^IXIC,,,,,,,,
3,^NYA,,,,,,,,
4,^XAX,,,,,,,,
5,^BUK100P,,,,,,,,
6,^RUT,,,,,,,,
7,^VIX,Vix,16.88,-0.32,-1.86%,,,,
8,^FTSE,FTSE 100,7375.01,41.38,+0.56%,,,,
9,^GDAXI,,,,,,,,


In [98]:
def fetch_data(ticker_symbol):
    try:
        ticker = yf.Ticker(ticker_symbol)
        today = datetime.date.today().strftime('%Y-%m-%d')
        data = ticker.history(start="1900-01-01", end=today)
        data.reset_index(inplace=True)
        data['ticker'] = ticker_symbol 
        return data
    except Exception as e:
        print(f"Error fetching data for {ticker_symbol}: {e}")
        return None

In [99]:
all_data = []
for symbol in tqdm(world_stocks['Symbol'], desc="Fetching data"):  # tqdm progress bar!
    single_data = fetch_data(symbol)
    if single_data is not None:
        all_data.append(single_data)
    
# Concatenate all the individual datasets into one
master_data_origin = pd.concat(all_data, ignore_index=True)
master_data_origin

Fetching data:  94%|█████████▍| 34/36 [01:01<00:01,  1.36it/s]^CASE30: 1d data not available for startTime=-2208996300 and endTime=1692910800. Only 100 years worth of day granularity data are allowed to be fetched per request.
Fetching data: 100%|██████████| 36/36 [01:01<00:00,  1.72s/it]


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker,Adj Close
0,1927-12-30 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
1,1928-01-03 00:00:00-05:00,17.760000,17.760000,17.760000,17.760000,0.0,0.0,0.0,^GSPC,
2,1928-01-04 00:00:00-05:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC,
3,1928-01-05 00:00:00-05:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC,
4,1928-01-06 00:00:00-05:00,17.660000,17.660000,17.660000,17.660000,0.0,0.0,0.0,^GSPC,
...,...,...,...,...,...,...,...,...,...,...
284107,2023-08-18 00:00:00+02:00,3888.899902,3889.550049,3799.290039,3833.129883,0.0,0.0,0.0,^JN0U.JO,
284108,2023-08-21 00:00:00+02:00,3818.739990,3889.449951,3816.969971,3861.469971,0.0,0.0,0.0,^JN0U.JO,
284109,2023-08-22 00:00:00+02:00,3891.000000,3950.280029,3873.770020,3883.989990,0.0,0.0,0.0,^JN0U.JO,
284110,2023-08-23 00:00:00+02:00,3887.750000,3977.760010,3886.830078,3975.689941,0.0,0.0,0.0,^JN0U.JO,


In [100]:
# Checking any missing values per column and per rows
def data_checking(master_data_check):
    # Identify NaN or empty values
    missing_values = master_data_check.isna().sum()

    # Identify incorrect value types
    incorrect_types = master_data_check.apply(lambda x: pd.to_numeric(x, errors='coerce').isna().sum())

    # Combine the results into a DataFrame
    cleaning_report = pd.DataFrame({'Missing Values': missing_values, 'Incorrect Types': incorrect_types})

    return cleaning_report

In [101]:
data_checking(master_data_origin)

Unnamed: 0,Missing Values,Incorrect Types
Date,0,284112
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0
Dividends,0,0
Stock Splits,0,0
ticker,0,284112
Adj Close,284112,284112


In [102]:
# Copy the dataframe
master_data = master_data_origin.copy()

In [103]:
# Drop off the Adj Close column
master_data = master_data.drop(columns=['Adj Close'])

# Correct the Date column data type
master_data['Date'] = pd.to_datetime(master_data['Date'], utc = True)
master_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,ticker
0,1927-12-30 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC
1,1928-01-03 05:00:00+00:00,17.76,17.76,17.76,17.76,0.0,0.0,0.0,^GSPC
2,1928-01-04 05:00:00+00:00,17.719999,17.719999,17.719999,17.719999,0.0,0.0,0.0,^GSPC
3,1928-01-05 05:00:00+00:00,17.549999,17.549999,17.549999,17.549999,0.0,0.0,0.0,^GSPC
4,1928-01-06 05:00:00+00:00,17.66,17.66,17.66,17.66,0.0,0.0,0.0,^GSPC


In [104]:
# rearrange the columns and remove the hour from the date
master_data = master_data[['ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
master_data['Date'] = pd.to_datetime((master_data['Date']).dt.date)

# remove '^' from the ticker
master_data['ticker'] = master_data['ticker'].str.replace('^', '').astype(str)
master_data

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.660000,17.660000,17.660000,17.660000,0.0
1,GSPC,1928-01-03,17.760000,17.760000,17.760000,17.760000,0.0
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.0
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.0
4,GSPC,1928-01-06,17.660000,17.660000,17.660000,17.660000,0.0
...,...,...,...,...,...,...,...
284107,JN0U.JO,2023-08-17,3888.899902,3889.550049,3799.290039,3833.129883,0.0
284108,JN0U.JO,2023-08-20,3818.739990,3889.449951,3816.969971,3861.469971,0.0
284109,JN0U.JO,2023-08-21,3891.000000,3950.280029,3873.770020,3883.989990,0.0
284110,JN0U.JO,2023-08-22,3887.750000,3977.760010,3886.830078,3975.689941,0.0


In [105]:
master_data.dtypes

ticker            object
Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object

In [106]:
data_checking(master_data)

Unnamed: 0,Missing Values,Incorrect Types
ticker,0,284112
Date,0,0
Open,0,0
High,0,0
Low,0,0
Close,0,0
Volume,0,0


In [107]:
# Save the master dataframe to a CSV file
# master_data.to_csv('Data\master_stock_data.csv', index=False)

In [108]:
# connect to SQL database 

protocol = 'postgresql'
host = 'localhost'
port = 5432
database_name = 'yahoo_stock_db'
rds_connection_string = f'{protocol}://postgres:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)
insp = inspect(engine)

In [109]:
# check tables
insp.get_table_names()

['stocks']

In [110]:
# For the initial data collection and storage without creating a schema in the database
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, String, Float, DateTime

Base = declarative_base()

class Table(Base):
    __tablename__ = 'stocks'
    ticker = Column(String, primary_key=True)
    Date = Column(DateTime, primary_key=True)
    Open = Column(Float)
    High = Column(Float)
    Low = Column(Float)
    Close = Column(Float)
    Volume = Column(Float)

Base.metadata.create_all(engine)

# Save data to database
Session = sessionmaker(bind=engine)
session = Session()

  Base = declarative_base()


In [111]:
#checking data
pd.read_sql_query('select * from stocks', con=engine).head()

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.66,17.66,17.66,17.66,0.0
1,GSPC,1928-01-03,17.76,17.76,17.76,17.76,0.0
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.0
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.0
4,GSPC,1928-01-06,17.66,17.66,17.66,17.66,0.0


In [112]:
# Pushing data to database
# master_data.to_sql(name="stocks", con=engine, if_exists="replace", index=False)

In [113]:
# Create a list of dictionaries for new and updated records
records_to_insert = []
records_to_update = []

for index, row in master_data.iterrows():
    existing_data = session.query(Table).filter_by(ticker=row['ticker'], Date=row['Date']).first()
    
    if existing_data:
        # Compare values and update if needed
        if existing_data.Open != row['Open'] or \
           existing_data.High != row['High'] or \
           existing_data.Low != row['Low'] or \
           existing_data.Close != row['Close'] or \
           existing_data.Volume != row['Volume']:
            records_to_update.append({'ticker': row['ticker'], 'Date': row['Date'],
                                      'Open': row['Open'], 'High': row['High'], 'Low': row['Low'],
                                      'Close': row['Close'], 'Volume': row['Volume']})
    else:
        # If data doesn't exist, add it to the session
        records_to_insert.append({'ticker': row['ticker'], 'Date': row['Date'],
                                  'Open': row['Open'], 'High': row['High'], 'Low': row['Low'],
                                  'Close': row['Close'], 'Volume': row['Volume']})
    break


In [115]:
# Bulk update existing records
if records_to_update:
    session.bulk_update_mappings(Table, records_to_update)

# Bulk insert new records
if records_to_insert:
    session.bulk_insert_mappings(Table, records_to_insert)

session.commit()

In [118]:
# retrieve data from postgresql database
data_sql= pd.read_sql_query('select * from stocks', con=engine)
data_sql

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.660000,17.660000,17.660000,17.660000,0.0
1,GSPC,1928-01-03,17.760000,17.760000,17.760000,17.760000,0.0
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.0
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.0
4,GSPC,1928-01-06,17.660000,17.660000,17.660000,17.660000,0.0
...,...,...,...,...,...,...,...
284037,JN0U.JO,2023-08-15,3944.840088,3944.840088,3867.570068,3911.010010,0.0
284038,JN0U.JO,2023-08-16,3890.179932,3920.449951,3857.129883,3887.399902,0.0
284039,JN0U.JO,2023-08-17,3888.899902,3889.550049,3799.290039,3833.129883,0.0
284040,JN0U.JO,2023-08-20,3818.739990,3889.449951,3816.969971,3861.469971,0.0


In [123]:
# create a list of unique stock symbols
stock_symbols = data_sql['ticker'].unique().tolist()

# Split data into individual stocks
individual_stocks = {}
for ticker_name in data_sql['ticker'].unique():
    individual_stocks[ticker_name] = data_sql[data_sql['ticker'] == ticker_name]
    individual_stocks[ticker_name].reset_index(inplace=True, drop=True)

individual_stocks['GSPC']

Unnamed: 0,ticker,Date,Open,High,Low,Close,Volume
0,GSPC,1927-12-30,17.660000,17.660000,17.660000,17.660000,0.000000e+00
1,GSPC,1928-01-03,17.760000,17.760000,17.760000,17.760000,0.000000e+00
2,GSPC,1928-01-04,17.719999,17.719999,17.719999,17.719999,0.000000e+00
3,GSPC,1928-01-05,17.549999,17.549999,17.549999,17.549999,0.000000e+00
4,GSPC,1928-01-06,17.660000,17.660000,17.660000,17.660000,0.000000e+00
...,...,...,...,...,...,...,...
24020,GSPC,2023-08-16,4433.790039,4449.950195,4403.549805,4404.330078,3.753910e+09
24021,GSPC,2023-08-17,4416.319824,4421.169922,4364.830078,4370.359863,3.943700e+09
24022,GSPC,2023-08-18,4344.879883,4381.819824,4335.310059,4369.709961,3.940400e+09
24023,GSPC,2023-08-21,4380.279785,4407.549805,4360.299805,4399.770020,3.726850e+09


In [124]:
print (stock_symbols)

['GSPC', 'DJI', 'IXIC', 'NYA', 'XAX', 'BUK100P', 'RUT', 'VIX', 'FTSE', 'GDAXI', 'FCHI', 'STOXX50E', 'N100', 'BFX', 'IMOEX.ME', 'N225', 'HSI', '000001.SS', '399001.SZ', 'STI', 'AXJO', 'AORD', 'BSESN', 'JKSE', 'KLSE', 'NZ50', 'KS11', 'TWII', 'GSPTSE', 'BVSP', 'MXX', 'IPSA', 'MERV', 'TA125.TA', 'JN0U.JO']
