In [1]:
# !pip install yfinance
import yfinance as yf

In [2]:
import os, json, time, datetime as dt, csv, pathlib
from typing import Dict, List
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Folder that receives the raw downloads; created (with parents) if it does not exist yet.
DATA_RAW = pathlib.Path("data", "raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)

# Load variables from a .env file (python-dotenv) before reading the key from the environment.
load_dotenv()
ALPHA_KEY = os.environ.get("ALPHAVANTAGE_API_KEY")
# Print only whether a key is set, never the key itself.
print("Loaded ALPHAVANTAGE_API_KEY?", bool(ALPHA_KEY))

Loaded ALPHAVANTAGE_API_KEY? True


In [3]:
def safe_stamp():
    """Return the current local time formatted as ``YYYYMMDD-HHMMSS``."""
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"
def safe_filename(prefix: str, meta: Dict[str,str]) -> str:
    """Build a timestamped CSV filename from a prefix and descriptive metadata.

    The result has the form ``<prefix>_<k1>-<v1>-<k2>-<v2>..._<stamp>.csv`` where
    ``<stamp>`` comes from ``safe_stamp()``.

    Args:
        prefix: Leading tag of the filename; used verbatim.
        meta: Key/value pairs embedded in the name, in the mapping's iteration
            order. Values are stringified and truncated to 20 characters.

    Returns:
        The filename only (no directory component).
    """
    def _clean(text: str) -> str:
        # Keep letters, digits, '.', '_' and '-'; everything else (spaces, path
        # separators, ':' ...) becomes '-' so a key or value can never introduce
        # a directory component or a character the filesystem rejects.
        return "".join(ch if (ch.isalnum() or ch in "._-") else "-" for ch in text)

    parts = [f"{_clean(str(key))}-{_clean(str(value))[:20]}" for key, value in meta.items()]
    mid = "-".join(parts)
    return f"{prefix}_{mid}_{safe_stamp()}.csv"
def validate_df(df: pd.DataFrame, required_cols: List[str], dtypes_map: Dict[str,str])->Dict[str,str]:
    """Run lightweight schema checks on ``df`` and report the findings.

    Nothing is modified: each dtype check is a trial conversion whose result is
    discarded.

    Args:
        df: Frame to inspect.
        required_cols: Column names that must be present.
        dtypes_map: Mapping ``column -> dtype`` to test. Columns absent from
            ``df`` are skipped here (they are reported via ``required_cols``
            if listed there). Any dtype starting with ``'datetime'`` is tested
            with ``pd.to_datetime``, ``'float'`` with ``pd.to_numeric``, and
            every other dtype with ``Series.astype``.

    Returns:
        A dict of messages. ``'na_total'`` is always present; ``'missing_cols'``
        and ``'dtype_<column>'`` appear only when the matching check fails.
    """
    msgs: Dict[str, str] = {}

    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        msgs['missing_cols']=f"Missing columns: {missing}"

    for col, dtype in dtypes_map.items():
        if col not in df.columns:
            continue
        try:
            if str(dtype).startswith('datetime'):
                pd.to_datetime(df[col])
            elif dtype == 'float':
                pd.to_numeric(df[col])
            else:
                # Previously any other dtype was silently accepted without a check.
                df[col].astype(dtype)
        except Exception as e:
            # Broad on purpose: pandas raises different exception types depending
            # on the conversion, and every failure is reported the same way.
            msgs[f'dtype_{col}']=f"Failed to coerce {col} to {dtype} : {e}"

    na_counts = df.isna().sum().sum()
    msgs['na_total'] = f"Total NA values: {na_counts}"
    return msgs


In [4]:
SYMBOL = "MSFT"

# NOTE: a commented-out Alpha Vantage branch used to sit here. It was never
# executed and could not have run as written (it referenced the undefined names
# `keys` and `df.api`), so it was removed; prices come from yfinance only.
import yfinance as yf  # also imported in the first cell; repeated so this cell runs on its own

# auto_adjust is passed explicitly so the meaning of 'Close' does not depend on
# the installed yfinance version's default.
# NOTE(review): True is assumed to match the run whose output is saved below — confirm.
prices_raw = yf.download(SYMBOL, period="6mo", interval="1d", auto_adjust=True)
if prices_raw.empty:
    # Fail here rather than writing an empty CSV further down.
    raise RuntimeError(f"yfinance returned no rows for {SYMBOL!r}")

# Assigning flat names also collapses multi-level column labels, if the frame has them.
df_api = prices_raw.reset_index()[['Date', 'Close']]
df_api.columns = ['date', 'close']
df_api = df_api.sort_values('date').reset_index(drop=True)

msgs = validate_df(
    df_api,
    required_cols=['date', 'close'],
    dtypes_map={'date': 'datetime64[ns]', 'close': 'float'},
)
print(msgs)

fname = safe_filename(prefix="api", meta={"source":"yfinance", "symbol": SYMBOL})
out_path = DATA_RAW / fname
df_api.to_csv(out_path, index=False)
print("Saved:", out_path)
df_api.head()

  df_api=yf.download(SYMBOL, period="6mo", interval="1d").reset_index()[['Date','Close']]
[*********************100%***********************]  1 of 1 completed

{'na_total': 'Total NA values: 0'}
Saved: data\raw\api_source-yfinance-symbol-MSFT_20250819-230230.csv





Unnamed: 0,date,close
0,2025-02-20,415.367462
1,2025-02-21,407.461945
2,2025-02-24,403.259674
3,2025-02-25,397.170837
4,2025-02-26,398.997498


Scraping a Website

In [5]:
SCRAPE_URL = "https://en.wikipedia.org/wiki/NIFTY_50"
# Columns seen in this notebook's saved output for the scraped table; checked
# below so a change in the page layout shows up in the validation messages.
EXPECTED_SCRAPE_COLS = ['Company name', 'Symbol', 'Sector', 'Date added']

headers = {"User-Agent": "AFE-Course-Notebook/1.0"}
resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
resp.raise_for_status()
soup = BeautifulSoup(resp.text, 'html.parser')

# Only the first table carrying the 'wikitable' class is used.
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError(f"No table with class 'wikitable' found at {SCRAPE_URL}")

# One list of cell texts per table row; header cells (<th>) and data cells (<td>)
# are both collected, and rows without any cells are skipped.
rows = []
for tr in table.find_all('tr'):
    cells = [td.get_text(strip=True) for td in tr.find_all(['td','th'])]
    if cells:
        rows.append(cells)
if not rows:
    raise ValueError(f"The first 'wikitable' at {SCRAPE_URL} contains no rows")

# The first collected row supplies the column names.
header, *data = rows
df_scrape = pd.DataFrame(data, columns=header)

# Strip bracketed markers such as "[1]" or "[a]" from column names and cell values.
df_scrape.columns = df_scrape.columns.str.replace(r"\[\w+\]", "", regex=True).str.strip()
df_scrape = df_scrape.replace(r"\[\w+\]", "", regex=True)

if 'Date added' in df_scrape.columns:
    # errors='coerce' turns unparseable dates into NaT, which then counts
    # towards the 'na_total' figure reported by validate_df.
    df_scrape['Date added'] = pd.to_datetime(df_scrape['Date added'], errors='coerce')

# Validate against the expected schema; checking the frame against its own
# column list could never report a missing column.
msgs2 = validate_df(df_scrape, required_cols=EXPECTED_SCRAPE_COLS, dtypes_map={})
print(msgs2)

# NOTE(review): "example"/"markets" do not describe this source (Wikipedia, NIFTY 50);
# left unchanged so output filenames stay the same — consider renaming.
fname2 = safe_filename(prefix="scrape", meta={"site": "example", "table": "markets"})
out_path2 = DATA_RAW / fname2
df_scrape.to_csv(out_path2, index=False)
print("Saved:", out_path2)
df_scrape.head()

{'na_total': 'Total NA values: 0'}
Saved: data\raw\scrape_site-example-table-markets_20250819-230230.csv


Unnamed: 0,Company name,Symbol,Sector,Date added
0,Adani Enterprises,ADANIENT,Metals & Mining,2022-09-30
1,Adani Ports & SEZ,ADANIPORTS,Services,2015-09-28
2,Apollo Hospitals,APOLLOHOSP,Healthcare,2022-03-31
3,Asian Paints,ASIANPAINT,Consumer Durables,2012-04-27
4,Axis Bank,AXISBANK,Financial Services,2009-03-27
