In [7]:
%pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import os
import yaml
import pandas as pd
from tqdm import tqdm

# Define paths
RAW_DATA_PATH = r"D:\data-driven-stock-analysis\data\raw\data"
PROCESSED_PATH = r"D:\data-driven-stock-analysis\data\processed"

def load_yaml_file(file_path):
    try:
        with open(file_path, 'r') as file:
            data = yaml.safe_load(file)
            if isinstance(data, list):
                return data
            else:
                print(f"Unexpected format in file: {file_path}")
                return []
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return []

def extract_all_data(raw_path):
    stock_data = {}

    for root, _, files in os.walk(raw_path):
        for file in tqdm(files, desc="Reading YAML files"):
            if file.endswith('.yaml'):
                file_path = os.path.join(root, file)
                records = load_yaml_file(file_path)
                
                for record in records:
                    try:
                        symbol = record.get('Ticker')
                        if not symbol:
                            continue
                        if symbol not in stock_data:
                            stock_data[symbol] = []
                        stock_data[symbol].append(record)
                    except Exception as e:
                        print(f"Error in record from {file_path}: {e}")
    return stock_data

def save_to_csvs(stock_data, output_path):
    os.makedirs(output_path, exist_ok=True)

    for symbol, records in stock_data.items():
        if not records:
            continue
        df = pd.DataFrame(records)

        # Normalize column names
        df.rename(columns={'Ticker': 'symbol'}, inplace=True)

        if 'date' in df.columns:
            df.sort_values('date', inplace=True)

        output_file = os.path.join(output_path, f"{symbol}.csv")
        try:
            df.to_csv(output_file, index=False)
            print(f" Saved: {symbol}.csv ({len(df)} rows)")
        except Exception as e:
            print(f" Failed to save {symbol}.csv: {e}")

if __name__ == "__main__":
    print(" Starting YAML to CSV conversion...\n")
    all_stock_data = extract_all_data(RAW_DATA_PATH)
    print(f"\n Symbols found: {list(all_stock_data.keys())}")
    save_to_csvs(all_stock_data, PROCESSED_PATH)
    print("\n All done!")


 Starting YAML to CSV conversion...



Reading YAML files: 0it [00:00, ?it/s]
Reading YAML files: 100%|██████████| 20/20 [00:00<00:00, 34.00it/s]
Reading YAML files: 100%|██████████| 21/21 [00:00<00:00, 36.02it/s]
Reading YAML files: 100%|██████████| 20/20 [00:00<00:00, 42.66it/s]
Reading YAML files: 100%|██████████| 22/22 [00:00<00:00, 39.93it/s]
Reading YAML files: 100%|██████████| 21/21 [00:00<00:00, 45.46it/s]
Reading YAML files: 100%|██████████| 19/19 [00:00<00:00, 41.16it/s]
Reading YAML files: 100%|██████████| 20/20 [00:00<00:00, 46.35it/s]
Reading YAML files: 100%|██████████| 22/22 [00:00<00:00, 44.93it/s]
Reading YAML files: 100%|██████████| 19/19 [00:00<00:00, 44.49it/s]
Reading YAML files: 100%|██████████| 22/22 [00:00<00:00, 42.10it/s]
Reading YAML files: 100%|██████████| 21/21 [00:00<00:00, 46.31it/s]
Reading YAML files: 100%|██████████| 21/21 [00:00<00:00, 47.46it/s]
Reading YAML files: 100%|██████████| 22/22 [00:00<00:00, 46.90it/s]
Reading YAML files: 100%|██████████| 14/14 [00:00<00:00, 47.27it/s]



 Symbols found: ['SBIN', 'BAJFINANCE', 'TITAN', 'ITC', 'TCS', 'LT', 'TATACONSUM', 'RELIANCE', 'HCLTECH', 'JSWSTEEL', 'ULTRACEMCO', 'POWERGRID', 'INFY', 'TRENT', 'BHARTIARTL', 'TATAMOTORS', 'WIPRO', 'TECHM', 'NTPC', 'HINDUNILVR', 'APOLLOHOSP', 'M&M', 'GRASIM', 'ICICIBANK', 'ADANIENT', 'ADANIPORTS', 'BEL', 'BAJAJFINSV', 'EICHERMOT', 'COALINDIA', 'MARUTI', 'INDUSINDBK', 'ASIANPAINT', 'TATASTEEL', 'HDFCLIFE', 'DRREDDY', 'SUNPHARMA', 'KOTAKBANK', 'SHRIRAMFIN', 'NESTLEIND', 'ONGC', 'CIPLA', 'BPCL', 'BRITANNIA', 'SBILIFE', 'HINDALCO', 'HEROMOTOCO', 'AXISBANK', 'HDFCBANK', 'BAJAJ-AUTO']
 Saved: SBIN.csv (284 rows)
 Saved: BAJFINANCE.csv (284 rows)
 Saved: TITAN.csv (284 rows)
 Saved: ITC.csv (284 rows)
 Saved: TCS.csv (284 rows)
 Saved: LT.csv (284 rows)
 Saved: TATACONSUM.csv (284 rows)
 Saved: RELIANCE.csv (284 rows)
 Saved: HCLTECH.csv (284 rows)
 Saved: JSWSTEEL.csv (284 rows)
 Saved: ULTRACEMCO.csv (284 rows)
 Saved: POWERGRID.csv (284 rows)
 Saved: INFY.csv (284 rows)
 Saved: TRENT.csv 