In [None]:
# 01_data_collection
Fetch commodity prices (gold, oil, wheat) and load Kaggle datasets (geopolitical risk, global news).


In [None]:
# Optional one-time setup: if yfinance or the kaggle CLI are missing from this
# kernel's environment, uncomment the line below and run this cell once.
# `%pip` (rather than a bare `pip` / `!pip`) targets the environment of the running kernel.
# %pip install yfinance kaggle


In [None]:
import os
import pandas as pd
import numpy as np
import yfinance as yf
from datetime import datetime

# Download window for the price history.
START = "2000-01-01"
END = None  # None -> yfinance takes up to today

# Filesystem layout: the notebook is expected to run from notebooks/, so the
# repository root is the parent of the current working directory.
ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(ROOT, "data")
os.makedirs(DATA_DIR, exist_ok=True)  # make sure later cells can write into it


In [None]:
# Symbols handed to yfinance for each commodity, keyed by the label used to
# look them up further down.
tickers = dict(
    Gold="GC=F",   # gold futures
    WTI="CL=F",    # WTI crude oil futures
    Wheat="ZW=F",  # wheat futures
)

def fetch_save(ticker_symbol, shortname):
    """Download the price history for one ticker and cache it as a CSV.

    The frame is fetched with ``yf.download`` over the module-level
    ``START``/``END`` window. A ``Return`` column (percentage change of
    ``Close``) and a ``Vol_5`` column (5-row rolling standard deviation of
    ``Return``) are added, and the result is written to
    ``DATA_DIR/<shortname>.csv``.

    Parameters
    ----------
    ticker_symbol : str
        Symbol passed to ``yf.download`` (e.g. ``"GC=F"``).
    shortname : str
        File stem of the CSV written into ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        The downloaded frame with its index moved into a column, or the empty
        frame exactly as returned by yfinance when nothing was downloaded (no
        file is written in that case).
    """
    print(f"Downloading {shortname} ({ticker_symbol}) ...")
    df = yf.download(ticker_symbol, start=START, end=END, progress=False, auto_adjust=True)

    # Check for an empty result before touching any column: an empty frame is
    # not guaranteed to carry 'Close'.
    if df.empty:
        print("Warning: empty dataframe for", ticker_symbol)
        return df

    # yfinance can return (field, ticker) MultiIndex columns even for a single
    # symbol. Keep only the field level so 'Close' selects a Series and the
    # CSV gets a single header row.
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    df['Return'] = df['Close'].pct_change()
    df['Vol_5'] = df['Return'].rolling(window=5).std()

    df = df.reset_index()  # Date as column
    df.to_csv(os.path.join(DATA_DIR, f"{shortname}.csv"), index=False)
    return df

# Download and cache each commodity; `shortname` becomes the CSV file stem.
df_gold = fetch_save(ticker_symbol=tickers["Gold"], shortname="gold_futures")
df_wti = fetch_save(ticker_symbol=tickers["WTI"], shortname="wti_crude")
df_wheat = fetch_save(ticker_symbol=tickers["Wheat"], shortname="wheat_futures")



In [None]:
# Report the row count and preview the first rows of each downloaded frame.
# The None guard covers the preview as well as the count: calling .head() on
# None would raise AttributeError.
for name, df in [("Gold", df_gold), ("WTI", df_wti), ("Wheat", df_wheat)]:
    print(name, "rows:", 0 if df is None else len(df))
    if df is not None:
        display(df.head())


In [None]:
# run in terminal (not in notebook) from repo root
# pip install kaggle
# mkdir -p ~/.kaggle && cp /path/to/kaggle.json ~/.kaggle/kaggle.json && chmod 600 ~/.kaggle/kaggle.json

# Cell 6: Download all required Kaggle datasets

import os

# Make sure we have a "data" directory
os.makedirs("../../data/data", exist_ok=True)

# 1. Gold & Silver Price vs Geopolitical Risk Index
!kaggle datasets download -d shreyanshdangi/gold-silver-price-vs-geopolitical-risk-19852025 -p data --unzip

# 2. Global News Dataset
!kaggle datasets download -d everydaycodings/global-news-dataset -p data --unzip

# 3. Gold Price Prediction (LSTM reference dataset)
!kaggle kernels pull farzadnekouei/gold-price-prediction-lstm-96-accuracy -p data/lstm_reference



In [None]:
# The Kaggle files are downloaded with `-p data`, i.e. relative to the working
# directory. That location gets its own name instead of rebinding DATA_DIR,
# which other cells use for the price and processed CSVs.
KAGGLE_DIR = "data"


def load_csv_if_present(path):
    """Read ``path`` with ``pd.read_csv``; print a warning and return ``None`` if the file is missing."""
    if not os.path.isfile(path):
        print("Warning: file not found, skipping:", path)
        return None
    return pd.read_csv(path)


# list files so you can adapt filenames
if os.path.isdir(KAGGLE_DIR):
    for f in sorted(os.listdir(KAGGLE_DIR)):
        print(f)

# Example load (replace filenames with the actual ones listed above).
# Each name is None only when its file has not been downloaded; loaded data is
# never overwritten by a placeholder.
gpr = load_csv_if_present(os.path.join(KAGGLE_DIR, "Gold-Silver-GeopoliticalRisk_HistoricalData.csv"))
global_news = load_csv_if_present(os.path.join(KAGGLE_DIR, 'data.csv'))


In [None]:
def add_basic_features(df):
    """Return a date-sorted copy of ``df`` with return, moving-average and volatility columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column and a price column: ``Adj Close`` is
        used when present, otherwise ``Close``. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` sorted by ``Date`` (parsed to datetime), with the price
        column cast to float and three added columns: ``Return`` (percentage
        change of the price), ``MA_5`` (5-row rolling mean of the price) and
        ``Vol_5`` (5-row rolling standard deviation of ``Return``).
    """
    out = df.copy()
    out['Date'] = pd.to_datetime(out['Date'])
    # 'Date' stays a regular column: callers save with index=False and merge on
    # the 'Date' column, so it must not be moved into the index.
    out = out.sort_values('Date')

    price_col = 'Adj Close' if 'Adj Close' in out.columns else 'Close'
    out[price_col] = out[price_col].astype(float)
    out['Return'] = out[price_col].pct_change()
    out['MA_5'] = out[price_col].rolling(window=5).mean()
    out['Vol_5'] = out['Return'].rolling(window=5).std()
    return out

# Build the feature frames; an empty download is carried forward as an empty frame.
gold_feat = add_basic_features(df_gold) if not df_gold.empty else pd.DataFrame()
wti_feat = add_basic_features(df_wti) if not df_wti.empty else pd.DataFrame()
wheat_feat = add_basic_features(df_wheat) if not df_wheat.empty else pd.DataFrame()

# Save processed
os.makedirs(DATA_DIR, exist_ok=True)  # DATA_DIR may not exist yet when this cell runs
gold_feat.to_csv(os.path.join(DATA_DIR, "gold_processed.csv"), index=False)
wti_feat.to_csv(os.path.join(DATA_DIR, "wti_processed.csv"), index=False)
wheat_feat.to_csv(os.path.join(DATA_DIR, "wheat_processed.csv"), index=False)

print("Saved processed files to", DATA_DIR)

# gpr is None when the GPR file has not been loaded; calling .head() on None
# would raise, so preview it only when it is available.
if gpr is not None:
    display(gpr.head())
else:
    print("GPR data not loaded; skipping preview.")


In [None]:
# Join the geopolitical risk (GPR) table, keyed by its 'DATE' column, onto the
# gold features, keyed by 'Date'.
# --- Prepare GPR data ---
# Parse DATE, then resample to daily frequency with forward fill.
gpr['DATE'] = pd.to_datetime(gpr['DATE'])
gpr_daily = (
    gpr.set_index('DATE')
       .resample('D')
       .ffill()
       .reset_index()
)

# --- Prepare gold_feat ---
# Move 'Date' back into a column only when it is currently the index, so the
# frame never ends up with a duplicate 'Date' column.
if gold_feat.index.name == 'Date':
    gold_feat = gold_feat.reset_index()

print("Gold feature columns:", gold_feat.columns)
print("GPR columns:", gpr_daily.columns)


# --- Merge on Date ---
# Left join: every gold row is kept, GPR values are attached where dates match.
merged_gold = gold_feat.merge(gpr_daily, how='left', left_on='Date', right_on='DATE')

print("Merged dataset preview:", merged_gold.head(), sep="\n")

In [None]:
# --- Prepare GPR data ---
# Parse the DATE column, index by it, resample to daily frequency with forward
# fill, then restore DATE as a regular column for merging.
gpr['DATE'] = pd.to_datetime(gpr['DATE'])
gpr_daily = (
    gpr
    .set_index('DATE')
    .resample('D')
    .ffill()
    .reset_index()
)

def flatten_columns(df):
    """Collapse MultiIndex column labels into single strings, in place.

    Each column tuple is joined with ``_`` after dropping falsy levels (such as
    empty strings). Frames whose columns are not a MultiIndex are left
    untouched. The same ``df`` object is returned in both cases.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        return df

    flat_names = []
    for levels in df.columns.values:
        kept = [str(part) for part in levels if part]
        flat_names.append('_'.join(kept).strip())
    df.columns = flat_names
    return df

def safe_reset(df):
    """Return ``df`` with an index named 'Date' moved into a column; otherwise return ``df`` itself."""
    date_is_index = df.index.name == 'Date'
    return df.reset_index() if date_is_index else df

def merge_with_gpr(features, gpr_frame):
    """Normalise one commodity feature frame and left-join the daily GPR table onto it.

    The feature frame is passed through ``flatten_columns`` and ``safe_reset``
    and then merged on ``Date`` (left) against ``DATE`` (right).

    Parameters
    ----------
    features : pandas.DataFrame
        Commodity feature frame; may be the empty placeholder frame used when a
        download produced no data.
    gpr_frame : pandas.DataFrame
        GPR table with a ``DATE`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The normalised feature frame and the merged frame. For an empty feature
        frame without a ``Date`` column there is nothing to join on, so an
        empty merged frame is returned instead of raising ``KeyError``.
    """
    features = safe_reset(flatten_columns(features))
    if features.empty and 'Date' not in features.columns:
        return features, pd.DataFrame()
    merged = pd.merge(
        features,
        gpr_frame,
        left_on='Date',
        right_on='DATE',
        how='left'
    )
    return features, merged


# --- Gold ---
gold_feat, merged_gold = merge_with_gpr(gold_feat, gpr_daily)

# --- Oil (WTI) ---
wti_feat, merged_wti = merge_with_gpr(wti_feat, gpr_daily)

# --- Wheat ---
wheat_feat, merged_wheat = merge_with_gpr(wheat_feat, gpr_daily)

# --- Quick checks ---
# Shapes first, then the first rows of each merged frame.
print(f"Gold merged shape: {merged_gold.shape}")
print(f"Oil merged shape: {merged_wti.shape}")
print(f"Wheat merged shape: {merged_wheat.shape}")

display(merged_gold.head(), merged_wti.head(), merged_wheat.head())


In [None]:
# Confirm that each processed CSV is present in DATA_DIR.
for fname in ("gold_processed.csv", "wti_processed.csv", "wheat_processed.csv"):
    print(fname, "->", os.path.exists(os.path.join(DATA_DIR, fname)))


In [None]:
# Version-control step -- run in a terminal from the repo root, not in the notebook
# (bare git commands are not valid Python in a code cell):
# git add data/*.csv notebooks/01_data_collection.ipynb
# git commit -m "Add data collection notebook + initial processed commodity files"
# git push origin main
