# Step 0: Overview
In this notebook, we:
- Load processed commodity data and merged GPR from previous notebook.
- Preprocess Global News Dataset: clean, extract geopolitical keyword counts per day.
- Merge news features with commodity + GPR data.
 - Perform EDA: plots, correlations, distributions, stationarity tests.
 - Add advanced features (lags, rolling averages, event dummy).
 - Save final merged datasets and figures.

# Step 1: Import Libraries and Set Paths

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
#from statsmodels.tsa.stattools import adfuller  # For stationarity test
# Optional: TextBlob is used for the sentiment scores; install it if it is not already available.
!pip install textblob
from textblob import TextBlob  # Uncomment after install
# Project layout, resolved from the kernel's working directory (adjust if needed).
# DATA_DIR sits next to the notebook (<cwd>/data); figures go one level up.
DATA_DIR = os.path.join(os.getcwd(), "data")
ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
FIGURES_DIR = os.path.join(ROOT, "figures")
if not os.path.isdir(FIGURES_DIR):
    # exist_ok keeps this safe if the folder appears between the check and the call.
    os.makedirs(FIGURES_DIR, exist_ok=True)



In [3]:
import os

# Diagnostic cell: report where the kernel is running and whether a `data`
# folder exists one level above it.
# NOTE: this inspects <parent of cwd>/data, which is not the same location as
# the DATA_DIR defined in the setup cell (<cwd>/data).
current_dir = os.getcwd()
proposed_root = os.path.abspath(os.path.join(current_dir, ".."))
proposed_data_dir = os.path.join(proposed_root, "data")

print("Current working directory:", current_dir)
print("Proposed ROOT:", proposed_root)
print("Proposed DATA_DIR:", proposed_data_dir)

# Check if data folder exists (isdir, so a stray *file* named "data" does not
# make os.listdir raise).
if os.path.isdir(proposed_data_dir):
    print("Data folder exists. Contents:", os.listdir(proposed_data_dir))
else:
    print("Data folder does not exist—create it or adjust paths.")

Current working directory: C:\Users\taton\PycharmProjects\capstone-data-science\notebooks
Proposed ROOT: C:\Users\taton\PycharmProjects\capstone-data-science
Proposed DATA_DIR: C:\Users\taton\PycharmProjects\capstone-data-science\data
Data folder exists. Contents: ['.gitkeep', 'lstm_reference']



# Step 2: Load Processed Data from Previous Notebook
Load the merged commodity + GPR tables (assumed to have been saved by the previous notebook).

In [4]:
# Read the three per-commodity tables (gold, WTI, wheat) from DATA_DIR in one pass.
merged_gold, merged_wti, merged_wheat = [
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("gold_merged.csv", "wti_merged.csv", "wheat_merged.csv")
]

# CSV round-trips dates as text; convert 'Date' back to datetime on each table.
for frame in (merged_gold, merged_wti, merged_wheat):
    frame['Date'] = pd.to_datetime(frame['Date'])

print("Loaded merged shapes:", merged_gold.shape, merged_wti.shape, merged_wheat.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\taton\\PycharmProjects\\capstone-data-science\\data\\gold_merged.csv'

# Step 3: Load and Preprocess Global News Dataset

In [None]:
news_path = os.path.join(DATA_DIR, "Global News dataset", "data.csv")  # From Kaggle download
news_df = pd.read_csv(news_path)

# Assumed columns: 'published_at', 'title', 'short_description', 'source_name', etc.
# Convert 'published_at' to datetime and keep the calendar date for daily grouping.
news_df['published_at'] = pd.to_datetime(news_df['published_at'])
news_df['date'] = news_df['published_at'].dt.date

# Clean text: lowercase titles and descriptions; missing text becomes ''.
news_df['title'] = news_df['title'].str.lower().fillna('')
news_df['short_description'] = news_df['short_description'].str.lower().fillna('')

# Geopolitical keywords. They MUST be lowercase because the text above has been
# lowercased; capitalised entries would never match.
# ('opep' is presumably the non-English acronym for OPEC - confirm it is intended.)
keywords = [
    'war', 'sanctions', 'conflict', 'geopolitical', 'tension', 'embargo', 'crisis', 'invasion',
    'terrorism', 'opec', 'blockade', 'dispute', 'escalation', 'hostility', 'unrest', 'strike',
    'alliance', 'treaty', 'summit', 'diplomacy', 'opep', 'iran', 'syria', 'libya', 'north korea'
]

# Match whole words (with an optional plural "s") rather than raw substrings, so
# that 'war' no longer fires on "award", "software" or "toward".
# The keywords are plain letters/spaces, so no regex escaping is required.
geo_pattern = r'\b(?:' + '|'.join(keywords) + r')s?\b'

# Indicator for geopolitical news: True if any keyword appears in title or description.
news_df['is_geopolitical'] = (
    news_df['title'].str.contains(geo_pattern, regex=True)
    | news_df['short_description'].str.contains(geo_pattern, regex=True)
)


# Bonus: sentiment on geopolitical articles (requires TextBlob).
def get_sentiment(text):
    """Return the TextBlob polarity score of `text`, or 0 when `text` is empty."""
    return TextBlob(text).sentiment.polarity if text else 0


# Score only the geopolitical rows; index alignment on assignment leaves every
# other row as NaN, so it is ignored by the daily mean below.
geo_text = (news_df['title'] + ' ' + news_df['short_description'])[news_df['is_geopolitical']]
news_df['sentiment'] = geo_text.map(get_sentiment).astype(float)

# Aggregate per day: number of geopolitical articles and their mean sentiment.
news_agg = news_df.groupby('date').agg(
    geo_news_count=('is_geopolitical', 'sum'),
    geo_avg_sentiment=('sentiment', 'mean'),
).reset_index()

# Convert date to datetime and rename to 'Date' so it can be merged with the
# commodity tables.
news_agg['date'] = pd.to_datetime(news_agg['date'])
news_agg = news_agg.rename(columns={'date': 'Date'})

print("News aggregate preview:")
print(news_agg.head())

# Save processed news for reference
news_agg.to_csv(os.path.join(DATA_DIR, "news_processed.csv"), index=False)

# Step 4: Merge News with Commodity + GPR Data
Merge on 'Date' (left join to keep all trading days; fill missing with 0)

In [None]:
def merge_with_news(df):
    """Left-join the daily news features in `news_agg` onto `df` by 'Date'.

    Every row of `df` is kept. Dates with no news get a `geo_news_count` of 0,
    and, when `news_agg` carries a `geo_avg_sentiment` column, a sentiment of 0
    as well, so the merged table has no gaps in the news features.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Date' column comparable to `news_agg['Date']`.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    merged = pd.merge(df, news_agg, on='Date', how='left')
    merged['geo_news_count'] = merged['geo_news_count'].fillna(0)
    # The sentiment column only exists when the sentiment step was run upstream.
    if 'geo_avg_sentiment' in merged.columns:
        merged['geo_avg_sentiment'] = merged['geo_avg_sentiment'].fillna(0)
    return merged

# Attach the daily news features to each commodity table.
merged_gold_with_news, merged_wti_with_news, merged_wheat_with_news = map(
    merge_with_news, (merged_gold, merged_wti, merged_wheat)
)

# Write each enriched table to DATA_DIR.
for enriched, csv_name in (
    (merged_gold_with_news, "merged_gold_with_news.csv"),
    (merged_wti_with_news, "merged_wti_with_news.csv"),
    (merged_wheat_with_news, "merged_wheat_with_news.csv"),
):
    enriched.to_csv(os.path.join(DATA_DIR, csv_name), index=False)

print("Merged with news shapes:", merged_gold_with_news.shape, merged_wti_with_news.shape, merged_wheat_with_news.shape)

# Step 5: Feature Engineering (Step 3 from Plan)
- Price returns: already in the data (`Return`).
- Rolling averages and volatility: the 5-day versions are already in the data; add a 30-day version.
- Lag features: add 1-day lags of past returns, GPR, and news count.
- Geopolitical indicators: `GPRD` (already in the data), news count (added in Step 4), and an event dummy derived from `EVENT`.

In [None]:
# Add the engineered columns to each table in place.
# NOTE(review): rolling/shift work on row order, so rows are presumably already
# in chronological order - confirm against the previous notebook.
for frame in (merged_gold_with_news, merged_wti_with_news, merged_wheat_with_news):
    daily_return = frame['Return']

    # 30-day rolling standard deviation of returns.
    frame['vol_30'] = daily_return.rolling(window=30).std()

    # Previous-row values of return, GPR and news count (for delayed reactions).
    frame['return_lag1'] = daily_return.shift(1)
    frame['gpr_lag1'] = frame['GPRD'].shift(1)
    frame['news_count_lag1'] = frame['geo_news_count'].shift(1)

    # 1 where an EVENT value is present, 0 where it is NaN.
    frame['event_dummy'] = frame['EVENT'].notna().astype(int)

# Overwrite the saved CSVs so they include the new columns.
for frame, csv_name in (
    (merged_gold_with_news, "merged_gold_with_news.csv"),
    (merged_wti_with_news, "merged_wti_with_news.csv"),
    (merged_wheat_with_news, "merged_wheat_with_news.csv"),
):
    frame.to_csv(os.path.join(DATA_DIR, csv_name), index=False)

# Step 6: Exploratory Data Analysis (EDA) - Week 9
6.1: Time Series Plots (Prices with GPR and News Overlaid)

In [1]:
def plot_time_series(df, commodity, price_col='Close', gpr_col='GPRD', news_col='geo_news_count',
                     news_scale=10):
    """Plot a commodity price against the GPR index and daily news counts.

    The price is drawn on the left y-axis; the GPR index (line) and the scaled
    news count (scatter) share the right y-axis. The figure is saved to
    FIGURES_DIR as '<commodity lower-cased>_time_series.png' and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Date', `price_col`, `gpr_col` and `news_col`.
    commodity : str
        Display name used in labels, the title and the output file name.
    price_col : str
        Price column (e.g. 'Close_GC=F' for flattened column names).
    gpr_col : str
        Column holding the GPR index.
    news_col : str
        Column holding the daily geopolitical news count.
    news_scale : float
        Factor applied to the news count so it is visible on the GPR axis.
        Defaults to 10, the value that was previously hard-coded.
    """
    fig, ax1 = plt.subplots(figsize=(12, 6))

    # Price on left axis
    ax1.plot(df['Date'], df[price_col], color='blue', label=f'{commodity} Price')
    ax1.set_xlabel('Date')
    ax1.set_ylabel(f'{commodity} Price', color='blue')
    ax1.tick_params(axis='y', labelcolor='blue')

    # GPR on right axis
    ax2 = ax1.twinx()
    ax2.plot(df['Date'], df[gpr_col], color='red', label='GPR Index', alpha=0.7)
    ax2.set_ylabel('GPR Index', color='red')
    ax2.tick_params(axis='y', labelcolor='red')

    # News count as scatter on the GPR axis, scaled for visibility
    ax2.scatter(df['Date'], df[news_col] * news_scale, color='green',
                label='Geo News Count (scaled)', s=10, alpha=0.5)

    fig.suptitle(f'{commodity} Price vs. Geopolitical Risk and News Counts')
    fig.legend(loc='upper right')
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, f'{commodity.lower()}_time_series.png'))
    plt.show()

# One figure per commodity; the price column names follow the flattened
# ticker-suffixed columns (adjust if your column names differ).
for frame, label, close_col in (
    (merged_gold_with_news, 'Gold', 'Close_GC=F'),
    (merged_wti_with_news, 'WTI Oil', 'Close_CL=F'),
    (merged_wheat_with_news, 'Wheat', 'Close_ZW=F'),
):
    plot_time_series(frame, label, price_col=close_col)

NameError: name 'merged_gold_with_news' is not defined

 6.2: Correlations (Table)

In [None]:
def get_correlations(df, key_cols=['Return', 'GPRD', 'GPRD_ACT', 'GPRD_THREAT', 'geo_news_count', 'event_dummy'],
                     commodity=None):
    """Print the correlation matrix of `key_cols` and save it as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in `key_cols`.
    key_cols : list of str
        Columns to correlate. The default list is only read, never modified.
    commodity : str, optional
        Name used in the printed header, the plot title and the file name.
        When omitted, the output is unlabelled and the figure is written to
        'correlation_heatmap.png'; when given, to
        '<commodity lower-cased>_correlation_heatmap.png', so figures for
        different commodities do not overwrite each other.
    """
    cols = list(key_cols)  # work on a copy of the (mutable) default
    label = f" for {commodity}" if commodity else ""

    corr = df[cols].corr()
    print(f"Correlation Matrix{label}:\n{corr}")

    # Save as heatmap; fix the colour range to [-1, 1] so colours are
    # comparable across figures.
    plt.figure(figsize=(8, 6))
    plt.imshow(corr, cmap='coolwarm', interpolation='none', vmin=-1, vmax=1)
    plt.colorbar()
    plt.xticks(range(len(cols)), cols, rotation=45)
    plt.yticks(range(len(cols)), cols)
    plt.title(f'Correlation Heatmap{label}')

    if commodity:
        file_name = f'{commodity.lower()}_correlation_heatmap.png'
    else:
        file_name = 'correlation_heatmap.png'
    plt.savefig(os.path.join(FIGURES_DIR, file_name))
    plt.show()

# Correlation table and heatmap for the gold table (call again with the
# other tables if needed).
get_correlations(df=merged_gold_with_news)

6.3: Distributions (Histograms)

In [None]:
def plot_distributions(df, cols=['Return', 'GPRD', 'geo_news_count']):
    """Draw one 50-bin histogram per column, side by side, and save the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column named in `cols`.
    cols : list of str
        Columns to plot; NaN values are dropped before binning. The default
        list is only read, never modified.

    The figure is written to FIGURES_DIR as 'distributions.png' and shown.
    """
    # squeeze=False always returns a 2-D grid of axes, so a single column
    # works too (otherwise subplots returns a bare Axes that cannot be indexed).
    fig, axes = plt.subplots(1, len(cols), figsize=(15, 5), squeeze=False)
    for ax, col in zip(axes[0], cols):
        ax.hist(df[col].dropna(), bins=50)
        ax.set_title(f'Distribution of {col}')
    plt.tight_layout()
    plt.savefig(os.path.join(FIGURES_DIR, 'distributions.png'))
    plt.show()

# Histograms for the gold table (call again with the other tables if needed).
plot_distributions(df=merged_gold_with_news)

6.4: Stationarity Test (ADF for prices/returns)

In [None]:
def test_stationarity(series, name):
    result = adfuller(series.dropna())
    print(f'ADF Statistic for {name}: {result[0]}')
    print(f'p-value: {result[1]}')
    if result[1] <= 0.05:
        print(f"{name} is stationary (reject null).")
    else:
        print(f"{name} is non-stationary.")

# ADF test on the gold price level (likely non-stationary) and on gold returns
# (likely stationary); repeat for the other commodities as needed.
for column, label in (('Close_GC=F', 'Gold Price'), ('Return', 'Gold Returns')):
    test_stationarity(merged_gold_with_news[column], label)