In [23]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.cluster import KMeans
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Set paths
ROOT = os.getcwd()
DATA_DIR = os.path.join(ROOT, "data")
RAW_DIR = os.path.join(DATA_DIR, "raw")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")
MERGED_DIR = os.path.join(DATA_DIR, "merged")
ENRICHED_DIR  = os.path.join(DATA_DIR, "enriched")

for folder in [DATA_DIR, RAW_DIR, PROCESSED_DIR, MERGED_DIR, ENRICHED_DIR]:
    os.makedirs(folder, exist_ok=True)



In [24]:
def load_gpr():
    """
    Loads and preprocesses Geopolitical Risk Index daily data.
    Returns a DataFrame indexed by daily dates with forward-filled values.
    """
    gpr_path = os.path.join(RAW_DIR, "All_Historical_Data_Separately", "Geopolitical Risk Index Daily.csv")
    if not os.path.exists(gpr_path):
        print(f"GPR dataset not found: {gpr_path}")
        return pd.DataFrame()

    gpr = pd.read_csv(gpr_path)
    if 'DATE' not in gpr.columns:
        print("GPR dataset missing required 'DATE' column")
        return pd.DataFrame()

    gpr['DATE'] = pd.to_datetime(gpr['DATE'], errors='coerce')
    gpr = gpr.dropna(subset=['DATE'])
    gpr = gpr.drop_duplicates(subset=['DATE']).sort_values('DATE').reset_index(drop=True)

    # Resample daily and forward-fill missing dates
    gpr_daily = gpr.set_index('DATE').resample('D').ffill().reset_index()

    # Keep only relevant columns if present
    keep_cols = [col for col in ['DATE', 'GPRD', 'GPRD_THREAT', 'EVENT'] if col in gpr_daily.columns]
    return gpr_daily[keep_cols].rename(columns={'DATE':'Date'})

def extract_news_features(news_df, keywords):
    """
    Extracts daily aggregated geopolitical keyword hits and sentiment polarity from news headlines.
    Args:
        news_df (DataFrame): Raw news data with 'headline_text' and 'date' columns.
        keywords (list of str): List of geopolitical keywords to count in headlines.

    Returns:
        DataFrame: daily aggregated features with columns ['Date', 'geo_keyword_hits', 'sentiment']
    """
    news_df = news_df.copy()

    # Count geopolitics keyword hits in each headline
    news_df['geo_keyword_hits'] = news_df['headline_text'].apply(
        lambda text: sum(kw in text.lower() for kw in keywords) if isinstance(text, str) else 0
    )

    # Calculate sentiment polarity of each headline's text (TextBlob)
    news_df['sentiment'] = news_df['headline_text'].apply(
        lambda x: TextBlob(x).sentiment.polarity if isinstance(x, str) else 0
    )

    # Aggregate daily by summing keyword hits and averaging sentiment
    news_daily = news_df.groupby('date').agg({
        'geo_keyword_hits': 'sum',
        'sentiment': 'mean'
    }).reset_index().dropna(subset=['date']).sort_values('date')

    return news_daily.rename(columns={'date':'Date'})

In [25]:
# Assuming gpr_daily and news_features are loaded and preprocessed by your defined functions:
# Example: load raw news data CSV into news_df
abcnews_path = os.path.join(RAW_DIR, "abcnews-date-text.csv")
news_df = pd.read_csv(news_path)

# Proceed to extract features
news_features = extract_news_features(news_df, geo_keywords)

gpr_daily = load_gpr()
print(gpr_daily.columns)
print(gpr_daily['Date'].dtype)

news_features = extract_news_features(news_df, geo_keywords)


print(news_features.columns)
print(news_features['Date'].dtype)



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\taton\\PycharmProjects\\Commodity project\\data\\raw\\All_Historical_Data_Separately\\raw_news_data.csv'

In [26]:
import tensorflow as tf
print(tf.__version__)

2.20.0


In [None]:
# Load or define tickers and price_cols dictionaries as per your project

merged_data = {}

for name, ticker in tickers.items():
    price_col = price_cols[name]
    df_price = download_commodity(ticker, name)
    if df_price.empty:
        continue
    df_feat = feature_engineer(df_price, price_col)

    df_feat['Date'] = pd.to_datetime(df_feat['Date'])
    df_feat = df_feat.sort_values('Date').reset_index(drop=True)

    df_merge = pd.merge_asof(df_feat, gpr_daily, on='Date', direction='backward')
    df_merge = pd.merge_asof(df_merge, news_features, on='Date', direction='backward')

    if 'EVENT' in df_merge.columns:
        df_merge['event_dummy'] = df_merge['EVENT'].notna().astype(int)
    else:
        df_merge['event_dummy'] = 0

    # Save enriched data
    fname = f"{name.lower()}_enriched.csv"
    df_merge.to_csv(os.path.join(ENRICHED_DIR, fname), index=False)

    merged_data[name] = df_merge

In [None]:
# Now modeling pipeline setup

def prepare_features_targets(df, features, target):
    X = df[features]
    y = df[target]
    return X, y

# Example modeling for one commodity (Gold)
commodity = 'Gold'
df = merged_data[commodity]

# Select features and target
features_baseline = ['Return_lag1']
features_enhanced = ['Return_lag1', 'GPRD', 'geo_keyword_hits', 'sentiment', 'event_dummy']
target = 'Return'

# Split train and test sets by date (no random shuffle since time series data)
split_date = pd.to_datetime('2019-01-01')
train_df = df[df['Date'] < split_date]
test_df = df[df['Date'] >= split_date]

# Prepare data
X_train_base, y_train = prepare_features_targets(train_df, features_baseline, target)
X_test_base, y_test = prepare_features_targets(test_df, features_baseline, target)

X_train_enh, _ = prepare_features_targets(train_df, features_enhanced, target)
X_test_enh, _ = prepare_features_targets(test_df, features_enhanced, target)

# Scaling (important for models sensitive to scale)
scaler = StandardScaler()
X_train_enh_scaled = scaler.fit_transform(X_train_enh)
X_test_enh_scaled = scaler.transform(X_test_enh)


In [None]:
# Baseline Linear Regression
lr_base = LinearRegression()
lr_base.fit(X_train_base, y_train)
y_pred_base = lr_base.predict(X_test_base)
print(f"Baseline model RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_base)):.4f}")

# Enhanced Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_enh_scaled, y_train)
y_pred_rf = rf.predict(X_test_enh_scaled)
print(f"Enhanced model RMSE: {np.sqrt(mean_squared_error(y_test, y_pred_rf)):.4f}")


In [None]:
# Classification: Up/Down movement
train_df['Return_binary'] = (train_df['Return'] > 0).astype(int)
test_df['Return_binary'] = (test_df['Return'] > 0).astype(int)

X_train_class = scaler.fit_transform(train_df[features_enhanced])
X_test_class = scaler.transform(test_df[features_enhanced])
y_train_class = train_df['Return_binary']
y_test_class = test_df['Return_binary']

logreg = LogisticRegression(max_iter=200)
logreg.fit(X_train_class, y_train_class)
y_pred_class = logreg.predict(X_test_class)
print(f"Classification Accuracy: {accuracy_score(y_test_class, y_pred_class):.4f}")

# Clustering for regime detection
kmeans = KMeans(n_clusters=2, random_state=42)
regime_features = df[['Vol_5', 'GPRD', 'geo_keyword_hits']].fillna(0)
df['Regime'] = kmeans.fit_predict(regime_features)

In [None]:
# Optional LSTM for sequence forecasting
def create_sequences(data, feature_cols, target_col, seq_length=10):
    xs, ys = [], []
    for i in range(len(data) - seq_length):
        x = data[feature_cols].iloc[i:(i + seq_length)].values
        y = data[target_col].iloc[i + seq_length]
        xs.append(x)
        ys.append(y)
    return np.array(xs), np.array(ys)

seq_length = 10
feature_cols = features_enhanced
target_col = target

X_seq, y_seq = create_sequences(df.reset_index(drop=True), feature_cols, target_col, seq_length)

split_idx = len(train_df) - seq_length  # Adjust split for sequences
X_train_seq, y_train_seq = X_seq[:split_idx], y_seq[:split_idx]
X_test_seq, y_test_seq = X_seq[split_idx:], y_seq[split_idx:]

model = Sequential([
    LSTM(50, input_shape=(seq_length, len(feature_cols))),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_seq, y_train_seq, epochs=10, batch_size=32, verbose=2)

y_pred_lstm = model.predict(X_test_seq)
print(f"LSTM Test RMSE: {np.sqrt(mean_squared_error(y_test_seq, y_pred_lstm)):.4f}")