In [1]:
import boto3
import pandas as pd
import io

# AWS S3 Credentials
aws_access_key_id = 'YOUR_KEY_ID'
aws_secret_access_key = 'YOUR_ACCESS_KEY'
endpoint_url = 'YOUR_ENDPOINT'
bucket_name = 'fred'

s3 = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    endpoint_url=endpoint_url
)
bucket = s3.Bucket(bucket_name)

def s3_csv_to_df(key: str) -> pd.DataFrame:

    body = bucket.Object(key).get()["Body"].read()
    return pd.read_csv(io.BytesIO(body))

In [2]:
import pandas as pd
from functools import reduce

kpi_files = {
    'DCOILWTICO.csv': 'crude_oil_price',
    'CPIAUCSL.csv': 'cpi',
    'GASREGW.csv': 'retail_gas_price',
    'GDP.csv': 'gdp',
    'INDPRO.csv': 'indpro',
    'PPIACO.csv': 'ppi',
    'TWEXAFEGSMTH.csv': 'usd_index',
    'UNRATE.csv': 'unrate'
}
def load_and_merge_data():
    dfs = []
    for key, col_name in kpi_files.items():
        try:
            df = s3_csv_to_df(f"observations/{key}")   # add .csv
        except Exception as e:
            print(f"⚠  {key} missing → {e}")
            continue

        df.columns = df.columns.str.strip().str.lower()
        date_col  = 'observation_date' if 'observation_date' in df.columns else 'date'
        value_col = 'value' if 'value' in df.columns else next(c for c in df.columns if c != date_col)

        df = (
            df[[date_col, value_col]]
              .rename(columns={date_col: 'date', value_col: col_name})
        )
        df['date'] = pd.to_datetime(df['date'])
        df = df.set_index('date').resample('W').ffill().reset_index()
        dfs.append(df)

    if not dfs:
        raise RuntimeError("No KPI files were loaded from S3 — nothing to merge.")

    merged_df = reduce(lambda l, r: pd.merge(l, r, on='date', how='inner'), dfs)
    return merged_df.sort_values('date').reset_index(drop=True)



In [7]:
df = load_and_merge_data()
df

Unnamed: 0,date,crude_oil_price,cpi,retail_gas_price,gdp,indpro,ppi,usd_index,unrate
0,1990-08-26,31.10,131.600,1.191,6015.116,62.8704,116.500,,5.7
1,1990-09-02,27.45,132.500,1.245,6015.116,62.8839,118.400,,5.9
2,1990-09-09,30.09,132.500,1.242,6015.116,62.8839,118.400,,5.9
3,1990-09-16,31.79,132.500,1.252,6015.116,62.8839,118.400,,5.9
4,1990-09-23,36.21,132.500,1.266,6015.116,62.8839,118.400,,5.9
...,...,...,...,...,...,...,...,...,...
1789,2024-12-08,68.58,317.603,3.034,29723.864,103.0723,253.423,119.8771,4.1
1790,2024-12-15,71.54,317.603,3.008,29723.864,103.0723,253.423,119.8771,4.1
1791,2024-12-22,69.71,317.603,3.016,29723.864,103.0723,253.423,119.8771,4.1
1792,2024-12-29,71.28,317.603,3.024,29723.864,103.0723,253.423,119.8771,4.1


In [3]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load and prepare data
df = load_and_merge_data()
df['target'] = np.where(df['crude_oil_price'].shift(-1) > df['crude_oil_price'], 1, 0)
df['month'] = df['date'].dt.month
df['is_summer'] = df['month'].isin([6,7,8]).astype(int)
df['month_sin'] = np.sin(2 * np.pi * df['month']/12)
df['month_cos'] = np.cos(2 * np.pi * df['month']/12)
df['crude_oil_price_1w'] = df['crude_oil_price'].pct_change(1)
df['crude_oil_price_4w'] = df['crude_oil_price'].pct_change(4)
df['crude_oil_price_sma_4w'] = df['crude_oil_price'].rolling(4).mean()
df['crude_oil_price_sma_12w'] = df['crude_oil_price'].rolling(12).mean()
df['crude_oil_price_volatility_4w'] = df['crude_oil_price'].rolling(4).std()
df = df.dropna().reset_index(drop=True)

X = df.drop(columns=['date', 'target', 'crude_oil_price'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Model training
model = XGBClassifier(n_estimators=50, max_depth=3, learning_rate=0.1, scale_pos_weight=2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report
print("Classification Report")
print(classification_report(y_test, y_pred, labels=[0,1]))

# Feature importance
import pandas as pd
importance = model.get_booster().get_score(importance_type='weight')
importance_df = pd.DataFrame(importance.items(), columns=['Feature', 'Importance']).sort_values(by='Importance', ascending=False)
print("\n=== Feature Importance ===")
print(importance_df)

Classification Report
              precision    recall  f1-score   support

           0       0.95      0.83      0.89       311
           1       0.29      0.60      0.39        35

    accuracy                           0.81       346
   macro avg       0.62      0.72      0.64       346
weighted avg       0.88      0.81      0.84       346


=== Feature Importance ===
                  Feature  Importance
15   lumber_volatility_4w        45.0
6             yield_curve        29.0
12   lumber_pct_change_4w        23.0
13          lumber_sma_4w        15.0
3           mortgage_rate        14.0
5                     ppi        13.0
9                   month        12.0
1          housing_starts        11.0
4        building_permits        11.0
10              month_sin        10.0
14         lumber_sma_12w         8.0
0                     cpi         7.0
2         m2_money_supply         5.0
11              month_cos         4.0
7   construction_spending         2.0
8       unemplo

In [5]:
import pathlib
out_dir = pathlib.Path("data"); out_dir.mkdir(exist_ok=True)
importance_df.to_csv(out_dir / "lumber_xgb.csv", index=False)


In [9]:

from sklearn.metrics import classification_report
report_dict = classification_report(y_test, y_pred, labels=[0, 1], output_dict=True)
pd.DataFrame(report_dict).transpose().reset_index().rename(
    columns={"index": "Class"}).to_csv("lumber_xgb_report.csv", index=False)
