In [1]:
#loading data from kaggle api
# !kaggle competitions download -c mitsui-commodity-prediction-challenge

In [2]:
#unzipping data
# !unzip mitsui-commodity-prediction-challenge.zip

In [3]:
# imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import STL
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from joblib import Parallel, delayed

# Show floats in DataFrame output with thousands separators and four decimals.
pd.set_option('display.float_format', '{:,.4f}'.format)
# Use the seaborn dark-grid look for every matplotlib figure in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')

In [4]:
# 2. Load Data
# Single place to repoint the notebook at a different data checkout.
DATA_DIR = "../data"
LAGGED_LABELS_DIR = f"{DATA_DIR}/lagged_test_labels"

target_pairs = pd.read_csv(f"{DATA_DIR}/target_pairs.csv")
train_labels = pd.read_csv(f"{DATA_DIR}/train_labels.csv")
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# One lagged label file per lag (1-4); each is kept under its own name.
test_labels_lag_1 = pd.read_csv(f"{LAGGED_LABELS_DIR}/test_labels_lag_1.csv")
test_labels_lag_2 = pd.read_csv(f"{LAGGED_LABELS_DIR}/test_labels_lag_2.csv")
test_labels_lag_3 = pd.read_csv(f"{LAGGED_LABELS_DIR}/test_labels_lag_3.csv")
test_labels_lag_4 = pd.read_csv(f"{LAGGED_LABELS_DIR}/test_labels_lag_4.csv")

In [5]:
# setting base date and conversion of dates
# date_id is turned into a timestamp by adding it, as a day offset, to base_date.
# NOTE(review): this maps consecutive date_id values to consecutive calendar
# days starting 2000-01-01; confirm that a synthetic calendar is acceptable.
base_date = pd.Timestamp("2000-01-01")

for frame in (train, train_labels):
    # Guard so the cell can be re-run: once date_id is already datetime,
    # pd.to_timedelta would raise on it, so convert only while it is still an offset.
    if not pd.api.types.is_datetime64_any_dtype(frame['date_id']):
        frame['date_id'] = pd.to_timedelta(frame['date_id'], unit='D') + base_date



In [6]:
#finding merge key

print("Train columns:", train.columns.tolist())
print("Train_labels columns:", train_labels.columns.tolist())

Train columns: ['date_id', 'LME_AH_Close', 'LME_CA_Close', 'LME_PB_Close', 'LME_ZS_Close', 'JPX_Gold_Mini_Futures_Open', 'JPX_Gold_Rolling-Spot_Futures_Open', 'JPX_Gold_Standard_Futures_Open', 'JPX_Platinum_Mini_Futures_Open', 'JPX_Platinum_Standard_Futures_Open', 'JPX_RSS3_Rubber_Futures_Open', 'JPX_Gold_Mini_Futures_High', 'JPX_Gold_Rolling-Spot_Futures_High', 'JPX_Gold_Standard_Futures_High', 'JPX_Platinum_Mini_Futures_High', 'JPX_Platinum_Standard_Futures_High', 'JPX_RSS3_Rubber_Futures_High', 'JPX_Gold_Mini_Futures_Low', 'JPX_Gold_Rolling-Spot_Futures_Low', 'JPX_Gold_Standard_Futures_Low', 'JPX_Platinum_Mini_Futures_Low', 'JPX_Platinum_Standard_Futures_Low', 'JPX_RSS3_Rubber_Futures_Low', 'JPX_Gold_Mini_Futures_Close', 'JPX_Gold_Rolling-Spot_Futures_Close', 'JPX_Gold_Standard_Futures_Close', 'JPX_Platinum_Mini_Futures_Close', 'JPX_Platinum_Standard_Futures_Close', 'JPX_RSS3_Rubber_Futures_Close', 'JPX_Gold_Mini_Futures_Volume', 'JPX_Gold_Rolling-Spot_Futures_Volume', 'JPX_Gold_Sta

In [7]:
# Join features and labels on 'row_id' when train has it, otherwise on the
# first column name the two frames share, then preview the result.
if 'row_id' in train.columns:
    merge_key = 'row_id'
else:
    merge_key = train.columns.intersection(train_labels.columns)[0]

train_full = train.merge(train_labels, how='left', on=merge_key)
print(f"Merged shape: {train_full.shape}")
display(train_full.head())

Merged shape: (1961, 982)


Unnamed: 0,date_id,LME_AH_Close,LME_CA_Close,LME_PB_Close,LME_ZS_Close,JPX_Gold_Mini_Futures_Open,JPX_Gold_Rolling-Spot_Futures_Open,JPX_Gold_Standard_Futures_Open,JPX_Platinum_Mini_Futures_Open,JPX_Platinum_Standard_Futures_Open,...,target_414,target_415,target_416,target_417,target_418,target_419,target_420,target_421,target_422,target_423
0,2000-01-01,2264.5,7205.0,2570.0,3349.0,,,,,,...,,0.0212,-0.0056,,-0.0046,0.0338,,0.0382,,0.0273
1,2000-01-02,2228.0,7147.0,2579.0,3327.0,,,,,,...,0.0034,0.0214,-0.0015,0.0128,0.0105,0.0305,-0.0008,0.025,0.0035,0.0209
2,2000-01-03,2250.0,7188.5,2587.0,3362.0,4684.0,4691.0,4684.0,3363.0,3367.0,...,-0.0067,0.0093,0.0019,-0.0128,-0.0023,0.0175,-0.0054,0.0048,-0.0091,0.0017
3,2000-01-04,2202.5,7121.0,2540.0,3354.0,4728.0,4737.0,4729.0,3430.0,3426.0,...,,0.0369,-0.0152,,0.0081,0.0011,,-0.0151,,-0.033
4,2000-01-05,2175.0,7125.0,2604.0,3386.0,,,,,,...,,0.0049,,-0.0067,-0.0161,-0.0049,,,0.0095,


In [8]:
# Structural overview of the merged frame, then per-column NaN counts
# listed from the most to the least incomplete column.
print("\n--- Dataset Info ---")
train_full.info()

print("\n--- Missing Values ---")
missing_counts = train_full.isna().sum()
print(missing_counts.sort_values(ascending=False))



--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1961 entries, 0 to 1960
Columns: 982 entries, date_id to target_423
dtypes: datetime64[ns](1), float64(981)
memory usage: 14.7 MB

--- Missing Values ---
US_Stock_GOLD_adj_close     1713
US_Stock_GOLD_adj_volume    1713
US_Stock_GOLD_adj_low       1713
US_Stock_GOLD_adj_open      1713
US_Stock_GOLD_adj_high      1713
                            ... 
FX_EURUSD                      0
FX_GBPJPY                      0
FX_GBPAUD                      0
FX_GBPUSD                      0
date_id                        0
Length: 982, dtype: int64


In [7]:
# Pick the first column whose name mentions 'date' or 'time', make sure it is
# datetime-typed, and order the frame chronologically by it.

time_like_cols = [
    name for name in train_full.columns
    if any(token in name.lower() for token in ('date', 'time'))
]
time_col = time_like_cols[0]
train_full[time_col] = pd.to_datetime(train_full[time_col])
train_full = train_full.sort_values(time_col)

NameError: name 'train_full' is not defined

In [8]:
# Plot the first numeric column over time. When a 'commodity' column exists,
# restrict the plot to its first distinct value; otherwise use every row.

has_commodity = 'commodity' in train_full.columns
if has_commodity:
    sample_asset = train_full['commodity'].unique()[0]
    subset = train_full[train_full['commodity'] == sample_asset]
else:
    subset = train_full

title_prefix = sample_asset if has_commodity else 'Sample'
first_numeric = subset.select_dtypes('number').iloc[:,0]

plt.figure(figsize=(12,5))
plt.plot(subset[time_col], first_numeric)
plt.title(f"{title_prefix} Price over Time")
plt.xlabel("Date")
plt.ylabel("Price")
plt.show()

NameError: name 'train_full' is not defined

In [9]:
#compute returns and volatility
numeric_cols = train_full.select_dtypes(include=np.number).columns
# Positional choice: the second numeric column when there is more than one,
# otherwise the only one. NOTE(review): confirm this is the intended series.
price_col = numeric_cols[1] if len(numeric_cols) > 1 else numeric_cols[0]

# pct_change() used to forward-fill gaps implicitly; that default is deprecated
# in recent pandas. Filling explicitly and passing fill_method=None yields the
# same values without depending on the deprecated default.
train_full['Return'] = train_full[price_col].ffill().pct_change(fill_method=None)
train_full['Volatility'] = train_full['Return'].rolling(window=20).std()
# NOTE: 'Return' and 'Volatility' are stored on train_full itself, so any later
# scan over train_full.columns will also pick them up.

fig, ax = plt.subplots(2,1,figsize=(12,8), sharex=True)
ax[0].plot(train_full[time_col], train_full['Return'], label='Daily Return', color='tab:blue')
ax[0].set_title(f"{price_col}: Daily Return")
ax[0].set_ylabel("Return")
ax[1].plot(train_full[time_col], train_full['Volatility'], label='20-day Rolling Volatility', color='tab:red')
ax[1].set_title(f"{price_col}: 20-day Rolling Volatility")
ax[1].set_xlabel("Date")
ax[1].set_ylabel("Volatility")
ax[0].legend(); ax[1].legend()
plt.tight_layout()
plt.show()

NameError: name 'train_full' is not defined

In [10]:
# Augmented Dickey-Fuller test on the non-missing returns; the p-value is
# compared against 0.05 to pick the message printed last.
print("\n--- Augmented Dickey-Fuller Test on Returns ---")
adf_result = adfuller(train_full['Return'].dropna())
adf_stat, adf_pvalue = adf_result[0], adf_result[1]
print(f"ADF Statistic: {adf_stat:.4f}")
print(f"p-value: {adf_pvalue:.4f}")
verdict = (
    "Series is likely stationary."
    if adf_pvalue <= 0.05
    else "Series is non-stationary; consider differencing or transformation."
)
print(verdict)


--- Augmented Dickey-Fuller Test on Returns ---


NameError: name 'train_full' is not defined

In [11]:
# ACF and PACF of the non-missing returns, 40 lags each.
returns_no_na = train_full['Return'].dropna()
plot_acf(returns_no_na, lags=40, title="ACF: Returns")
plot_pacf(returns_no_na, lags=40, title="PACF: Returns")
plt.show()


NameError: name 'train_full' is not defined

In [12]:
#correlation matrix  (important features)

# Targets are the 'target_'-prefixed columns; everything else except date_id
# is treated as a feature.
target_cols = [c for c in train_full.columns if c.startswith('target_')]
feature_cols = [c for c in train_full.columns if not c.startswith('target_') and c != 'date_id']

# Feature-by-target block of the full correlation matrix.
corr = train_full[feature_cols + target_cols].corr().loc[feature_cols, target_cols]

# The plot is titled "Top 20 Features", so rank features by their mean absolute
# correlation with the displayed targets instead of taking the first 20 columns.
# Rows that are entirely NaN sort last and are therefore not selected first.
shown_targets = target_cols[:5]
feature_strength = corr[shown_targets].abs().mean(axis=1)
top_features = feature_strength.sort_values(ascending=False).head(20).index

plt.figure(figsize=(12, 8))
sns.heatmap(corr.loc[top_features, shown_targets], annot=False, cmap="coolwarm")
plt.title("Feature–Target Correlation Heatmap (Top 20 Features × 5 Targets)")
plt.xlabel("Target Variables")
plt.ylabel("Features")
plt.show()

NameError: name 'train_full' is not defined

In [13]:
# Rolling mean and standard deviation of returns over a fixed window, drawn
# on top of the raw return series.
window = 30
rolling_returns = train_full['Return'].rolling(window=window)
train_full['Rolling_Mean'] = rolling_returns.mean()
train_full['Rolling_Std']  = rolling_returns.std()

# (column, legend label, extra line styling) for each curve, in draw order.
curves = (
    ('Return', 'Return', {'alpha': 0.4}),
    ('Rolling_Mean', f'{window}-day Mean', {'color': 'red'}),
    ('Rolling_Std', f'{window}-day Std', {'color': 'orange'}),
)

plt.figure(figsize=(12,5))
for column, legend_label, line_style in curves:
    plt.plot(train_full[time_col], train_full[column], label=legend_label, **line_style)
plt.title("Rolling Mean & Volatility of Returns")
plt.legend()
plt.show()

NameError: name 'train_full' is not defined

In [14]:
# Headline numbers: row count, covered date span, total NaN cells, and the
# means of the Return and Volatility columns.
record_count = len(train_full)
date_min = train_full[time_col].min()
date_max = train_full[time_col].max()
total_missing = train_full.isna().sum().sum()
mean_return = train_full['Return'].mean()
mean_volatility = train_full['Volatility'].mean()

print(f"Records: {record_count:,}")
print(f"Date Range: {date_min} → {date_max}")
print(f"Missing Values: {total_missing}")
print(f"Average Daily Return: {mean_return:.4f}")
print(f"Average Volatility (20-day): {mean_volatility:.4f}")


NameError: name 'train_full' is not defined

In [15]:
#handling missing values
# Share of NaN above which a column is dropped rather than imputed.
MISSING_THRESHOLD = 0.4

# Sorting chronologically so forward/backward fill follow time order
train_full = train_full.sort_values(by=time_col)

# Measure missingness BEFORE imputing. Computing the ratio after
# ffill().bfill() leaves NaN only in all-NaN columns, so the threshold
# would never remove a mostly-empty column.
missing_ratio = train_full.isna().mean()
cols_to_drop = missing_ratio[missing_ratio > MISSING_THRESHOLD].index
train_full = train_full.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} columns with >{MISSING_THRESHOLD:.0%} missing data.")

# Forward-fill then backward-fill the columns that remain.
# NOTE(review): bfill copies later observations into earlier rows, and the fill
# is applied to every remaining column, target_* columns included; confirm
# both are acceptable for downstream modelling.
train_full = train_full.ffill().bfill()
print(f"Remaining missing entries: {train_full.isna().sum().sum()}")

display(train_full.head())


NameError: name 'train_full' is not defined

In [16]:
# #handling outliers
# numeric_cols = train_full.select_dtypes(include=['float64', 'int64']).columns
# # Clip extreme values to 1st–99th percentile (winsorization)
# for col in numeric_cols:
#     q_low, q_high = train_full[col].quantile([0.01, 0.99])
#     train_full[col] = train_full[col].clip(lower=q_low, upper=q_high)

# print(f"Outliers clipped {len(numeric_cols)} numeric columns.")


In [17]:
# Collect every column whose name contains 'close', ignoring case.
close_cols = []
for column_name in train_full.columns:
    if 'close' in column_name.lower():
        close_cols.append(column_name)
print(f"Detected {len(close_cols)} 'Close' columns for processing.")

NameError: name 'train_full' is not defined

In [18]:
#STL decomposition
def clean_series(series, method='svm', period=30, contamination=0.01, min_length=60):
    """
    Decompose a time series using STL and remove outliers from residuals.

    The series is split into trend + seasonal + residual. Residual points
    flagged as outliers are replaced by interpolated values, and the three
    components are summed back together.

    Parameters
    ----------
    series : pd.Series
        Input series. NaN entries are dropped before processing, so the
        result only carries the index labels of the non-missing entries.
    method : {'svm', 'quantile', 'iforest'}, default 'svm'
        Outlier detector applied to the STL residual.
    period : int, default 30
        Seasonal period passed to STL.
    contamination : float, default 0.01
        Expected outlier share. Used as ``nu`` for the one-class SVM, as
        ``contamination`` for the isolation forest, and as the tail quantile
        on each side for the 'quantile' method.
    min_length : int, default 60
        Series with fewer non-missing points are returned without decomposition.

    Returns
    -------
    pd.Series
        The reconstructed series, or the NaN-dropped input when it is shorter
        than ``min_length``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names. Previously an unknown
        name silently skipped outlier removal.
    """
    valid_methods = ('svm', 'quantile', 'iforest')
    if method not in valid_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {valid_methods}."
        )

    series = series.dropna()
    if len(series) < min_length:
        return series  # skipping very short series

    # STL decomposition
    res = STL(series, period=period, robust=True).fit()
    trend, seasonal, resid = res.trend, res.seasonal, res.resid

    # Outlier detection on the residual; mask is True for points to keep.
    if method == 'quantile':
        low, high = resid.quantile([contamination, 1 - contamination])
        mask = (resid >= low) & (resid <= high)
    else:
        # The sklearn estimators expect a 2-D (n_samples, 1) array.
        X = resid.values.reshape(-1, 1)
        if method == 'iforest':
            clf = IsolationForest(contamination=contamination, random_state=42)
        else:  # 'svm'
            clf = OneClassSVM(kernel='rbf', nu=contamination, gamma='scale')
        # fit_predict labels inliers as 1.
        mask = clf.fit_predict(X) == 1

    # Blank out flagged residuals, interpolate across them, and fill any
    # remaining edge NaN. bfill()/ffill() replace the deprecated
    # fillna(method=...) form.
    clean_resid = pd.Series(np.where(mask, resid, np.nan), index=resid.index)
    clean_resid = clean_resid.interpolate().bfill().ffill()

    # Reconstruct; the result is not bound to a local named clean_series, so
    # the function name is no longer shadowed inside its own body.
    return trend + seasonal + clean_resid

#run cleaning in parallel for all Close columns
def _plot_cleaning_comparison(col, original, cleaned):
    """Draw the original and the cleaned version of one column on one figure."""
    plt.figure(figsize=(10, 4))
    plt.plot(original, label='Original', alpha=0.5)
    plt.plot(cleaned, label='Cleaned (SVM)', linewidth=2)
    plt.title(f"{col} - Outlier Cleaning Comparison")
    plt.legend()
    plt.tight_layout()
    plt.show()


def process_column(col, plot=True):
    """
    Clean one column of train_full with the SVM-based outlier removal.

    Parameters
    ----------
    col : str
        Column name in train_full.
    plot : bool, default True
        When True, also draw the original-vs-cleaned comparison. Pass False
        when calling from a worker process.

    Returns
    -------
    pd.Series
        The cleaned series returned by clean_series.
    """
    original = train_full[col].copy()
    cleaned = clean_series(original, method='svm')
    if plot:
        _plot_cleaning_comparison(col, original, cleaned)
    return cleaned


# The loky backend runs tasks in separate worker processes, and figures shown
# there do not reach the notebook. Workers therefore only compute; plotting
# happens afterwards in the parent process.
results = Parallel(n_jobs=-1, backend='loky')(
    delayed(process_column)(col, plot=False) for col in close_cols
)

for col, cleaned in zip(close_cols, results):
    _plot_cleaning_comparison(col, train_full[col], cleaned)


NameError: name 'close_cols' is not defined

In [19]:
# Write each cleaned series back over its source column. results holds one
# entry per name in close_cols, in the same order; assignment aligns on index.
for col, cleaned in zip(close_cols, results):
    train_full[col] = cleaned

NameError: name 'close_cols' is not defined

In [20]:
#checking data
#train_full[close_cols].iloc[:, :5].plot(figsize=(12,5), title="Sample of Cleaned Close Series")

# Preview the first rows of the Close columns.
train_full.loc[:, close_cols].head()

NameError: name 'train_full' is not defined