# Training Machine Learning Models for Commodity Export Forecasting

This notebook demonstrates the process of training machine learning models to forecast commodity exports based on temporal, categorical, and geographical trends.

In [20]:
!pip install torch pandas numpy scikit-learn matplotlib seaborn transformers datasets accelerate 

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [21]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [22]:
# Load and explore the dataset
file_path = 'c:/Users/osmon/Desktop/custom_data/export/Illinois.csv'

# Load the CSV file
df = pd.read_csv(file_path, header=2)

# Clean up column names
df.columns = df.columns.str.strip()

# Display basic information about the dataset
print("Dataset Info:")
df.info()

# Display the first few rows of the dataset
df.head()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110711 entries, 0 to 110710
Data columns (total 8 columns):
 #   Column                                          Non-Null Count   Dtype 
---  ------                                          --------------   ----- 
 0   State                                           110711 non-null  object
 1   Commodity                                       110711 non-null  object
 2   Country                                         110711 non-null  object
 3   Time                                            110711 non-null  object
 4   Vessel Value ($US)                              110711 non-null  object
 5   Containerized Vessel Total Exports Value ($US)  104171 non-null  object
 6   Vessel SWT (kg)                                 110709 non-null  object
 7   Containerized Vessel Total Exports SWT (kg)     104170 non-null  object
dtypes: object(8)
memory usage: 6.8+ MB


Unnamed: 0,State,Commodity,Country,Time,Vessel Value ($US),Containerized Vessel Total Exports Value ($US),Vessel SWT (kg),Containerized Vessel Total Exports SWT (kg)
0,Illinois,01 Live Animals,Africa,Jul-16,7185,7185.0,6,6.0
1,Illinois,01 Live Animals,Africa,Feb-20,13792,,494,
2,Illinois,01 Live Animals,Asia - South,Jun-18,15087,15087.0,1422,1422.0
3,Illinois,01 Live Animals,Asia - Other,Mar-08,12515,,19958,
4,Illinois,01 Live Animals,Asia - Other,Jul-08,12135,,20284,


In [23]:
# Data Preprocessing
# Remove problematic rows if necessary
if df.iloc[-1]['State'] == '98 Special Classification Provisions, Nesoi':
    df = df.iloc[:-1]

# Handle missing values
df = df.dropna()

# Convert 'Time' column to datetime format if applicable
if 'Time' in df.columns:
    df['Time'] = pd.to_datetime(df['Time'], format='%b-%y')

# Clean and convert ALL numeric columns (detect and fix comma-separated numbers)
exclude_cols = ['Time', 'State', 'Commodity', 'Country']
for col in df.columns:
    if col not in exclude_cols:
        # Check if column contains string values that might be numbers with commas
        if df[col].dtype == 'object':
            # Try to clean and convert to numeric
            df[col] = df[col].astype(str).str.replace(',', '').replace('', '0').replace('nan', '0')
            df[col] = pd.to_numeric(df[col], errors='coerce')
        elif df[col].dtype in ['int64', 'float64']:
            # Already numeric, but ensure no missing values
            df[col] = pd.to_numeric(df[col], errors='coerce')

# Handle any remaining missing values after conversion
df = df.dropna()

print("Data types after cleaning:")
print(df.dtypes)
print(f"\nDataset shape: {df.shape}")

# Display the cleaned dataset
df.head()

Data types after cleaning:
State                                                     object
Commodity                                                 object
Country                                                   object
Time                                              datetime64[ns]
Vessel Value ($US)                                         int64
Containerized Vessel Total Exports Value ($US)             int64
Vessel SWT (kg)                                            int64
Containerized Vessel Total Exports SWT (kg)                int64
dtype: object

Dataset shape: (104170, 8)


Unnamed: 0,State,Commodity,Country,Time,Vessel Value ($US),Containerized Vessel Total Exports Value ($US),Vessel SWT (kg),Containerized Vessel Total Exports SWT (kg)
0,Illinois,01 Live Animals,Africa,2016-07-01,7185,7185,6,6
2,Illinois,01 Live Animals,Asia - South,2018-06-01,15087,15087,1422,1422
5,Illinois,01 Live Animals,Asia - Other,2009-09-01,5560,5560,1236,1236
6,Illinois,01 Live Animals,Asia - Other,2011-12-01,123178,123178,1142,1142
7,Illinois,01 Live Animals,Asia - Other,2012-01-01,85860,85860,1681,1681


In [24]:
# Feature Engineering
# Ensure 'Time' column exists before proceeding
if 'Time' in df.columns:
    # Generate temporal features
    df['Year'] = pd.DatetimeIndex(df['Time']).year
    df['Month'] = pd.DatetimeIndex(df['Time']).month

    # Generate lag features
    for lag in [1, 2, 3, 6, 12]:
        df[f'export_value_lag_{lag}'] = df['Vessel Value ($US)'].shift(lag)
        df[f'export_weight_lag_{lag}'] = df['Vessel SWT (kg)'].shift(lag)

    # Generate rolling statistics
    for window in [3, 6, 12]:
        df[f'export_value_rolling_mean_{window}'] = df['Vessel Value ($US)'].rolling(window).mean()
        df[f'export_value_rolling_std_{window}'] = df['Vessel Value ($US)'].rolling(window).std()

    # Drop rows with NaN values generated by lagging
    df = df.dropna()

    # Display the dataset with new features
    df.head()
else:
    print("Error: 'Time' column is missing from the dataset.")
    print("Available columns:", df.columns.tolist())

In [25]:
# Define function to prepare data for LSTM
def prepare_lstm_data(df, sequence_length=12):
    """
    Prepare data for LSTM training by creating sequences
    Args:
        df: DataFrame with time series data
        sequence_length: Number of time steps to look back
    Returns:
        X: Input sequences (samples, time_steps, features)
        y_value: Target values for vessel value
        y_weight: Target values for vessel weight
        feature_cols: List of feature column names
        valid_indices: Indices of valid samples
    """
    # Sort by time to ensure proper sequence order
    if 'Time' in df.columns:
        df_sorted = df.sort_values('Time').reset_index(drop=True)
    else:
        df_sorted = df.copy()
    
    # Select numeric features for training (exclude non-numeric and target columns)
    exclude_cols = ['Time', 'State', 'Commodity', 'Country']
    feature_cols = [col for col in df_sorted.columns if col not in exclude_cols]
    
    print(f"Selected feature columns: {feature_cols}")
    print(f"Total features: {len(feature_cols)}")
    
    # Ensure we have the target columns
    if 'Vessel Value ($US)' not in feature_cols or 'Vessel SWT (kg)' not in feature_cols:
        raise ValueError("Target columns 'Vessel Value ($US)' and 'Vessel SWT (kg)' not found")
    
    # Create feature matrix
    features = df_sorted[feature_cols].values
    
    # Check for sufficient data
    if len(features) <= sequence_length:
        raise ValueError(f"Not enough data points. Need at least {sequence_length + 1}, got {len(features)}")
    
    # Create sequences
    X, y_value, y_weight = [], [], []
    valid_indices = []
    
    value_idx = feature_cols.index('Vessel Value ($US)')
    weight_idx = feature_cols.index('Vessel SWT (kg)')
    
    for i in range(sequence_length, len(features)):
        # Input sequence (past sequence_length time steps)
        X.append(features[i-sequence_length:i])
        
        # Target values (current time step)
        y_value.append(features[i, value_idx])
        y_weight.append(features[i, weight_idx])
        valid_indices.append(i)
    
    X = np.array(X, dtype=np.float32)
    y_value = np.array(y_value, dtype=np.float32)
    y_weight = np.array(y_weight, dtype=np.float32)
    
    print(f"Created {len(X)} sequences")
    print(f"Sequence shape: {X.shape}")
    
    return X, y_value, y_weight, feature_cols, valid_indices

In [26]:
# Train LSTM Model using PyTorch with CUDA support
# Prepare data for PyTorch
from torch.utils.data import DataLoader, TensorDataset

# Force CUDA usage if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA memory allocated: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")
else:
    print("CUDA is not available, falling back to CPU")

sequence_length = 12
X, y_value, y_weight, feature_cols, indices = prepare_lstm_data(df, sequence_length=sequence_length)

print(f"Data shape: X={X.shape}, y_value={y_value.shape}, y_weight={y_weight.shape}")
print(f"Features used: {feature_cols}")

X_train, X_test, y_value_train, y_value_test, y_weight_train, y_weight_test = train_test_split(
    X, y_value, y_weight, test_size=0.2, random_state=42
)

# Convert data to PyTorch tensors and move to GPU
print("Moving data to GPU...")
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_value_train_tensor = torch.tensor(y_value_train, dtype=torch.float32).to(device)
y_weight_train_tensor = torch.tensor(y_weight_train, dtype=torch.float32).to(device)

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_value_test_tensor = torch.tensor(y_value_test, dtype=torch.float32).to(device)
y_weight_test_tensor = torch.tensor(y_weight_test, dtype=torch.float32).to(device)

print(f"Training data moved to: {X_train_tensor.device}")

# Create DataLoader for batching
dataset = TensorDataset(X_train_tensor, y_value_train_tensor, y_weight_train_tensor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc_value = nn.Linear(hidden_size, output_size)
        self.fc_weight = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])  # Take the last output and apply dropout
        value = self.fc_value(out)
        weight = self.fc_weight(out)
        return value, weight

# Initialize the model, loss function, and optimizer
input_size = X.shape[2]  # Number of features (dynamic based on actual data)
hidden_size = 128
output_size = 1
model = LSTMModel(input_size, hidden_size, output_size).to(device)
criterion = nn.L1Loss()  # Using MAE (L1Loss) instead of MSE
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(f"Model input size: {input_size}")
print(f"Model initialized on {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
print("Using MAE (L1Loss) as loss function")

if torch.cuda.is_available():
    print(f"CUDA memory after model creation: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

# Train the model
num_epochs = 20  # Increased epochs for better training
print("Starting training...")

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch_idx, batch in enumerate(dataloader):
        X_batch, y_value_batch, y_weight_batch = batch

        # Forward pass
        value_pred, weight_pred = model(X_batch)
        loss_value = criterion(value_pred, y_value_batch.unsqueeze(1))
        loss_weight = criterion(weight_pred, y_weight_batch.unsqueeze(1))
        loss = loss_value + loss_weight

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(dataloader)
    
    # Evaluation on test set every 5 epochs
    if (epoch + 1) % 5 == 0:
        model.eval()
        with torch.no_grad():
            value_pred_test, weight_pred_test = model(X_test_tensor)
            test_loss_value = criterion(value_pred_test, y_value_test_tensor.unsqueeze(1))
            test_loss_weight = criterion(weight_pred_test, y_weight_test_tensor.unsqueeze(1))
            test_loss = test_loss_value + test_loss_weight
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss (MAE): {avg_loss:.4f}, Test Loss (MAE): {test_loss.item():.4f}")
    else:
        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss (MAE): {avg_loss:.4f}")

print("Training completed!")

# Final evaluation
model.eval()
with torch.no_grad():
    value_pred_test, weight_pred_test = model(X_test_tensor)
    
    # Move predictions back to CPU for evaluation
    value_pred_np = value_pred_test.cpu().numpy().flatten()
    weight_pred_np = weight_pred_test.cpu().numpy().flatten()
    y_value_test_np = y_value_test_tensor.cpu().numpy()
    y_weight_test_np = y_weight_test_tensor.cpu().numpy()
    
    # Calculate metrics
    value_mae = mean_absolute_error(y_value_test_np, value_pred_np)
    value_mse = mean_squared_error(y_value_test_np, value_pred_np)
    value_r2 = r2_score(y_value_test_np, value_pred_np)
    
    weight_mae = mean_absolute_error(y_weight_test_np, weight_pred_np)
    weight_mse = mean_squared_error(y_weight_test_np, weight_pred_np)
    weight_r2 = r2_score(y_weight_test_np, weight_pred_np)
    
    print("\n=== Final Model Performance ===")
    print(f"Vessel Value - MAE: {value_mae:.2f}, MSE: {value_mse:.2f}, R²: {value_r2:.4f}")
    print(f"Vessel Weight - MAE: {weight_mae:.2f}, MSE: {weight_mse:.2f}, R²: {weight_r2:.4f}")

if torch.cuda.is_available():
    print(f"\nFinal CUDA memory usage: {torch.cuda.memory_allocated(0) / 1024**2:.2f} MB")

Using device: cpu
Selected feature columns: ['Vessel Value ($US)', 'Containerized Vessel Total Exports Value ($US)', 'Vessel SWT (kg)', 'Containerized Vessel Total Exports SWT (kg)', 'Year', 'Month', 'export_value_lag_1', 'export_weight_lag_1', 'export_value_lag_2', 'export_weight_lag_2', 'export_value_lag_3', 'export_weight_lag_3', 'export_value_lag_6', 'export_weight_lag_6', 'export_value_lag_12', 'export_weight_lag_12', 'export_value_rolling_mean_3', 'export_value_rolling_std_3', 'export_value_rolling_mean_6', 'export_value_rolling_std_6', 'export_value_rolling_mean_12', 'export_value_rolling_std_12']
Total features: 22
Created 104146 sequences
Sequence shape: (104146, 12, 22)
Data shape: X=(104146, 12, 22), y_value=(104146,), y_weight=(104146,)
Features used: ['Vessel Value ($US)', 'Containerized Vessel Total Exports Value ($US)', 'Vessel SWT (kg)', 'Containerized Vessel Total Exports SWT (kg)', 'Year', 'Month', 'export_value_lag_1', 'export_weight_lag_1', 'export_value_lag_2', 'ex

KeyboardInterrupt: 