In [1]:
import pandas as pd

# Load the CSV files, skipping the first row (metadata) and using the second row as the header
dfe = pd.read_csv("custom_data/export/Illinois.csv", skiprows=1, header=1)
dfi = pd.read_csv("custom_data/import/Illinois.csv", skiprows=1, header=1)

#Drop the missing things
dfe = dfe.dropna()
dfi = dfi.dropna()

# Display the first few rows to verify
print("Exports DataFrame:")
print(dfe.head())
print("\nImports DataFrame:")
print(dfi.head())

Exports DataFrame:
      State        Commodity       Country    Time Vessel Value ($US)  \
0  Illinois  01 Live Animals        Africa  Jul-16              7,185   
2  Illinois  01 Live Animals  Asia - South  Jun-18             15,087   
5  Illinois  01 Live Animals  Asia - Other  Sep-09              5,560   
6  Illinois  01 Live Animals  Asia - Other  Dec-11            123,178   
7  Illinois  01 Live Animals  Asia - Other  Jan-12             85,860   

  Containerized Vessel Total Exports Value ($US) Vessel SWT (kg)  \
0                                          7,185               6   
2                                         15,087           1,422   
5                                          5,560           1,236   
6                                        123,178           1,142   
7                                         85,860           1,681   

  Containerized Vessel Total Exports SWT (kg)  
0                                           6  
2                                    

In [2]:
print(dfe.columns)
dfe_value = dfe[['Containerized Vessel Total Exports Value ($US)','Containerized Vessel Total Exports SWT (kg)']]
print(dfe_value)

Index(['State', 'Commodity', 'Country', 'Time', 'Vessel Value ($US)',
       'Containerized Vessel Total Exports Value ($US)', 'Vessel SWT (kg)',
       'Containerized Vessel Total Exports SWT (kg)'],
      dtype='object')
       Containerized Vessel Total Exports Value ($US)  \
0                                               7,185   
2                                              15,087   
5                                               5,560   
6                                             123,178   
7                                              85,860   
...                                               ...   
110705                                          5,457   
110706                                         20,219   
110707                                          5,004   
110708                                         28,620   
110709                                         10,898   

       Containerized Vessel Total Exports SWT (kg)  
0                                      

In [3]:

dfe = dfe[['Time','Commodity','State','Country']]
print(dfe)

          Time                                    Commodity     State  \
0       Jul-16                              01 Live Animals  Illinois   
2       Jun-18                              01 Live Animals  Illinois   
5       Sep-09                              01 Live Animals  Illinois   
6       Dec-11                              01 Live Animals  Illinois   
7       Jan-12                              01 Live Animals  Illinois   
...        ...                                          ...       ...   
110705  Jul-24  98 Special Classification Provisions, Nesoi  Illinois   
110706  Dec-24  98 Special Classification Provisions, Nesoi  Illinois   
110707  Feb-25  98 Special Classification Provisions, Nesoi  Illinois   
110708  Mar-25  98 Special Classification Provisions, Nesoi  Illinois   
110709  Apr-25  98 Special Classification Provisions, Nesoi  Illinois   

              Country  
0              Africa  
2        Asia - South  
5        Asia - Other  
6        Asia - Other  
7  

In [4]:
import numpy as np

value = dfe_value.iloc[:, 0].str.replace(',', '').astype(float).to_numpy()
weight = dfe_value.iloc[:, 1].str.replace(',', '').astype(float).to_numpy()

dfe_per_kg_value = value / weight
print(dfe_per_kg_value)




[1197.5          10.60970464    4.49838188 ...    3.23464771    4.41258094
    4.8673515 ]


In [7]:
import torch
# Convert columns to categorical and get integer codes
state_ids = dfe['State'].astype('category').cat.codes
commodity_ids = dfe['Commodity'].astype('category').cat.codes
country_ids = dfe['Country'].astype('category').cat.codes
time_ids = dfe['Time'].astype('category').cat.codes  # Treat Time as categorical

# Convert to PyTorch tensors
state_tensor = torch.tensor(state_ids.values, dtype=torch.long)
time_tensor = torch.tensor(time_ids.values, dtype=torch.long)
commodity_tensor = torch.tensor(commodity_ids.values, dtype=torch.long)
country_tensor = torch.tensor(country_ids.values, dtype=torch.long)

# Combine tensors
combined_input = torch.stack([state_tensor, time_tensor, commodity_tensor, country_tensor], dim=1)

# Print shape and tensor
print(combined_input.shape)
print(combined_input)

# Display the first few rows of the DataFrame to verify
print("\nExports DataFrame:")
print(dfe.head())

torch.Size([104170, 4])
tensor([[  0,  96,   0,   0],
        [  0, 115,   0,   2],
        [  0, 193,   0,   1],
        ...,
        [  0,  69,  96,   8],
        [  0, 139,  96,   8],
        [  0,  17,  96,   8]])

Exports DataFrame:
     Time        Commodity     State       Country
0  Jul-16  01 Live Animals  Illinois        Africa
2  Jun-18  01 Live Animals  Illinois  Asia - South
5  Sep-09  01 Live Animals  Illinois  Asia - Other
6  Dec-11  01 Live Animals  Illinois  Asia - Other
7  Jan-12  01 Live Animals  Illinois  Asia - Other


In [1]:
# Import necessary libraries
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')  # Suppress warnings for cleaner output

# Set device for PyTorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


In [2]:
# Simplified S4 Layer
class S4Layer(nn.Module):
    def __init__(self, input_dim, state_dim, output_dim):
        super(S4Layer, self).__init__()
        self.input_dim = input_dim
        self.state_dim = state_dim
        self.output_dim = output_dim
        
        # State-space parameters
        self.A = nn.Parameter(torch.randn(state_dim, state_dim))
        self.B = nn.Parameter(torch.randn(state_dim, input_dim))
        self.C = nn.Parameter(torch.randn(output_dim, state_dim))
        self.D = nn.Parameter(torch.randn(output_dim, input_dim))
        
    def forward(self, x):
        # x: (batch, seq_len, input_dim)
        batch, seq_len, _ = x.size()
        state = torch.zeros(batch, self.state_dim, device=x.device)
        outputs = []
        
        # Discrete-time state-space update
        for t in range(seq_len):
            state = state @ self.A + x[:, t, :] @ self.B.T
            output = state @ self.C.T + x[:, t, :] @ self.D.T
            outputs.append(output)
        
        return torch.stack(outputs, dim=1)  # (batch, seq_len, output_dim)

# S4-based Demand Forecasting Model
class S4DemandModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, state_dim, output_dim):
        super(S4DemandModel, self).__init__()
        self.input_layer = nn.Linear(input_dim, hidden_dim)
        self.s4_layer = S4Layer(hidden_dim, state_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.relu(self.input_layer(x))
        x = self.s4_layer(x)
        x = self.output_layer(x)
        return x

print("S4 model defined")

S4 model defined


In [4]:
# Load and preprocess data
def load_and_preprocess_data():
    # Load CSV (exports only for now)
    dfe = pd.read_csv("custom_data/export/Illinois.csv", skiprows=1, header=0)
    
    # Print column names to debug
    print("Column names in CSV:", dfe.columns.tolist())
    
    # Identify the 'Time' column (case-insensitive, strip spaces)
    time_col = None
    for col in dfe.columns:
        if col.strip().lower() == 'time':
            time_col = col
            break
    
    if time_col is None:
        raise ValueError("No 'Time' column found in the CSV. Please check column names.")
    
    # Convert 'Time' to datetime and sort
    dfe[time_col] = pd.to_datetime(dfe[time_col], format='%b-%y', errors='coerce')
    if dfe[time_col].isna().any():
        print(f"Warning: Some '{time_col}' values could not be parsed. Filling with forward-fill.")
        dfe[time_col] = dfe[time_col].fillna(method='ffill')
    
    dfe = dfe.sort_values(['State', 'Commodity', 'Country', time_col])
    
    # Encode categorical variables
    dfe['State_id'] = dfe['State'].astype('category').cat.codes
    dfe['Commodity_id'] = dfe['Commodity'].astype('category').cat.codes
    dfe['Country_id'] = dfe['Country'].astype('category').cat.codes
    
    # Convert Time to numeric (months since 2000-01-01)
    reference_date = pd.to_datetime('2000-01-01')
    dfe['Time_id'] = ((dfe[time_col].dt.year - reference_date.year) * 12 + dfe[time_col].dt.month).astype(int)
    
    # Normalize Vessel Value ($US)
    scaler = StandardScaler()
    dfe['Vessel Value ($US)'] = scaler.fit_transform(dfe[['Vessel Value ($US)']].fillna(0))
    
    return dfe, scaler, time_col

# Execute preprocessing
dfe, scaler, time_col = load_and_preprocess_data()
print("DataFrame Head:")
print(dfe[['State', 'Commodity', 'Country', time_col, 'Vessel Value ($US)', 'Vessel SWT (kg)']].head())
print("\nUnique Time Values:", dfe[time_col].unique())

Column names in CSV: ['Current date: 07/30/2025 2:49 PM (Eastern Daylight Time)', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']


ValueError: No 'Time' column found in the CSV. Please check column names.