In [8]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression, RFE
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# Set random seed for reproducibility.
# This seeds NumPy's legacy global RNG (np.random.*) only.
# NOTE(review): estimators that accept their own `random_state` / seed argument
# presumably still need it passed explicitly where the models are built — confirm.
np.random.seed(42)


In [9]:
# 1. DATA PREPARATION AND PROBLEM FORMULATION
#
# Reads the cleaned trade CSV into `df` and keeps the rows flagged by the
# 'Is_Country' column in `country_df`, then prints the frame's size and the
# distinct years it contains.

print("="*80)
print("WORLD TRADE PREDICTIVE MODELING")
print("="*80)

# Load cleaned data; 'Is_Country' is used directly as a boolean row mask.
df = pd.read_csv('../notebooks/world_trade_cleaned.csv')
country_df = df[df['Is_Country']].copy()

# .tolist() turns NumPy scalars into plain Python numbers, so the list prints as
# [1988, 1989, ...] instead of [np.int64(1988), np.int64(1989), ...].
years_available = sorted(country_df['Year_Value'].unique().tolist())

print(f"Dataset loaded: {country_df.shape[0]} rows, {country_df.shape[1]} columns")
print(f"Years available: {years_available}")


WORLD TRADE PREDICTIVE MODELING
Dataset loaded: 7695 rows, 42 columns
Years available: [np.int64(1988), np.int64(1989), np.int64(1990), np.int64(1991), np.int64(1992), np.int64(1993), np.int64(1994), np.int64(1995), np.int64(1996), np.int64(1997), np.int64(1998), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011), np.int64(2012), np.int64(2013), np.int64(2014), np.int64(2015), np.int64(2016), np.int64(2017), np.int64(2018), np.int64(2019), np.int64(2020), np.int64(2021)]


In [10]:
# 2. PROBLEM FORMULATION

# Section banner followed by the list of modeling approaches announced for this
# notebook. The text below is purely informational; nothing is computed here.
formulation_lines = (
    "\n2. PROBLEM FORMULATION",
    "-"*40,
    "Three modeling approaches will be implemented:",
    "1. Export Value Prediction (Regression)",
    "2. Trade Balance Direction Prediction (Classification)",
    "3. Trade Growth Rate Prediction (Regression)",
)

# One print call per entry, so the emitted output matches line for line.
for formulation_line in formulation_lines:
    print(formulation_line)




2. PROBLEM FORMULATION
----------------------------------------
Three modeling approaches will be implemented:
1. Export Value Prediction (Regression)
2. Trade Balance Direction Prediction (Classification)
3. Trade Growth Rate Prediction (Regression)


In [11]:
# 3. FEATURE ENGINEERING
#
# Extends `country_df` with:
#   * 1- and 2-step lags of the trade/tariff columns, per partner
#   * export/import growth rates and 3-row rolling means, per partner
#   * export/import ratio and tariff differential
#   * region-by-year mean/std of exports and imports, plus each row's value
#     relative to its regional mean
#
# The cell can be re-run on its own output: previously merged regional columns
# are dropped before the merge, and every other column is simply overwritten.

print("\n3. FEATURE ENGINEERING")
print("-"*40)

export_col = 'Export (US$ Thousand)_imputed'
import_col = 'Import (US$ Thousand)_imputed'

# Rows must be ordered by partner and year so that shift / pct_change / rolling
# operate on each partner's history in chronological order.
country_df = country_df.sort_values(['Partner Name', 'Year_Value'])

# Create lag features (previous rows' values within each partner).
# NOTE(review): shift() moves by row, not by calendar year. If a partner has
# missing years, "lag1" is the previous available observation — confirm the
# panel has no gaps or reindex by year first.
lag_features = [
    export_col,
    import_col,
    'Trade_Balance',
    'Total_Trade',
    'AHS Simple Average (%)',
    'MFN Simple Average (%)'
]

for feature in lag_features:
    # Columns missing from the frame are skipped rather than raising.
    if feature in country_df.columns:
        by_partner = country_df.groupby('Partner Name')[feature]
        country_df[f'{feature}_lag1'] = by_partner.shift(1)
        country_df[f'{feature}_lag2'] = by_partner.shift(2)

# Create growth rate features (row-over-row relative change within a partner)
country_df['Export_Growth_Rate'] = country_df.groupby('Partner Name')[export_col].pct_change()
country_df['Import_Growth_Rate'] = country_df.groupby('Partner Name')[import_col].pct_change()

# Create rolling statistics; min_periods=1 means the first rows of each partner
# average over fewer than three observations instead of being NaN.
country_df['Export_3yr_avg'] = country_df.groupby('Partner Name')[export_col].transform(
    lambda x: x.rolling(3, min_periods=1).mean())
country_df['Import_3yr_avg'] = country_df.groupby('Partner Name')[import_col].transform(
    lambda x: x.rolling(3, min_periods=1).mean())

# Create interaction features; the +1 in the denominator avoids division by zero.
country_df['Export_Import_Ratio'] = country_df[export_col] / (country_df[import_col] + 1)
country_df['Tariff_Differential'] = country_df['AHS Simple Average (%)'] - country_df['MFN Simple Average (%)']

# Create regional aggregates as features
regional_cols = ['Region_Export_Mean', 'Region_Export_Std',
                 'Region_Import_Mean', 'Region_Import_Std']

# Drop aggregates left over from an earlier run of this cell. Without this, the
# merge below would emit *_x / *_y suffixed duplicates and the
# 'Region_Export_Mean' lookups after it would raise KeyError.
country_df = country_df.drop(columns=regional_cols, errors='ignore')

regional_stats = country_df.groupby(['Region', 'Year_Value']).agg({
    export_col: ['mean', 'std'],
    import_col: ['mean', 'std']
}).reset_index()

# Flatten the two-level column index produced by the multi-function agg.
regional_stats.columns = ['Region', 'Year_Value'] + regional_cols

country_df = pd.merge(country_df, regional_stats, on=['Region', 'Year_Value'], how='left')

# Normalize by regional statistics
country_df['Export_Relative_to_Region'] = country_df[export_col] / country_df['Region_Export_Mean']
country_df['Import_Relative_to_Region'] = country_df[import_col] / country_df['Region_Import_Mean']

# A zero denominator (previous-year value or regional mean) yields +/-inf, which
# dropna() does not remove. Mark those entries as missing instead.
unbounded_cols = ['Export_Growth_Rate', 'Import_Growth_Rate',
                  'Export_Relative_to_Region', 'Import_Relative_to_Region']
country_df[unbounded_cols] = country_df[unbounded_cols].replace([np.inf, -np.inf], np.nan)

print(f"Total features after engineering: {len(country_df.columns)}")
print(f"Sample engineered features: {list(country_df.columns[-10:])}")




3. FEATURE ENGINEERING
----------------------------------------
Total features after engineering: 66
Sample engineered features: ['Export_3yr_avg', 'Import_3yr_avg', 'Export_Import_Ratio', 'Tariff_Differential', 'Region_Export_Mean', 'Region_Export_Std', 'Region_Import_Mean', 'Region_Import_Std', 'Export_Relative_to_Region', 'Import_Relative_to_Region']


In [None]:
# 4. PROBLEM 1: EXPORT VALUE PREDICTION
#
# Builds the regression dataset for predicting the current-year export value,
# splits it temporally (latest year held out as the test set) and standardizes
# the features. Produces: export_features, target, export_df, X_export, y_export,
# train_mask, X_/y_ train/test splits, scaler_export and the scaled matrices.

print("\n4. PROBLEM 1: EXPORT VALUE PREDICTION (REGRESSION)")
print("-"*40)

# Target variable
target = 'Export (US$ Thousand)_imputed'

# These engineered columns are computed from the current-year export (the
# target): ratio = export / (import + 1), relative = export / regional mean, and
# the regional mean itself averages the target. Using them as-is leaks the
# target (ratio together with current-year import reconstructs it exactly), so
# only their previous-observation values are offered to the model.
leaky_features = [
    'Export_Import_Ratio',
    'Export_Relative_to_Region',
    'Region_Export_Mean'
]

# Work on a sorted copy so `country_df` is left untouched and the cell can be
# re-run; sorting guarantees shift(1) picks each partner's preceding row.
export_source = country_df.sort_values(['Partner Name', 'Year_Value'])
for feature in leaky_features:
    export_source[f'{feature}_lag1'] = export_source.groupby('Partner Name')[feature].shift(1)

# Prepare dataset for export prediction
export_features = [
    # Lag features
    'Export (US$ Thousand)_imputed_lag1',
    'Import (US$ Thousand)_imputed_lag1',
    'Trade_Balance_lag1',

    # Current year features (excluding export)
    'Import (US$ Thousand)_imputed',
    'AHS Simple Average (%)',
    'MFN Simple Average (%)',

    # Engineered features (lagged where they depend on the target)
    'Export_Import_Ratio_lag1',
    'Tariff_Differential',
    'Export_Relative_to_Region_lag1',

    # Regional features
    'Region_Export_Mean_lag1',
    'Region_Import_Mean'
]

# Filter data with required features
export_df = export_source[export_features + [target] + ['Year_Value', 'Partner Name']].copy()

# dropna() ignores +/-inf, which would later make StandardScaler raise; turn
# infinities in the numeric columns into NaN so they are dropped as well.
numeric_cols = export_features + [target]
export_df[numeric_cols] = export_df[numeric_cols].replace([np.inf, -np.inf], np.nan)
export_df = export_df.dropna()

print(f"Export prediction dataset: {export_df.shape}")

# Separate features and target
X_export = export_df[export_features]
y_export = export_df[target]

# Split data (by year for temporal validation): every year before the latest
# one is training data, the latest year is the test set.
train_mask = export_df['Year_Value'] < export_df['Year_Value'].max()
X_train_export = X_export[train_mask]
X_test_export = X_export[~train_mask]
y_train_export = y_export[train_mask]
y_test_export = y_export[~train_mask]

print(f"Train size: {X_train_export.shape}, Test size: {X_test_export.shape}")

# Scale features; the scaler is fitted on the training rows only and then
# applied to the test rows, so test statistics do not influence the scaling.
scaler_export = StandardScaler()
X_train_scaled_export = scaler_export.fit_transform(X_train_export)
X_test_scaled_export = scaler_export.transform(X_test_export)

