In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import json
from datetime import datetime

# Advanced ML imports
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
import shap
import joblib
import pickle

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
import scipy.stats as stats
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Reproducibility: seed NumPy's global random number generator for the notebook.
np.random.seed(42)

# Plot appearance: apply the matplotlib style sheet first, then set seaborn's
# palette (kept in this relative order, as in the original cell).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


In [2]:
# 1. SETUP AND DATA LOADING
#
# Loads the artifacts written by the training notebook: two fitted models,
# their scalers, the cleaned dataset and the two feature-name lists.
# Names defined for later cells: best_export_model, best_class_model,
# export_scaler, class_scaler, df, country_df, export_features, class_features.

print("="*80)
print("ADVANCED MODEL BUILDING AND EVALUATION")
print("="*80)


def _read_feature_list(path):
    """Return the non-blank lines of ``path`` with surrounding whitespace removed.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped so they cannot turn into an empty-string column name that would
    fail later when the list is used to select DataFrame columns.
    """
    with open(path, 'r') as f:
        stripped = (line.strip() for line in f)
        return [name for name in stripped if name]


# Load pre-trained models and data
print("Loading pre-trained models and data...")

try:
    # NOTE: joblib.load unpickles the file and can therefore execute arbitrary
    # code; only load artifacts that were produced by the training notebook.
    best_export_model = joblib.load('best_export_model.pkl')
    best_class_model = joblib.load('best_classification_model.pkl')
    export_scaler = joblib.load('export_scaler.pkl')
    class_scaler = joblib.load('classification_scaler.pkl')

    # Load training data; 'Is_Country' is used directly as a row mask.
    df = pd.read_csv('world_trade_cleaned.csv')
    country_df = df[df['Is_Country']].copy()

    # Load feature lists (one feature name per line)
    export_features = _read_feature_list('export_features.txt')
    class_features = _read_feature_list('classification_features.txt')

    print("✓ Models and data loaded successfully")

except Exception as e:
    # Report the cause and how to fix it, then re-raise so that later cells
    # do not run against missing variables.
    print(f"Error loading saved models: {e}")
    print("Please run Notebook 03 first to train models")
    raise


ADVANCED MODEL BUILDING AND EVALUATION
Loading pre-trained models and data...
Error loading saved models: [Errno 2] No such file or directory: 'best_export_model.pkl'
Please run Notebook 03 first to train models


FileNotFoundError: [Errno 2] No such file or directory: 'best_export_model.pkl'

In [None]:
# 2. HYPERPARAMETER OPTIMIZATION
#
# Tunes a RandomForestRegressor for the export target with Optuna, using
# 3-fold cross-validated R² on (at most) 1000 sampled rows, then refits the
# best configuration on that sample and saves it.
# Requires export_features and export_scaler from the data-loading cell.

print("\n2. HYPERPARAMETER OPTIMIZATION")
print("-"*40)

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
# cross_val_score is used by the objective below but is not part of the
# notebook's top import cell, so it is imported here.
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
import optuna

print("Performing advanced hyperparameter optimization...")

target_col = 'Export (US$ Thousand)_imputed'

# Prepare data (using subset for faster optimization)
countries_raw = pd.read_csv('world_trade_countries_only.csv')

if target_col in countries_raw.columns:
    if target_col in export_features:
        print("Warning: the target column is listed among the features (possible leakage)")
    # Select features and target together *before* dropping rows so that X and
    # y stay aligned; dict.fromkeys de-duplicates while preserving order in
    # case the target is also listed as a feature.
    model_columns = list(dict.fromkeys(export_features + [target_col]))
    model_frame = countries_raw[model_columns].dropna()
    X_train_export = model_frame[export_features]
    y_train_export = model_frame[target_col]
else:
    # No target available in this file: keep the features only, which leads to
    # the "Insufficient data" branch below.
    X_train_export = countries_raw[export_features].dropna()
    y_train_export = None

if len(X_train_export) > 1000:
    X_train_sample = X_train_export.sample(1000, random_state=42)
    # Pick the matching target rows by index label.
    y_train_sample = (
        y_train_export.loc[X_train_sample.index] if y_train_export is not None else None
    )
else:
    X_train_sample = X_train_export
    y_train_sample = y_train_export

if y_train_sample is not None and len(X_train_sample) > 100:
    # Scale features with the scaler saved by the training notebook
    X_scaled_sample = export_scaler.transform(X_train_sample)

    # Define Optuna objective function
    def objective(trial):
        """Return the mean 3-fold CV R² of a random forest built from ``trial``'s suggestions."""
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False])
        }

        model = RandomForestRegressor(**params, random_state=42, n_jobs=-1)

        # Use cross-validation
        scores = cross_val_score(model, X_scaled_sample, y_train_sample,
                                 cv=3, scoring='r2', n_jobs=-1)
        return scores.mean()

    # Run optimization; the sampler is seeded so repeated runs propose the
    # same sequence of trials.
    print("Running Bayesian optimization with Optuna...")
    study = optuna.create_study(
        direction='maximize',
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=20, show_progress_bar=True)

    print(f"\nBest hyperparameters found:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")
    print(f"Best CV R²: {study.best_value:.4f}")

    # Train optimized model on the same scaled sample used for the search
    best_params = study.best_params
    optimized_model = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
    optimized_model.fit(X_scaled_sample, y_train_sample)

    # Save optimized model
    joblib.dump(optimized_model, 'optimized_export_model.pkl')
    print("✓ Optimized model saved")
else:
    print("Insufficient data for hyperparameter optimization")



In [3]:
# 3. MODEL COMPARISON WITH STATISTICAL TESTS
#
# Scores several regressors on the rows of the most recent year and compares
# their squared errors pairwise with a t-test.
# Requires best_export_model, export_scaler, export_features and country_df
# from the data-loading cell.

print("\n3. MODEL COMPARISON WITH STATISTICAL TESTS")
print("-"*40)

# Imported here as well so this cell does not depend on the optimization cell
# having been executed first.
from xgboost import XGBRegressor

# Load or create multiple model predictions.
# 'Random Forest' is the pre-trained export model and is used as loaded; the
# other entries are (re)fitted below before predicting.
models_to_compare = {
    'Random Forest': best_export_model,
    'Linear Regression': joblib.load('best_export_model.pkl') if 'linear' in str(best_export_model).lower() else None,
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

target_col = 'Export (US$ Thousand)_imputed'

# Prepare evaluation data: the latest year is the test set, earlier years are
# used to fit the comparison models. dict.fromkeys de-duplicates column names
# while preserving order.
eval_columns = list(dict.fromkeys(export_features + [target_col, 'Year_Value']))
eval_data = country_df[eval_columns].dropna()
latest_year = eval_data['Year_Value'].max()
test_data = eval_data[eval_data['Year_Value'] == latest_year]
train_data = eval_data[eval_data['Year_Value'] < latest_year]

if len(test_data) > 0:
    X_test = test_data[export_features]
    y_test = test_data[target_col]
    X_test_scaled = export_scaler.transform(X_test)

    # Fit data for the comparison models. Fitting on earlier years instead of
    # on the test rows keeps the reported R² from being scored on data the
    # model was trained on.
    # NOTE(review): whether the pre-trained model itself saw the latest year
    # during training cannot be determined from this notebook - confirm.
    if len(train_data) > 0:
        X_fit_scaled = export_scaler.transform(train_data[export_features])
        y_fit = train_data[target_col]
    else:
        X_fit_scaled = None
        y_fit = None

    # Collect predictions
    predictions = {}
    for name, model in models_to_compare.items():
        if model is None:
            continue
        try:
            if name != 'Random Forest':
                if X_fit_scaled is None:
                    print(f"{name:20} skipped: no earlier-year rows to fit on")
                    continue
                model.fit(X_fit_scaled, y_fit)
            preds = model.predict(X_test_scaled)
            predictions[name] = preds
            print(f"{name:20} R²: {r2_score(y_test, preds):.4f}")
        except Exception as exc:
            # Keep comparing the remaining models, but say why this one was dropped.
            print(f"{name:20} failed: {exc}")
            continue

    # Pairwise comparison: one-sample t-test on the difference in squared
    # errors (a simplified stand-in for a Diebold-Mariano test; no
    # autocorrelation correction is applied).
    if len(predictions) >= 2:
        print("\nPerforming model comparison tests...")

        # Calculate errors
        errors = {}
        for name, preds in predictions.items():
            errors[name] = y_test - preds

        # Compare each pair
        model_names = list(errors.keys())
        for i in range(len(model_names)):
            for j in range(i+1, len(model_names)):
                m1, m2 = model_names[i], model_names[j]

                # Calculate difference in squared errors
                diff = errors[m1]**2 - errors[m2]**2

                # Perform t-test of "mean difference is zero"
                t_stat, p_value = stats.ttest_1samp(diff, 0)

                print(f"\n{m1} vs {m2}:")
                print(f"  t-statistic: {t_stat:.4f}")
                print(f"  p-value: {p_value:.4f}")
                if p_value < 0.05:
                    # A negative mean difference means m1 has the smaller squared error.
                    better = m1 if diff.mean() < 0 else m2
                    print(f"  Statistically significant difference - {better} is better")
                else:
                    print("  No statistically significant difference")
    else:
        print("\nFewer than two models produced predictions; skipping comparison tests")
else:
    print("No complete rows available for the most recent year; skipping model comparison")





3. MODEL COMPARISON WITH STATISTICAL TESTS
----------------------------------------


NameError: name 'best_export_model' is not defined