In [1]:
## Env alias - mlcw

## Utilities
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error

In [15]:
# Read the full-feature dataset (CSV, path relative to the notebook) into a DataFrame
data = pd.read_csv(filepath_or_buffer='my_data/dataset_full_features.csv')

### Pre-processing

In [16]:
# Drop 'Exchange Data' and 'Food Inflation', as they do not provide valuable
# information. errors='ignore' keeps this cell safe to re-run: once the columns
# are gone, a plain drop would raise KeyError on the second execution.
data = data.drop(columns=['Exchange Data', 'Food Inflation'], errors='ignore')

# Feature columns: every remaining column except the target ('Export Value')
# and 'Area'. Note that 'Year' is not excluded, so it stays among the features.
feature_columns = data.columns.difference(['Export Value', 'Area'])

# Chronological split on 'Year'. Series.between is inclusive at both ends,
# so each range includes its first and last year.
TRAIN_YEARS = (1980, 2017)
TEST_YEARS = (2018, 2022)
train_mask = data['Year'].between(*TRAIN_YEARS)
test_mask = data['Year'].between(*TEST_YEARS)

# Training set from 1980 to 2017
X_train = data.loc[train_mask, feature_columns]
y_train = data.loc[train_mask, 'Export Value']

# Testing set from 2018 to 2022
X_test = data.loc[test_mask, feature_columns]
y_test = data.loc[test_mask, 'Export Value']

### Scaling the data and training the model

In [20]:
# --- Feature scaling ---
# Learn the standardisation statistics from the training features only,
# then apply those same statistics to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Target scaling ---
# StandardScaler expects 2-D input, so the target Series are reshaped into
# single-column arrays first.
y_train_col = y_train.to_numpy().reshape(-1, 1)
y_test_col = y_test.to_numpy().reshape(-1, 1)
target_scaler = StandardScaler().fit(y_train_col)
y_train_scaled = target_scaler.transform(y_train_col)
y_test_scaled = target_scaler.transform(y_test_col)

# --- Model ---
# Two hidden layers (100 and 50 units), fixed random_state for repeatable runs.
mlp = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.1,
    max_iter=500,
    early_stopping=True,
    random_state=5,
)

# fit() wants a 1-D target, so flatten the scaled column back out
mlp.fit(X_train_scaled, y_train_scaled.reshape(-1))

# --- Prediction for 2018 to 2022 ---
# The model predicts in scaled units; undo the target scaling to get back
# to the original 'Export Value' scale.
y_pred_scaled = mlp.predict(X_test_scaled)
y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

# --- Evaluation: Mean Squared Error (MSE) in original units ---
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error on Test Set:", mse)

# Optional: side-by-side actual vs predicted values (keeps y_test's index)
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred.ravel())
print(results.head())

Mean Squared Error on Test Set: 2734255323835.7236
     Actual      Predicted
38   591.29  127061.364117
39  8996.17  114620.992069
40  2044.86  132612.705539
41  1200.19   76635.415979
42  1168.42 -137481.058510


In [21]:
# Error metrics for the test-set predictions, in the original target units.
# Relies on y_test and y_pred already being defined by the training cell.
mae = mean_absolute_error(y_test, y_pred)

# Take the square root of the MSE directly instead of passing squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# it raises TypeError. This form works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

r_squared = r2_score(y_test, y_pred)

print("MAE:", mae)
print("RMSE:", rmse)
print("R-squared:", r_squared)

MAE: 428901.78384146636
RMSE: 1653558.3823487223
R-squared: 0.6941543023681334




In [19]:
# Write the actual-vs-predicted table to a CSV file; index=False leaves the
# DataFrame's row index out of the file
results.to_csv(path_or_buf='my_data/model_predictions.csv', index=False)