In [6]:
import dask.dataframe as dd
# Input locations. The trade-data path is a glob pattern, so every file it
# matches is read into a single Dask DataFrame.
trade_data_files = '/BACI_HS12_Y*.csv'
country_codes_file = '/country_codes_V202001.csv'
product_codes_file = '/product_codes_HS12_V202001.csv'

# All three CSVs are read with the same text encoding.
csv_encoding = 'latin1'

# Lazy reads: the product-code columns ('k' in the trade data, 'code' in the
# product table) are forced to object dtype so they are read as text rather
# than being inferred as numbers.
trade_data = dd.read_csv(
    trade_data_files,
    encoding=csv_encoding,
    dtype={'k': 'object'},
)
country_codes = dd.read_csv(country_codes_file, encoding=csv_encoding)
product_codes = dd.read_csv(
    product_codes_file,
    encoding=csv_encoding,
    dtype={'code': 'object'},
)

In [10]:
# Enrich each trade row with lookup information, all as left joins so that no
# trade row is dropped:
#   1. country details for column 'i',
#   2. country details for column 'j' -- the country columns now collide with
#      those from step 1, so step-1 columns receive the '_exporter' suffix and
#      the new ones '_importer',
#   3. the product table, matched on the product code 'k'.
trade_data = (
    trade_data
    .merge(country_codes, how='left', left_on='i', right_on='country_code')
    .merge(
        country_codes,
        how='left',
        left_on='j',
        right_on='country_code',
        suffixes=('_exporter', '_importer'),
    )
    .merge(product_codes, how='left', left_on='k', right_on='code')
)

In [12]:
# Economic indicators: only the two join keys (code_wb, year) and the two
# measures (gdp, population) are read from the file.
econ_columns = ['code_wb', 'year', 'gdp', 'population']
econ_data = dd.read_csv('/EconMap_3.1_ssp2.csv', usecols=econ_columns)

In [13]:
# Give the join keys matching dtypes on both sides before merging:
# the year columns become int and the ISO-3 / World Bank code columns become str.
econ_data = econ_data.astype({'year': int, 'code_wb': str})

trade_data = trade_data.astype({
    't': int,
    'iso_3digit_alpha_exporter': str,
    'iso_3digit_alpha_importer': str,
})

In [14]:
# Left-join the economic indicators onto the exporter side: a row matches when
# the exporter's ISO-3 code equals code_wb and the trade year equals year.
exporter_keys = ['iso_3digit_alpha_exporter', 't']
econ_keys = ['code_wb', 'year']
trade_data = trade_data.merge(econ_data, how='left', left_on=exporter_keys, right_on=econ_keys)

In [15]:
# Tag the freshly merged indicators as exporter-side so the importer merge can
# reuse the plain 'gdp' / 'population' names.
exporter_names = {'gdp': 'gdp_exporter', 'population': 'population_exporter'}
trade_data = trade_data.rename(columns=exporter_names)

In [16]:
# Remove the econ-side join keys brought in by the exporter merge.
trade_data = trade_data.drop(columns=['code_wb', 'year'])

In [17]:
# Left-join the economic indicators onto the importer side: a row matches when
# the importer's ISO-3 code equals code_wb and the trade year equals year.
importer_keys = ['iso_3digit_alpha_importer', 't']
econ_keys = ['code_wb', 'year']
trade_data = trade_data.merge(econ_data, how='left', left_on=importer_keys, right_on=econ_keys)

In [18]:
# Tag the freshly merged indicators as importer-side.
importer_names = {'gdp': 'gdp_importer', 'population': 'population_importer'}
trade_data = trade_data.rename(columns=importer_names)

In [19]:
# Remove the econ-side join keys brought in by the importer merge.
trade_data = trade_data.drop(columns=['code_wb', 'year'])

In [20]:
# Show the merged frame's lazy representation: column names, dtypes and
# partition count only -- the row values appear as "...".
trade_data

Unnamed: 0_level_0,t,i,j,k,v,q,country_code_exporter,country_name_abbreviation_exporter,country_name_full_exporter,iso_2digit_alpha_exporter,iso_3digit_alpha_exporter,country_code_importer,country_name_abbreviation_importer,country_name_full_importer,iso_2digit_alpha_importer,iso_3digit_alpha_importer,code,description,gdp_exporter,population_exporter,gdp_importer,population_importer
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,int64,int64,int64,string,float64,float64,int64,string,string,string,string,int64,string,string,string,string,string,string,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [21]:
# Columns removed before modelling: the 'v' column, every country lookup
# column for both exporter and importer, and the product lookup columns.
# What remains is t, i, j, k, q plus the four gdp/population indicators.
columns_to_drop = [
    'v',
    'country_code_exporter',
    'country_name_abbreviation_exporter',
    'country_name_full_exporter',
    'iso_2digit_alpha_exporter',
    'iso_3digit_alpha_exporter',
    'country_code_importer',
    'country_name_abbreviation_importer',
    'country_name_full_importer',
    'iso_2digit_alpha_importer',
    'iso_3digit_alpha_importer',
    'code',
    'description',
]
new_data = trade_data.drop(columns=columns_to_drop)

In [22]:
# The product code 'k' was read as text; turn it into an integer column.
new_data = new_data.assign(k=new_data['k'].astype(int))

In [28]:
# Halve the width of every numeric column: the four integer columns go to
# int32 and the five floating-point columns go to float32.
int_cols = ['t', 'i', 'j', 'k']
float_cols = ['q', 'gdp_exporter', 'population_exporter', 'gdp_importer', 'population_importer']

narrow_dtypes = {name: 'int32' for name in int_cols}
narrow_dtypes.update({name: 'float32' for name in float_cols})

new_data = new_data.astype(narrow_dtypes)

In [29]:
# Show the lazy representation to confirm the remaining columns and their
# downcast dtypes.
new_data

Unnamed: 0_level_0,t,i,j,k,q,gdp_exporter,population_exporter,gdp_importer,population_importer
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,int32,int32,int32,int32,float32,float32,float32,float32,float32
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


In [30]:
# Discard every row that has at least one missing value, then persist the
# result so the cleaned frame is computed once and held by Dask under the
# name new_data_clean (new_data itself is left untouched).
new_data_clean = new_data.dropna().persist()

In [31]:
# Show the lazy representation of the cleaned, persisted frame.
new_data_clean

Unnamed: 0_level_0,t,i,j,k,q,gdp_exporter,population_exporter,gdp_importer,population_importer
npartitions=3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,int32,int32,int32,int32,float32,float32,float32,float32,float32
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...


In [33]:
pip install dask-ml

Collecting dask-ml
  Downloading dask_ml-2024.4.4-py3-none-any.whl (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.8/149.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl (13 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask-ml)
  Downloading sparse-0.15.1-py2.py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse, dask-glm, dask-ml
Successfully installed dask-glm-0.3.2 dask-ml-2024.4.4 sparse-0.15.1


In [35]:
from dask_ml.model_selection import train_test_split

# The model predicts 'q'; every other remaining column is used as a feature.
target_column = 'q'
y = new_data_clean[target_column]
X = new_data_clean.drop(columns=[target_column])

# Shuffled 80/20 train/test split with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [36]:
from dask_ml.preprocessing import StandardScaler
from dask_ml.wrappers import Incremental

# Feature engineering: standardise every feature column.
#
# The Dask-ML scaler is used instead of scikit-learn's because it operates on
# Dask collections directly. The scikit-learn scaler first converts its input
# to a single in-memory NumPy array, which then had to be wrapped back into a
# Dask DataFrame with dd.from_array; with the Dask-ML scaler that round trip
# is unnecessary.
#
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

We'll use `GridSearchCV` from Dask-ML to find the best hyperparameters for the Random Forest and Gradient Boosting models. The neural network is typically more complex and more expensive to train, so it is fitted once with a single fixed configuration instead of being grid-searched.

In [37]:
from dask_ml.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor

# The scikit-learn estimators below need in-memory data. Materialise the
# training split once and reuse it for all three models instead of calling
# .compute() again before every fit.
X_train_local = X_train_scaled.compute()
y_train_local = y_train.compute()

# Random Forest: 3-fold grid search over tree count and maximum depth.
rf = RandomForestRegressor(random_state=42)
param_grid_rf = {'n_estimators': [50, 100], 'max_depth': [10, 20]}
grid_rf = GridSearchCV(rf, param_grid_rf, cv=3)
grid_rf.fit(X_train_local, y_train_local)

# Gradient Boosting: 3-fold grid search over tree count and learning rate.
gb = GradientBoostingRegressor(random_state=42)
param_grid_gb = {'n_estimators': [50, 100], 'learning_rate': [0.1, 0.01]}
grid_gb = GridSearchCV(gb, param_grid_gb, cv=3)
grid_gb.fit(X_train_local, y_train_local)

# Neural Network: a single fit with a fixed configuration, no grid search.
nn = MLPRegressor(random_state=42, max_iter=500)
nn.fit(X_train_local, y_train_local)

In [38]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _regression_metrics(y_true, y_pred):
    """Return the (MSE, MAE, R^2) triple for one set of predictions."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Materialise the test split once; previously the features were computed for
# every model and the target for every single metric.
X_test_local = X_test_scaled.compute()
y_test_local = y_test.compute()

# Predictions
y_pred_rf = grid_rf.predict(X_test_local)
y_pred_gb = grid_gb.predict(X_test_local)
y_pred_nn = nn.predict(X_test_local)

# Evaluation (same result names as before, so later cells can keep using them)
mse_rf, mae_rf, r2_rf = _regression_metrics(y_test_local, y_pred_rf)
mse_gb, mae_gb, r2_gb = _regression_metrics(y_test_local, y_pred_gb)
mse_nn, mae_nn, r2_nn = _regression_metrics(y_test_local, y_pred_nn)

print("Random Forest - MSE:", mse_rf, "MAE:", mae_rf, "R^2:", r2_rf)
print("Gradient Boosting - MSE:", mse_gb, "MAE:", mae_gb, "R^2:", r2_gb)
print("Neural Network - MSE:", mse_nn, "MAE:", mae_nn, "R^2:", r2_nn)

Random Forest - MSE: 248434160117.4171 MAE: 3726.5339390363015 R^2: -0.7616149848191338
Gradient Boosting - MSE: 154185221037.16415 MAE: 6320.245772361284 R^2: -0.09330776286298836
Neural Network - MSE: 140923606551.18076 MAE: 5813.59412791831 R^2: 0.000728656244437853
