In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

## Load datasets

In [None]:
def to_int(x):
    """Convert a cell value to ``int``, tolerating whitespace thousands separators.

    Parameters
    ----------
    x : str, int, float or missing
        Raw cell value, e.g. ``"1 234"``, ``1234``, ``1234.0`` or ``NaN``.

    Returns
    -------
    int or missing
        Missing values (as detected by ``pd.isna``) are returned unchanged;
        everything else is returned as an integer.

    Raises
    ------
    ValueError
        If a string does not represent an integer once whitespace is removed.
    """
    if pd.isna(x):
        return x
    if isinstance(x, int):
        return x
    # NumPy integers and floats have no ``.replace`` method; convert them
    # directly instead of failing with AttributeError.
    if isinstance(x, (float, np.integer, np.floating)):
        return int(x)
    # ``split()`` with no argument splits on any Unicode whitespace, so regular
    # spaces and non-breaking spaces used as thousands separators are removed.
    return int("".join(x.split()))

In [None]:
# Load the training set and pass every `Month 4` value through `to_int`.
df = pd.read_csv("datasets/train.csv")
df['Month 4'] = df['Month 4'].apply(to_int)
df

## Extra Datasets

In [None]:
# Read the GSCPI table and split its `Year-Month` key into integer `Year`
# and `Month` columns; the combined key column is dropped afterwards.
gspci = pd.read_csv("datasets/extra-dataset/GSCPI_data.csv", sep=",")
year_month_parts = gspci['Year-Month'].str.split('-')
gspci['Year'] = year_month_parts.str[0].astype(int)
gspci['Month'] = year_month_parts.str[1].astype(int)
gspci = gspci.drop(columns=['Year-Month'])
gspci.head(3)

In [None]:
# Read the LPI table without its `Unnamed: 0` and `ID` columns, then relabel
# 'Taiwan, China' as 'Taiwan'.
# NOTE(review): presumably the relabel makes the name match `country_codes` — confirm.
lpi = pd.read_csv("datasets/extra-dataset/LPIextend.csv").drop(columns=["Unnamed: 0", "ID"])
is_taiwan_china = lpi['Country'].eq('Taiwan, China')
lpi.loc[is_taiwan_china, 'Country'] = 'Taiwan'
lpi.head(3)

In [None]:
# Load the World Bank economic data file and preview its first rows.
wb_eco = pd.read_csv("datasets/extra-dataset/worldbank_economic_data.csv")
wb_eco.head(3)

In [None]:
# Read the World Bank inflation table and split `Year-Month` into `Year` and
# `Month`. Both parts are cast to int, matching how the GSCPI table is
# prepared; left as strings ('2020', '01') they would never compare equal to
# integer years/months.
wb_inf = pd.read_csv("datasets/extra-dataset/worldbank_inflation_data.csv")
wb_inf['Year'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[0]).astype(int)
wb_inf['Month'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[1]).astype(int)
wb_inf.drop(columns=['Year-Month'], inplace=True)
wb_inf.head(3)

## Add GSCPI features from the extra datasets

In [None]:
def get_mean_gspci(date):
    """Average the GSCPI over the months named by a period label.

    Parameters
    ----------
    date : str
        Label of the form ``"<months> <year>"`` where ``<months>`` is one of
        ``'jan-apr'``, ``'may-jul'``, ``'may-aug'`` or ``'sep-dec'``.

    Returns
    -------
    float
        Mean of the ``GSCPI`` column over the rows of the module-level
        ``gspci`` frame whose ``Year`` and ``Month`` fall in the period.
        An unrecognised month label selects no rows, so the result is NaN.
    """
    period_months = {
        'may-aug': [5, 6, 7, 8],
        'sep-dec': [9, 10, 11, 12],
        'jan-apr': [1, 2, 3, 4],
        'may-jul': [5, 6, 7],
    }
    months, year = date.split(" ")
    year = int(year)
    in_period = (gspci.Year == year) & gspci.Month.isin(period_months.get(months, []))
    return gspci.loc[in_period].GSCPI.mean()

In [None]:
# Float placeholder: the values written into this column afterwards are GSCPI
# means (floats), so an integer column would have to be upcast on assignment.
df['gscpi'] = 0.0

In [None]:
# The mean GSCPI depends only on the period label, so it is computed once per
# distinct `Date` and broadcast to every row carrying that label.
for period in df['Date'].unique():
    rows_in_period = df['Date'] == period
    df.loc[rows_in_period, 'gscpi'] = get_mean_gspci(period)

### Add country codes to extra datasets

In [None]:
# Lookup table read by `get_country_code`, which uses its `Name` and `Code` columns.
country_codes = pd.read_csv('datasets/country_codes.txt', sep=',')
country_codes.head(3)

In [None]:
def get_country_code(country, codes=None):
    """Look up the code for a country name.

    An exact name match is preferred. Only when there is none does the lookup
    fall back to the first entry whose name contains ``country`` as a literal
    substring; otherwise a short name could resolve to a longer, unrelated
    entry listed earlier (e.g. "India" inside "British Indian Ocean Territory").

    Parameters
    ----------
    country : str
        Country name to resolve.
    codes : pandas.DataFrame, optional
        Table with ``Name`` and ``Code`` columns. Defaults to the module-level
        ``country_codes`` frame.

    Returns
    -------
    The ``Code`` value of the matching row, or ``"NoCode"`` when nothing
    matches (including when ``country`` is missing).
    """
    if codes is None:
        codes = country_codes
    if pd.isna(country):
        return "NoCode"

    exact = codes.loc[codes.Name == country]
    if len(exact) > 0:
        return exact.Code.iloc[0]

    # regex=False: names may contain regex metacharacters such as "(", ")" or
    # "." and must be matched literally. na=False: rows with a missing name
    # never match instead of producing NaN in the boolean mask.
    partial = codes.loc[codes.Name.str.contains(country, regex=False, na=False)]
    if len(partial) == 0:
        return "NoCode"
    return partial.Code.iloc[0]

def insert_code(df, index):
    """Add a ``Country_code`` column derived from ``df.Country``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Country`` column; it is modified in place.
    index : int
        Column position at which ``Country_code`` is inserted when the column
        does not exist yet.

    Notes
    -----
    If ``Country_code`` is already present (for example because the cell was
    run twice) its values are refreshed in their current position instead of
    ``DataFrame.insert`` raising ``ValueError``.
    """
    codes = df.Country.apply(get_country_code)
    if "Country_code" in df.columns:
        df["Country_code"] = codes
    else:
        df.insert(index, "Country_code", codes)

In [None]:
# Add a `Country_code` column to the LPI table at column position 1.
insert_code(lpi, 1)

In [None]:
# Add a `Country_code` column to the World Bank economic table at column position 1.
insert_code(wb_eco, 1)

In [None]:
# Add a `Country_code` column to the World Bank inflation table at column position 1.
insert_code(wb_inf, 1)

## Add LPI

In [None]:
# Names of the LPI columns that are copied onto `df`.
lpi_col_to_add = ['Customs Score', 'Logistics Competence and Quality Score', 'International Shipments Score']

In [None]:
# Start the LPI feature columns as NaN (float) rather than '': the columns stay
# numeric, and a row that never receives a score is recognised as missing by
# `dropna` instead of carrying an empty string into the model input.
for col in lpi_col_to_add:
    df[col] = np.nan

In [None]:
# Copy the LPI scores onto `df`, one country at a time. `df.Country` is
# compared against `lpi.Country_code`, and the first matching LPI row is used.
for country_code in df.Country.unique():
    lpi_country = lpi.loc[lpi.Country_code == country_code]
    country_rows = df.Country == country_code
    if lpi_country.empty:
        # No LPI record for this code: mark the scores as missing (so `dropna`
        # can remove the rows) instead of failing on `.iloc[0]` with IndexError.
        df.loc[country_rows, lpi_col_to_add] = np.nan
        continue
    for col in lpi_col_to_add:
        df.loc[country_rows, col] = lpi_country[col].iloc[0]

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep complete rows only; `df` itself is filtered because the target is read
# from it afterwards and must stay aligned with the features.
df.dropna(inplace=True)
# Features: every column from the third one on, minus the target. The drop is
# done on a new frame rather than in place on a slice of `df`, which avoids
# pandas' chained-assignment (SettingWithCopy) ambiguity.
X = df[df.columns[2:]].drop(columns=['Month 4'])

In [None]:
# Regression target: the `Month 4` column.
y = df['Month 4']

In [None]:
# Fixed seed so the train/test partition (and every score derived from it) is
# the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Sanity check: sizes of the two feature splits.
print(f"X_train shape -> {X_train.shape}, X_test shape -> {X_test.shape}")

## Target Encoding

In [None]:
from sklearn.preprocessing import TargetEncoder

In [None]:
# `fit_transform` cross-fits on shuffled folds; seeding the encoder makes the
# encoded training features reproducible across runs.
enc = TargetEncoder(target_type='continuous', random_state=42)

In [None]:
# Position of 'Month 1' in the feature columns: columns before it are
# target-encoded, columns from it onwards are passed through unchanged.
index_fst_not_encoded = X.columns.tolist().index('Month 1')

In [None]:
# Fit the target encoder on the training columns that precede 'Month 1', then
# rebuild the training matrix as [encoded columns | untouched columns].
train_cols_to_encode = X_train.columns[:index_fst_not_encoded]
train_cols_passthrough = X_train.columns[index_fst_not_encoded:]
X_train_not_encoded = X_train[train_cols_passthrough]
train_encoded = enc.fit_transform(X_train[train_cols_to_encode], y_train)
X_train = np.hstack((train_encoded, X_train_not_encoded))

In [None]:
X_test_not_encoded = X_test[X_test.columns[index_fst_not_encoded:]]
# Only `transform` here: the encoder was fitted on the training split.
# Re-fitting it on the test split with `y_test` would leak the test targets
# into the test features and make the evaluation scores meaningless.
X_test = enc.transform(X_test[X_test.columns[:index_fst_not_encoded]])
X_test = np.hstack((X_test, X_test_not_encoded))

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Fixed seed so the fitted forest, and the scores and importances derived from
# it, are reproducible across runs.
clf = RandomForestRegressor(n_estimators=150, verbose=10, n_jobs=-1, random_state=42)
# Alternative estimators, kept commented out:
#clf = HistGradientBoostingRegressor(max_iter=1000, verbose=10, learning_rate=1e-3)
# categorical_mask = np.array([True] * 14 + [False] * 3)
# clf = HistGradientBoostingRegressor(max_iter=10_000, verbose=1, learning_rate=1e-3, categorical_features=categorical_mask, warm_start=True)

In [None]:
# Train the regressor on the encoded training matrix.
clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

def scores(model):
    """Evaluate ``model`` on the module-level ``X_test`` / ``y_test`` split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(r2, mse, pearson)``: the R² score, the mean squared error and the
        Pearson correlation coefficient between ``y_test`` and the predictions.
    """
    predictions = model.predict(X_test)
    pearson_r, _p_value = pearsonr(y_test, predictions)
    return (
        r2_score(y_test, predictions),
        mean_squared_error(y_test, predictions),
        pearson_r,
    )

In [None]:
# (R², MSE, Pearson r) of the fitted model on the test split.
scores(clf)

In [None]:
# Forest-level feature importances, plus their standard deviation across the
# individual trees (plotted as error bars).
per_tree_importances = np.array([tree.feature_importances_ for tree in clf.estimators_])
importances = clf.feature_importances_
std = per_tree_importances.std(axis=0)

In [None]:
# Label each importance with its feature name and draw a bar chart with the
# across-tree standard deviation as error bars.
forest_importances = pd.Series(importances, index=X.columns)

fig, ax = plt.subplots(figsize=(12, 8))
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set(
    title="Feature importances using MDI",
    ylabel="Mean decrease in impurity",
)
plt.grid('on')
fig.tight_layout()