In [None]:
!pip install scikit-learn==1.3.0

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Show the scikit-learn version actually loaded by the kernel, so it can be
# compared with the release pinned in the install cell.
import sklearn
sklearn.__version__

### Load datasets

In [None]:
def to_int(x):
    """Convert a raw cell value to an integer, leaving missing values untouched.

    Parameters
    ----------
    x : object
        A missing value, a number, or text made of digits that may contain
        whitespace thousands separators (e.g. ``"1 234"``).

    Returns
    -------
    int or missing value
        ``x`` itself when it is missing or already a Python ``int``; otherwise
        the parsed integer (floats are truncated by ``int``).

    Raises
    ------
    ValueError
        If the text left after removing whitespace is not a valid integer.
    """
    if pd.isna(x):
        return x
    if isinstance(x, int):
        return x
    # NumPy integers and floats have no ``.replace``; convert them directly.
    if isinstance(x, (np.integer, float, np.floating)):
        return int(x)
    # ``str.split()`` without arguments splits on any Unicode whitespace, so
    # non-breaking spaces are removed as well as plain spaces.
    return int("".join(str(x).split()))

In [None]:
# Read the training set, then parse the 'Month 4' column with to_int
# (missing values are passed through unchanged).
df = pd.read_csv("datasets/train.csv")
df['Month 4'] = df['Month 4'].apply(to_int)
df

## Extra Datasets

In [None]:
# Load the GSCPI table and split its 'Year-Month' key on '-' into integer
# 'Year' and 'Month' columns; the original key column is then discarded.
gspci = pd.read_csv("datasets/extra-dataset/GSCPI_data.csv", sep=",")
gspci['Year'] = gspci['Year-Month'].str.split('-').str[0].astype(int)
gspci['Month'] = gspci['Year-Month'].str.split('-').str[1].astype(int)
gspci = gspci.drop(columns=['Year-Month'])
gspci.head(3)

In [None]:
# Load the LPI table, discard the two identifier columns, and rename the
# 'Taiwan, China' entry to plain 'Taiwan'.
lpi = pd.read_csv("datasets/extra-dataset/LPIextend.csv")
lpi = lpi.drop(columns=["Unnamed: 0", "ID"])
lpi['Country'] = lpi['Country'].replace('Taiwan, China', 'Taiwan')
lpi.head(3)

In [None]:
# Load the World Bank economic data file and preview its first rows.
wb_eco = pd.read_csv("datasets/extra-dataset/worldbank_economic_data.csv")
wb_eco.head(3)

In [None]:
# Load the World Bank inflation data file and split its 'Year-Month' key on '-'
# into 'Year' and 'Month' columns.
wb_inf = pd.read_csv("datasets/extra-dataset/worldbank_inflation_data.csv")
# NOTE(review): unlike the GSCPI cell, these two columns are not cast to int,
# so they hold strings — confirm that is intended before joining on them.
wb_inf['Year'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[0])
wb_inf['Month'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[1])
wb_inf.drop(columns=['Year-Month'], inplace=True)
wb_inf.head(3)

### Add country codes to extra datasets

In [None]:
# Load the country name -> code lookup table.
country_codes = pd.read_csv('datasets/country_codes.txt', sep=',')
# Set Namibia's code explicitly.
# NOTE(review): presumably needed because read_csv parses the text "NA" as a
# missing value by default — confirm.
country_codes.loc[country_codes.Name == 'Namibia', 'Code'] = 'NA'
# Rewrite the code 'ID' as 'Id'.
# NOTE(review): the reason is not visible here — TODO confirm which dataset
# uses this spelling.
country_codes.loc[country_codes.Code == 'ID', 'Code'] = 'Id'
country_codes.head(3)

In [None]:
def get_country_code(country):
    """Look up the code of a country name in the ``country_codes`` table.

    An exact name match is preferred; otherwise the first row whose name
    contains ``country`` as a literal substring is used.

    Parameters
    ----------
    country : str
        Country name to look up.

    Returns
    -------
    str
        The ``Code`` of the first matching row, or ``"NoCode"`` when
        ``country`` is not a non-empty string or no row matches.
    """
    if not isinstance(country, str) or not country:
        return "NoCode"
    names = country_codes.Name
    # Exact match first, so a short name is never resolved through an earlier
    # row whose longer name merely contains it.
    df_country = country_codes.loc[names == country]
    if len(df_country) == 0:
        # regex=False: names may contain '(', ')' or '.', which must be matched
        # literally.  na=False: rows without a name never match.
        df_country = country_codes.loc[names.str.contains(country, regex=False, na=False)]
    if len(df_country) == 0:
        return "NoCode"
    return df_country.Code.iloc[0]

def insert_code(df, index):
    """Insert a ``Country_code`` column into ``df`` at column position ``index``.

    The codes are obtained by applying ``get_country_code`` to ``df.Country``.
    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Country`` column.
    index : int
        Column position at which ``Country_code`` is inserted.
    """
    codes = df.Country.apply(get_country_code)
    # DataFrame.insert raises if the column already exists; removing a previous
    # copy first makes re-running the calling cell safe.
    if "Country_code" in df.columns:
        df.drop(columns="Country_code", inplace=True)
    df.insert(index, "Country_code", codes)

In [None]:
# Add 'Country_code' as the second column of the LPI table.
insert_code(lpi, 1)

In [None]:
# Add 'Country_code' as the second column of the economic data table.
insert_code(wb_eco, 1)

In [None]:
# Add 'Country_code' as the second column of the inflation data table.
insert_code(wb_inf, 1)

## Add GSCPI from the extra datasets

In [None]:
def get_mean_gspci(date):
    """Return the mean GSCPI over the months covered by a period label.

    Parameters
    ----------
    date : str
        Text of the form ``"<period> <year>"``, e.g. ``"may-aug 2022"``.  The
        recognised periods are ``may-aug``, ``sep-dec``, ``jan-apr`` and
        ``may-jul``.

    Returns
    -------
    float
        Mean of the ``GSCPI`` column of ``gspci`` over the rows of that year
        whose month belongs to the period.  An unrecognised period selects no
        month, so the mean is taken over an empty selection.
    """
    period_months = {
        'may-aug': [5, 6, 7, 8],
        'sep-dec': [9, 10, 11, 12],
        'jan-apr': [1, 2, 3, 4],
        'may-jul': [5, 6, 7],
    }
    label, year_text = date.split(" ")
    target_year = int(year_text)
    wanted_months = period_months.get(label, [])

    in_period = (gspci.Year == target_year) & (gspci.Month.isin(wanted_months))
    return gspci.loc[in_period].GSCPI.mean()

def add_gscpi_to_df(df):
    """Add a ``gscpi`` column holding the mean GSCPI of each row's ``Date`` period.

    The frame is modified in place; nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Date`` column in the format expected by
        ``get_mean_gspci``.  Rows with a missing ``Date`` get NaN.
    """
    # Compute each distinct period once, then broadcast the values with map().
    # Building the column directly as float avoids filling an integer column
    # with floating-point means.
    mean_by_date = {date: get_mean_gspci(date) for date in df.Date.dropna().unique()}
    df['gscpi'] = df.Date.map(mean_by_date).astype(float)

In [None]:
# Add the 'gscpi' column to the training frame (in place).
add_gscpi_to_df(df)

## Add LPI

In [None]:
def add_lpi_to_df(df):
    """Copy three LPI score columns from the ``lpi`` table onto ``df`` in place.

    Rows are matched by comparing ``df.Country`` with ``lpi.Country_code``; when
    several LPI rows share a code, the first one is used.  Codes without any
    LPI row are printed and their scores are left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Country`` column.  Modified in place.

    Raises
    ------
    KeyError
        If ``lpi`` lacks one of the score columns.
    """
    lpi_col_to_add = ['Customs Score', 'Logistics Competence and Quality Score', 'International Shipments Score']
    # Start from NaN rather than '' so the score columns stay numeric and
    # unmatched countries are reported as missing by isna()/dropna().
    for col in lpi_col_to_add:
        df[col] = np.nan
    for country_code in df.Country.unique():
        lpi_country = lpi.loc[lpi.Country_code == country_code]
        if lpi_country.empty:
            # Only "no LPI row for this code" is tolerated; any other problem
            # is allowed to surface instead of being swallowed.
            print(country_code)
            continue
        first_match = lpi_country.iloc[0]
        rows = df.Country == country_code
        for col in lpi_col_to_add:
            df.loc[rows, col] = first_match[col]

In [None]:
# Add the LPI score columns to the training frame (in place).
add_lpi_to_df(df)

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Input columns of the model.  The order matters: the columns listed before
# 'Month 1' are the ones passed to the target encoder further down.
features_to_keep = [
    'Site',
    'Reference proxy',
    'Customer Persona proxy',
    'Strategic Product Family proxy',
    'Date',
    'Month 1',
    'Month 2',
    'Month 3',
    'gscpi',
]
#features_to_keep = list(df.columns)[2:]

In [None]:
# Select the model inputs and drop incomplete rows in a single chained step,
# so nothing is mutated in place on a selection taken from df.
X = df[features_to_keep].dropna()
X.shape

In [None]:
# Take the target only for the rows kept in X, so X and y have the same length
# and stay row-aligned when they are split together.
y = df.loc[X.index, 'Month 4']
# Rows without a target cannot be used for supervised training.
X = X.loc[y.notna()].drop(columns=['Month 4'], errors='ignore')
y = y.loc[X.index]

In [None]:
# A fixed random_state keeps the train/test partition, and every score computed
# from it, identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Report the size of both splits.
print(f"X_train shape -> {X_train.shape}, X_test shape -> {X_test.shape}")

## Target Encoding

In [None]:
from sklearn.preprocessing import TargetEncoder

In [None]:
# random_state fixes the shuffled cross-fitting folds used by fit_transform,
# so the encoded training features are the same on every run.
enc = TargetEncoder(target_type='continuous', random_state=42)

In [None]:
# Position of 'Month 1' among X's columns: the columns before it are target
# encoded in the following cells, the columns from it onward are kept as they are.
index_fst_not_encoded = list(X.columns).index('Month 1')

In [None]:
# Fit the target encoder on the leading columns of the training split and stack
# the encoded block next to the remaining, untouched columns.
# After this cell X_train is a NumPy array, no longer a DataFrame.
X_train_not_encoded = X_train.iloc[:, index_fst_not_encoded:]
X_train = np.hstack((
    enc.fit_transform(X_train.iloc[:, :index_fst_not_encoded], y_train),
    X_train_not_encoded,
))

In [None]:
# Apply the already fitted encoder to the leading columns of the test split and
# stack the result next to the remaining, untouched columns.
# After this cell X_test is a NumPy array, no longer a DataFrame.
X_test_not_encoded = X_test.iloc[:, index_fst_not_encoded:]
X_test = np.hstack((
    enc.transform(X_test.iloc[:, :index_fst_not_encoded]),
    X_test_not_encoded,
))

## Scores

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

def scores(model, X, y):
    """Evaluate a fitted model on ``(X, y)``.

    Parameters
    ----------
    model : object
        Any object exposing ``predict(X)``.
    X : array-like
        Inputs handed to ``model.predict``.
    y : array-like
        True target values.

    Returns
    -------
    tuple
        ``(mse, rmse, mae, r2, pearson)``, where ``rmse`` is the square root of
        ``mse`` and ``pearson`` is the correlation coefficient returned by
        ``pearsonr`` (its second return value is discarded).
    """
    predictions = model.predict(X)
    r2 = r2_score(y, predictions)
    mse = mean_squared_error(y, predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, predictions)
    pearson, _ = pearsonr(y, predictions)
    return mse, rmse, mae, r2, pearson

def print_scores(model):
    """Print the metrics computed by ``scores`` for the train and test splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``.  Both splits are evaluated before anything is printed.

    Parameters
    ----------
    model : object
        Fitted model passed on to ``scores``.
    """
    tr_mse, tr_rmse, tr_mae, tr_r2, tr_pearson = scores(model, X_train, y_train)
    te_mse, te_rmse, te_mae, te_r2, te_pearson = scores(model, X_test, y_test)
    print(f"Train scores: MSE={tr_mse}, RMSE={tr_rmse}, MAE={tr_mae}, R2={tr_r2}, Pearson={tr_pearson}")
    print(f"Test scores: MSE={te_mse}, RMSE={te_rmse}, MAE={te_mae}, R2={te_r2}, Pearson={te_pearson}")

In [None]:
def estim_score_hfactory(model):
    """Estimate the HFactory score of ``model`` on the notebook-level test split.

    The value is ``(r0 - 0.8 * rmse) / r0``, where ``rmse`` is the model's RMSE
    on ``(X_test, y_test)`` and ``r0`` is the RMSE of predicting 0 everywhere.
    NOTE(review): the origin of the 0.8 factor is not visible here — confirm
    against the scoring rules.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict``.

    Returns
    -------
    float
        The estimated score.
    """
    zero_baseline = [0] * len(y_test)
    r0 = np.sqrt(mean_squared_error(y_test, zero_baseline))
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
    return (r0 - 0.8 * rmse) / r0

## Voting

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor

In [None]:
# Two tree ensembles averaged by a VotingRegressor.  Both get a fixed
# random_state so the fitted ensemble and its scores are reproducible.
# NOTE(review): per the scikit-learn docs, validation_fraction only takes effect
# when n_iter_no_change is set, which it is not here — confirm the intent.
gb_model = GradientBoostingRegressor(n_estimators=200, learning_rate=1e-2, validation_fraction=0.15,
                                     random_state=42)
rf_model = RandomForestRegressor(n_estimators=400, verbose=10, n_jobs=-1,
                           max_depth=10, min_samples_leaf=2, oob_score=True, min_samples_split=0.3,
                           criterion="friedman_mse", random_state=42)

ensemble_model = VotingRegressor([('rf', rf_model), ('gb', gb_model)], n_jobs=-1, verbose=True)

ensemble_model.fit(X_train, y_train)

In [None]:
# Train and test metrics of the voting ensemble.
print_scores(ensemble_model)

In [None]:
# Estimated HFactory score of the voting ensemble on the test split.
estim_score_hfactory(ensemble_model)

In [None]:
# NOTE(review): this repeats the metrics cell shown just before the estimated
# score — likely a leftover that can be removed.
print_scores(ensemble_model)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
# Stand-alone random forest (the model later used for the final predictions).
# A fixed random_state makes the fitted trees, the scores and the importances
# below reproducible.
clf = RandomForestRegressor(n_estimators=250, verbose=2, n_jobs=-1,
                           max_depth=5, max_features='log2', min_samples_split=0.3,
                           criterion="friedman_mse", random_state=42)

In [None]:
# Fit the forest on the encoded training split.
clf.fit(X_train, y_train)

In [None]:
# Train and test metrics of the random forest.
print_scores(clf)

In [None]:
# Estimated HFactory score of the random forest on the test split.
print(f"Estimated HFactory score={estim_score_hfactory(clf)}")

In [None]:
# Feature importances of the fitted forest, and their standard deviation across
# the individual trees (drawn as error bars in the plot cell).
importances = clf.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)

In [None]:
# Bar chart of the forest's importances, labelled with X's column names and
# with the per-tree standard deviation as error bars.
forest_importances = pd.Series(importances, index=X.columns)

fig, ax = plt.subplots(figsize=(12, 8))
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
ax.grid(True)
fig.tight_layout()

## Permutation feature importance

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Permutation importance of each feature for clf, measured on the training
# split with 3 repeats per feature.
# NOTE(review): the result is not used by any later cell visible here, and no
# random_state is passed, so the values change between runs.
feature_importances_permutation = permutation_importance(clf, X_train, y_train, n_repeats=3, n_jobs=-1)

# Predict X_test file

In [None]:
# Load the file for which predictions are to be produced.
real_test = pd.read_csv("datasets/X_test_working.csv")

In [None]:
# Add the 'gscpi' column, as was done for the training frame.
add_gscpi_to_df(real_test)

In [None]:
# Add the LPI score columns, as was done for the training frame.
add_lpi_to_df(real_test)

In [None]:
# Keep only the model's input columns, in the order used for training.
# NOTE(review): unlike the training frame, rows with missing values are not
# dropped here — confirm the file contains none.
real_test = real_test[features_to_keep]
real_test.shape

In [None]:
# Apply the fitted encoder to the leading columns of the prediction file and
# stack the result next to the remaining, untouched columns.
# After this cell real_test is a NumPy array, no longer a DataFrame.
real_test_not_encoded = real_test.iloc[:, index_fst_not_encoded:]
real_test = np.hstack((
    enc.transform(real_test.iloc[:, :index_fst_not_encoded]),
    real_test_not_encoded,
))

In [None]:
# Predict with the stand-alone random forest (clf), not the voting ensemble.
real_pred = clf.predict(real_test)

In [None]:
# Results table: the 'index' column re-read from the prediction file (real_test
# itself has been turned into an array) paired with the 'Month 4' predictions.
real_test_results = pd.DataFrame({
    'index': pd.read_csv("datasets/X_test_working.csv")['index'].values,
    'Month 4': real_pred,
})

In [None]:
# Display the results table.
real_test_results