In [1]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from dateutil.relativedelta import relativedelta

## For country encoding
from dataprep.clean import clean_country

In [2]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()
    
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()
        
## For date conversion
month_lib = {
    "jan": 0,
    "feb": 1,    
    "mar": 2,
    "apr": 3,    
    "may": 4,
    "jun": 5,    
    "jul": 6,
    "aug": 7,    
    "sep": 8,
    "oct": 9,    
    "nov": 10,
    "dec": 11
}

In [3]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")

date = df["Date"].str.split(n=1, expand=True)
df["Year"] = date[1]
df["Trisem"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
df["Trisem"] = df["Trisem"].astype(str)

df.drop_duplicates(inplace=True)

for col in ["Month 1", "Month 2", "Month 3", "Month 4"]:
    df[col] = pd.to_numeric(df[col].str.replace(" ", ""))
       
#df['Month 1'] = df.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))
df["Product Life cycel status"] = df["Product Life cycel status"].fillna("ACT")

In [4]:
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")

df_GSCPI["Year"] = df_GSCPI["Year-Month"].str.slice(stop=4)
df_GSCPI["Trisem"] = pd.to_numeric(df_GSCPI["Year-Month"].str.slice(start=5))//3
df_GSCPI["Trisem"] = df_GSCPI["Trisem"].astype(str)

df_GSCPI = df_GSCPI.groupby(["Year", "Trisem"], as_index=False)["GSCPI"].mean()

In [5]:
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")

cols = []
for col in df_LPI.columns:
    if "Score" in col:
            cols.append(col)
            
df_LPI["LogPerf"] = df_LPI[cols].mean(axis=1)
df_LPI = df_LPI.replace('TC<rkiye',"Turkey")
df_LPI = clean_country(df_LPI, "Country", output_format="alpha-2", inplace=True)
df_LPI["Country_clean"] = df_LPI["Country_clean"].fillna('NA')

df_LPI = df_LPI[["Country_clean", "LogPerf"]].rename(columns={"Country_clean": "Country"}) 
df_LPI["LogPerf"] = df_LPI["LogPerf"].transform(lambda x: x.fillna(x.mean() - (1/5) * x.std()))



  0%|                                                                                            | 0/8 [00:00<…

Country Cleaning Report:
	249 values cleaned (99.6%)
	1 values unable to be parsed (0.4%), set to NaN
Result contains 249 (99.6%) values in the correct format and 1 null values (0.4%)


In [6]:
df_inf = pd.read_csv("datasets_hi4/extra-dataset/worldbank_inflation_data.csv")

inf_years = np.array(list(map(lambda x: x.split('-')[0], df_inf["Year-Month"].to_list())))
inf_months = np.array(list(map(lambda x: int(x.split('-')[1]), df_inf["Year-Month"].to_list())))

df_inf["Year"] = inf_years
df_inf["Trisem"] = list(map(str, (inf_months - 1)//4))

df_inf = df_inf.replace('SÃ£o TomÃ© and Principe',"Sao Tome and Principe")
df_inf = clean_country(df_inf, "Country", output_format="alpha-2")
df_inf.drop(columns = ['Country', 'Year-Month'], inplace = True)
df_inf.rename(columns = {"Country_clean": "Country"}, inplace = True)
df_inf["Country"] = df_inf["Country"].fillna('NA')

df_inf["Energy Price Index"] = df_inf["Energy Price Index"].transform(lambda x: x.fillna(x.mean()))
df_inf["Headline Consumer Price Index"] = df_inf["Headline Consumer Price Index"].transform(lambda x: x.fillna(x.mean()))

dfi_grouped = df_inf.groupby(["Year", "Trisem", "Country"])["Energy Price Index", "Headline Consumer Price Index"].mean()



  0%|                                                                                            | 0/8 [00:00<…

Country Cleaning Report:
	8084 values cleaned (99.47%)
	43 values unable to be parsed (0.53%), set to NaN
Result contains 8084 (99.47%) values in the correct format and 43 null values (0.53%)


  dfi_grouped = df_inf.groupby(["Year", "Trisem", "Country"])["Energy Price Index", "Headline Consumer Price Index"].mean()


In [7]:
df_epi = pd.read_csv("datasets_hi4/extra-dataset/epi2022results05302022.csv")

df_epi = clean_country(df_epi, "country", output_format="alpha-2", inplace=True)
df_epi = df_epi[["country_clean", "SDA.new", "NXA.new", "CDA.new", "CHA.new", "NDA.new",
                 "BCA.new", "GIB.new", "GHP.new"]]
df_epi = df_epi.rename(columns = {"country_clean": "Country"})
df_epi["EmAv"] = df_epi[["NXA.new", "CDA.new", "SDA.new", "BCA.new", "NDA.new"]].mean(axis=1)
df_epi = df_epi.drop(["NXA.new","CDA.new","SDA.new","BCA.new","NDA.new"],axis=1)

df_epi["Country"] = df_epi["Country"].fillna('NA')



  0%|                                                                                            | 0/8 [00:00<…

Country Cleaning Report:
	179 values cleaned (99.44%)
	1 values unable to be parsed (0.56%), set to NaN
Result contains 179 (99.44%) values in the correct format and 1 null values (0.56%)


In [8]:
df_econ = pd.read_csv("datasets_hi4/extra-dataset/worldbank_economic_data.csv")

df_econ = df_econ[df_econ["Country"].str.contains("Macao")==False ]

df_econ = df_econ.replace('Turkiye',"Turkey")
df_econ = clean_country(df_econ, "Country", output_format="alpha-2")
df_econ = df_econ.dropna(subset = "Country_clean")
df_econ.drop(columns = ['Country'], inplace = True)
df_econ.rename(columns = {"Country_clean": "Country"}, inplace = True)

df_econ["Year"] = df_econ["Year"].astype(str)

df_econ = df_econ[['Country', "Year", 
                   'Final consumption expenditure (annual % growth)', 
                   'GDP (current US$)', 
                   'Imports of goods and services (annual % growth)']].drop_duplicates()

df_econ["Final consumption expenditure (annual % growth)"] = df_econ["Final consumption expenditure (annual % growth)"].transform(lambda x: x.fillna(x.mean()))
df_econ["Imports of goods and services (annual % growth)"] = df_econ["Imports of goods and services (annual % growth)"].transform(lambda x: x.fillna(x.mean()))
df_econ["GDP (current US$)"] = df_econ["GDP (current US$)"].transform(lambda x: x.fillna(x.mean()))

TW_rows = pd.DataFrame([["TW", str(2020 + i), 
                         df_econ["Final consumption expenditure (annual % growth)"].mean(), 
                         df_econ["GDP (current US$)"].mean(),
                         df_econ["Imports of goods and services (annual % growth)"].mean()] for i in range(4)], 
                       columns = df_econ.columns)

df_econ = df_econ.append(TW_rows, ignore_index = True)



  0%|                                                                                            | 0/8 [00:00<…

Country Cleaning Report:
	856 values cleaned (80.75%)
	204 values unable to be parsed (19.25%), set to NaN
Result contains 856 (80.75%) values in the correct format and 204 null values (19.25%)


  df_econ = df_econ.append(TW_rows, ignore_index = True)


In [9]:
mdf = pd.merge(df, df_GSCPI, on=["Year", "Trisem"])
mdf = pd.merge(mdf, df_LPI, on =["Country"])
mdf = pd.merge(mdf, df_epi, on=["Country"])
mdf = pd.merge(mdf, df_econ, on = ['Country', 'Year'])
mdf_train = pd.merge(mdf, dfi_grouped, how = "left", on = ["Year", "Trisem", "Country"])

In [10]:
X_test = pd.read_csv("datasets_hi4/X_test.csv", sep=";")
y_test = pd.read_csv("datasets_hi4/y_test_example.csv", sep=";")

In [11]:
for col in ["Month 1", "Month 2", "Month 3"]:
    X_test[col] = pd.to_numeric(X_test[col].str.replace(" ", ""))

date = X_test["Date"].str.split(n=1, expand=True)
X_test["Year"] = date[1]
X_test["Trisem"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
X_test["Trisem"] = X_test["Trisem"].astype(str)

In [12]:
X_test = pd.merge(X_test, df_GSCPI, on=["Year", "Trisem"])
X_test = pd.merge(X_test, df_LPI, on =["Country"])
X_test = pd.merge(X_test, df_epi, on=["Country"])
X_test = pd.merge(X_test, df_econ, on = ['Country', 'Year'])
X_test = pd.merge(X_test, dfi_grouped, how = "left", on = ["Year", "Trisem", "Country"])

In [23]:
dataX, datay = mdf_train.drop(columns = ["Month 4"]), mdf_train["Month 4"]

X_train, X_test, y_train, y_test = train_test_split(dataX, datay, test_size = .2)
X_train['Month 1'] = X_train.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))
X_test['Month 1'] = X_test.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))

In [15]:
#model_cbr = CatBoostRegressor()

In [16]:
#model_cbr.fit(X_train, y_train)

In [17]:
#model_xgb = xgb.train(dtrain, params = {})