In [32]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost import 
from catboost import CatBoostRegressor

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from dateutil.relativedelta import relativedelta

## For country encoding
from dataprep.clean import clean_country

In [2]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()

In [3]:
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()

In [4]:
month_lib = {
    "jan": 0,
    "feb": 1,    
    "mar": 2,
    "apr": 3,    
    "may": 4,
    "jun": 5,    
    "jul": 6,
    "aug": 7,    
    "sep": 8,
    "oct": 9,    
    "nov": 10,
    "dec": 11
}

In [5]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")
df_econ = pd.read_csv("datasets_hi4/extra-dataset/worldbank_economic_data.csv")
df_inf = pd.read_csv("datasets_hi4/extra-dataset/worldbank_inflation_data.csv")

In [6]:
date = df["Date"].str.split(n=1, expand=True)
df["Year"] = date[1]
df["Trisem"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
df["Trisem"] = df["Trisem"].astype(str)

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
for col in ["Month 1", "Month 2", "Month 3", "Month 4"]:
    df[col] = pd.to_numeric(df[col].str.replace(" ", ""))

In [9]:
df_GSCPI["Year"] = df_GSCPI["Year-Month"].str.slice(stop=4)
df_GSCPI["Trisem"] = pd.to_numeric(df_GSCPI["Year-Month"].str.slice(start=5))//3
df_GSCPI["Trisem"] = df_GSCPI["Trisem"].astype(str)
df_GSCPI.head()

Unnamed: 0,Year-Month,GSCPI,Year,Trisem
0,2020-01,0.090108,2020,0
1,2020-02,1.191976,2020,0
2,2020-03,2.546002,2020,1
3,2020-04,3.161703,2020,1
4,2020-05,2.53711,2020,1


In [10]:
df_GSCPI = df_GSCPI.groupby(["Year", "Trisem"], as_index=False)["GSCPI"].mean()

In [11]:
mdf_c = pd.merge(df, df_GSCPI, on=["Year", "Trisem"])

In [12]:
cols = []
for col in df_LPI.columns:
    if "Score" in col:
            cols.append(col)
            
df_LPI["LogPerf"] = df_LPI[cols].mean(axis=1)
df_LPI = clean_country(df_LPI, "Country", output_format="alpha-2", inplace=True)
df_LPI = df_LPI[["Country_clean", "LogPerf"]].rename(columns={"Country_clean": "Country"}) 



  0%|                                                     | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	248 values cleaned (99.2%)
	2 values unable to be parsed (0.8%), set to NaN
Result contains 248 (99.2%) values in the correct format and 2 null values (0.8%)


In [13]:
mdf = pd.merge(mdf_c, df_LPI, on = "Country")

In [15]:
inf_years = np.array(list(map(lambda x: x.split('-')[0], df_inf["Year-Month"].to_list())))
inf_months = np.array(list(map(lambda x: int(x.split('-')[1]), df_inf["Year-Month"].to_list())))

df_inf["Year"] = inf_years
df_inf["Trisem"] = list(map(str, (inf_months - 1)//4))

df_inf =  clean_country(df_inf, "Country", output_format="alpha-2").dropna(subset = "Country")
df_inf.drop(columns = ['Country', 'Year-Month'], inplace = True)
df_inf.rename(columns = {"Country_clean": "Country"}, inplace = True)

dfi_grouped = df_inf.groupby(["Year", "Trisem", "Country"])["Energy Price Index", "Headline Consumer Price Index"].mean()



  0%|                                                     | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	8041 values cleaned (98.94%)
	86 values unable to be parsed (1.06%), set to NaN
Result contains 8041 (98.94%) values in the correct format and 86 null values (1.06%)


  dfi_grouped = df_inf.groupby(["Year", "Trisem", "Country"])["Energy Price Index", "Headline Consumer Price Index"].mean()


In [17]:
mdf2 = pd.merge(mdf, dfi_grouped, how = "left", on = ["Year", "Trisem", "Country"])

In [18]:
X_test = pd.read_csv("datasets_hi4/X_test.csv", sep=";")
y_test = pd.read_csv("datasets_hi4/y_test_example.csv", sep=";")

In [24]:
date = X_test["Date"].str.split(n=1, expand=True)
X_test["Year"] = date[1]
X_test["Trisem"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
X_test["Trisem"] = X_test["Trisem"].astype(str)

In [25]:
mdf_test = pd.merge(X_test, df_GSCPI, on=["Year", "Trisem"])
mdff_test = pd.merge(mdf_test, df_LPI, on = "Country")
mX_test = pd.merge(mdff_test, dfi_grouped, how = "left", on = ["Year", "Trisem", "Country"])

In [26]:
X_train = mdf2.drop(columns = ["Month 4"])
y_train = mdf2["Month 4"]

In [43]:
#model_cbr = CatBoostRegressor()

In [45]:
#model_cbr.fit(X_train, y_train)

In [None]:
#model_xgb = xgb.train(dtrain, params = {})