In [2]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()
import hist, mplhep

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from time import time
from dateutil.relativedelta import relativedelta

##
from dataprep.clean import clean_country

In [3]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()

In [4]:
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()

In [5]:
month_lib = {
    "jan": "-01",
    "feb": "-02",    
    "mar": "-03",
    "apr": "-04",    
    "may": "-05",
    "jun": "-06",    
    "jul": "-07",
    "aug": "-08",    
    "sep": "-09",
    "oct": "-10",    
    "nov": "-11",
    "dec": "-12"
}

In [6]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")

In [9]:
df_GSCPI["Date"] = pd.to_datetime(df_GSCPI["Year-Month"] + "-01", yearfirst=True)
df_GSCPI = df_GSCPI.drop(columns = ["Year-Month"])

KeyError: 'Year-Month'

In [11]:
cols = []
for col in df_LPI.columns:
    if "Score" in col:
            cols.append(col)
            
df_LPI["LogPerf"] = df_LPI[cols].mean(axis=1)

In [12]:
df_LPI = clean_country(df_LPI, "Country", output_format="alpha-2", inplace=True)



  0%|                                                     | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	248 values cleaned (99.2%)
	2 values unable to be parsed (0.8%), set to NaN
Result contains 248 (99.2%) values in the correct format and 2 null values (0.8%)


In [13]:
df_LPI = df_LPI[["Country_clean", "LogPerf"]].rename(columns={"Country_clean": "Country"}) 

In [17]:
mdf = pd.merge(df, df_LPI, on = "Country")

In [18]:
mdf.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,Division proxy,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Date,Month 1,Month 2,Month 3,Month 4,LogPerf
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,Division-3,Customer Segmentation-3,Strategic Product Family-12,,may-aug 2021,0,0,0,0,4.066667
1,1119813,140385,EUROPE,DE,DE_FO_BNDch,Europe Operations,DACH,Germany,reference-6404,Product Line-4,Division-3,Customer Segmentation-1,Strategic Product Family-12,,jan-apr 2022,0,0,0,0,4.066667
2,384038,139181,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-6321,Product Line-4,Division-3,Customer Segmentation-15,Strategic Product Family-12,,jan-apr 2021,0,0,0,0,4.066667
3,392116,147259,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-11181,Product Line-4,Division-3,Customer Segmentation-4,Strategic Product Family-9,,jan-apr 2021,0,0,0,0,4.066667
4,159613,159613,EUROPE,DE,NL_DC_HLD,Europe Operations,DACH,Germany,reference-10010,Product Line-3,Division-2,Customer Segmentation-33,Strategic Product Family-7,,sep-dec 2020,0,0,0,0,4.066667


In [19]:
date = mdf["Date"].str.split(n=1, expand=True)
mdf["Date_in"] = pd.to_datetime(date[1] + date[0].str.slice(stop=3).replace(month_lib) + "-01")
mdf = mdf.drop(columns = ["Date"])

In [21]:
mdf.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,Division proxy,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Month 1,Month 2,Month 3,Month 4,LogPerf,Date_in
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,Division-3,Customer Segmentation-3,Strategic Product Family-12,,0,0,0,0,4.066667,2021-05-01
1,1119813,140385,EUROPE,DE,DE_FO_BNDch,Europe Operations,DACH,Germany,reference-6404,Product Line-4,Division-3,Customer Segmentation-1,Strategic Product Family-12,,0,0,0,0,4.066667,2022-01-01
2,384038,139181,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-6321,Product Line-4,Division-3,Customer Segmentation-15,Strategic Product Family-12,,0,0,0,0,4.066667,2021-01-01
3,392116,147259,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-11181,Product Line-4,Division-3,Customer Segmentation-4,Strategic Product Family-9,,0,0,0,0,4.066667,2021-01-01
4,159613,159613,EUROPE,DE,NL_DC_HLD,Europe Operations,DACH,Germany,reference-10010,Product Line-3,Division-2,Customer Segmentation-33,Strategic Product Family-7,,0,0,0,0,4.066667,2020-09-01
