In [2]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()
import hist, mplhep

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from time import time
from dateutil.relativedelta import relativedelta

##
from dataprep.clean import clean_country

In [3]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()

In [4]:
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()

In [22]:
month_lib = {
    "jan": 0,
    "feb": 1,    
    "mar": 2,
    "apr": 3,    
    "may": 4,
    "jun": 5,    
    "jul": 6,
    "aug": 7,    
    "sep": 8,
    "oct": 9,    
    "nov": 10,
    "dec": 11
}

In [33]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")

In [29]:
date = df["Date"].str.split(n=1, expand=True)
df["Year"] = date[1]
df["Quarter"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
df["Quarter"] = df["Quarter"].astype(str)
df.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,...,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Date,Month 1,Month 2,Month 3,Month 4,Year,Quarter
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,...,Customer Segmentation-3,Strategic Product Family-12,,may-aug 2021,0,0,0,0,2021,1
1,469488,224631,CHINA,CN,CN_DC_Shanghai,China Operations,China & HK,China,reference-12778,Product Line-4,...,Customer Segmentation-14,Strategic Product Family-9,,jan-apr 2021,0,1,0,0,2021,0
2,348904,104047,EUROPE,GB,NL_DC_HLD,Europe Operations,UK and Ireland,United Kingdom,reference-3513,Product Line-3,...,Customer Segmentation-23,Strategic Product Family-7,,jan-apr 2021,0,2,0,0,2021,0
3,1725822,11823,EAJP,AU,AU_DC_Perth,International Operations,Pacific,Australia,reference-672,Product Line-1,...,Customer Segmentation-11,Strategic Product Family-3,,jan-apr 2023,0,0,0,0,2023,0
4,404781,159924,EUROPE,ES,NL_DC_HLD,Europe Operations,Iberia,Spain,reference-3496,Product Line-3,...,Customer Segmentation-14,Strategic Product Family-7,,jan-apr 2021,0,0,0,0,2021,0


In [34]:
df_GSCPI["Year"] = df_GSCPI["Year-Month"].str.slice(stop=4)
df_GSCPI["Quarter"] = pd.to_numeric(df_GSCPI["Year-Month"].str.slice(start=5))//3
df_GSCPI["Quarter"] = df_GSCPI["Quarter"].astype(str)
df_GSCPI.head()

Unnamed: 0,Year-Month,GSCPI,Year,Quarter
0,2020-01,0.090108,2020,0
1,2020-02,1.191976,2020,0
2,2020-03,2.546002,2020,0
3,2020-04,3.161703,2020,1
4,2020-05,2.53711,2020,1


In [35]:
df_GSCPI["Ave"] = 0
for y in df["Year"]:
    for q in df["Quarter"]:
        df[]["Ave"] += df[]["GSCPI"]/4

KeyError: 'Year'

In [11]:
cols = []
for col in df_LPI.columns:
    if "Score" in col:
            cols.append(col)
            
df_LPI["LogPerf"] = df_LPI[cols].mean(axis=1)
df_LPI = clean_country(df_LPI, "Country", output_format="alpha-2", inplace=True)
df_LPI = df_LPI[["Country_clean", "LogPerf"]].rename(columns={"Country_clean": "Country"}) 

In [17]:
mdf = pd.merge(df, df_LPI, on = "Country")

In [18]:
mdf.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,Division proxy,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Date,Month 1,Month 2,Month 3,Month 4,LogPerf
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,Division-3,Customer Segmentation-3,Strategic Product Family-12,,may-aug 2021,0,0,0,0,4.066667
1,1119813,140385,EUROPE,DE,DE_FO_BNDch,Europe Operations,DACH,Germany,reference-6404,Product Line-4,Division-3,Customer Segmentation-1,Strategic Product Family-12,,jan-apr 2022,0,0,0,0,4.066667
2,384038,139181,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-6321,Product Line-4,Division-3,Customer Segmentation-15,Strategic Product Family-12,,jan-apr 2021,0,0,0,0,4.066667
3,392116,147259,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-11181,Product Line-4,Division-3,Customer Segmentation-4,Strategic Product Family-9,,jan-apr 2021,0,0,0,0,4.066667
4,159613,159613,EUROPE,DE,NL_DC_HLD,Europe Operations,DACH,Germany,reference-10010,Product Line-3,Division-2,Customer Segmentation-33,Strategic Product Family-7,,sep-dec 2020,0,0,0,0,4.066667


In [21]:
mdf.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,Division proxy,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Month 1,Month 2,Month 3,Month 4,LogPerf,Date_in
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,Division-3,Customer Segmentation-3,Strategic Product Family-12,,0,0,0,0,4.066667,2021-05-01
1,1119813,140385,EUROPE,DE,DE_FO_BNDch,Europe Operations,DACH,Germany,reference-6404,Product Line-4,Division-3,Customer Segmentation-1,Strategic Product Family-12,,0,0,0,0,4.066667,2022-01-01
2,384038,139181,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-6321,Product Line-4,Division-3,Customer Segmentation-15,Strategic Product Family-12,,0,0,0,0,4.066667,2021-01-01
3,392116,147259,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-11181,Product Line-4,Division-3,Customer Segmentation-4,Strategic Product Family-9,,0,0,0,0,4.066667,2021-01-01
4,159613,159613,EUROPE,DE,NL_DC_HLD,Europe Operations,DACH,Germany,reference-10010,Product Line-3,Division-2,Customer Segmentation-33,Strategic Product Family-7,,0,0,0,0,4.066667,2020-09-01
