In [80]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()
import hist, mplhep

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from time import time
from dateutil.relativedelta import relativedelta

##
from dataprep.clean import clean_country

In [2]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()

In [3]:
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()

In [4]:
month_lib = {
    "jan": "-01",
    "feb": "-02",    
    "mar": "-03",
    "apr": "-04",    
    "may": "-05",
    "jun": "-06",    
    "jul": "-07",
    "aug": "-08",    
    "sep": "-09",
    "oct": "-10",    
    "nov": "-11",
    "dec": "-12"
}

In [50]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")

In [6]:
df_GSCPI["Date"] = pd.to_datetime(df_GSCPI["Year-Month"] + "-01", yearfirst=True)
df_GSCPI = df_GSCPI.drop(columns = ["Year-Month"])

In [61]:
sum(df_LPI["Country"].duplicated())

0

In [51]:
df_LPI.columns

Index(['Unnamed: 0', 'ID', 'Country', 'population (2023)', 'area',
       'landAreaKm', 'unMember', 'netChange', 'growthRate', 'worldPercentage',
       'density', 'densityMi', 'rank', 'LPI Grouped Rank', 'Customs Score',
       'Customs Grouped Rank', 'Infrastructure Score',
       'Infrastructure Grouped Rank', 'International Shipments Score',
       'International Shipments Grouped Rank',
       'Logistics Competence and Quality Score',
       'Logistics Competence and Quality Grouped Rank', 'Timeliness Score',
       'Timeliness Grouped Rank', 'Tracking and Tracing Score',
       'Tracking and Tracing Grouped Rank'],
      dtype='object')

In [85]:
df_LPI["Country"] = clean_country(df_LPI, "Country", output_format="alpha-2")

#cols = ["Country"]
#for col in df_LPI.columns:
#    if "Score" in col:
#            cols.append(col)
#df[] = df_LPI[cols].groupby(["Country"]).mean(axis=1)
df_LPI["Country"]



  0%|                                                     | 0/8 [00:00<?, ?it/s]

Country Cleaning Report:
	248 values cleaned (99.2%)
	2 values unable to be parsed (0.8%), set to NaN
Result contains 248 (99.2%) values in the correct format and 2 null values (0.8%)


ValueError: Columns must be same length as key

In [39]:
df_LPI.head()

0    4.316667
1    4.166667
2    4.050000
3    4.066667
4    4.033333
Name: LogPerf, dtype: float64

In [18]:
date = df["Date"].str.split(n=1, expand=True)
df["Date_in"] = pd.to_datetime(date[1] + date[0].str.slice(stop=3).replace(month_lib) + "-01")
df["Date_fin"] = df.apply(lambda row: row.Date_in + relativedelta(months=3), axis=1)
df = df.drop(columns = ["Date"])

TypeError: drop() got an unexpected keyword argument 'column'

In [17]:
df.head()

Unnamed: 0,index,id_product,Region,Country,Site,Operations,Zone,Cluster,Reference proxy,Product Line proxy,...,Customer Persona proxy,Strategic Product Family proxy,Product Life cycel status,Date,Month 1,Month 2,Month 3,Month 4,Date_in,Date_fin
0,645874,156160,EUROPE,DE,NL_DC_Venray,Europe Operations,DACH,Germany,reference-13523,Product Line-4,...,Customer Segmentation-3,Strategic Product Family-12,,may-aug 2021,0,0,0,0,2021-05-01,2021-08-01
1,469488,224631,CHINA,CN,CN_DC_Shanghai,China Operations,China & HK,China,reference-12778,Product Line-4,...,Customer Segmentation-14,Strategic Product Family-9,,jan-apr 2021,0,1,0,0,2021-01-01,2021-04-01
2,348904,104047,EUROPE,GB,NL_DC_HLD,Europe Operations,UK and Ireland,United Kingdom,reference-3513,Product Line-3,...,Customer Segmentation-23,Strategic Product Family-7,,jan-apr 2021,0,2,0,0,2021-01-01,2021-04-01
3,1725822,11823,EAJP,AU,AU_DC_Perth,International Operations,Pacific,Australia,reference-672,Product Line-1,...,Customer Segmentation-11,Strategic Product Family-3,,jan-apr 2023,0,0,0,0,2023-01-01,2023-04-01
4,404781,159924,EUROPE,ES,NL_DC_HLD,Europe Operations,Iberia,Spain,reference-3496,Product Line-3,...,Customer Segmentation-14,Strategic Product Family-7,,jan-apr 2021,0,0,0,0,2021-01-01,2021-04-01
