In [3]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [2]:
## Common python packages
import numpy as np
import pandas as pd

## For plotting
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline
import seaborn as sns
sns.set_style("white")
sns.set_context("notebook")
sns.set_color_codes()

## sklearn - ML tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve
from sklearn.utils import resample, shuffle

from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from xgboost import XGBRegressor

## weighted stats
from statsmodels.stats.weightstats import DescrStatsW

## "-" sign for graphs
rcParams['axes.unicode_minus'] = False

## Some extra styling
def namestr(obj, namespace = globals()):
    "Prints the name of a variable"
    return [name for name in namespace if namespace[name] is obj][0]

## For time
from dateutil.relativedelta import relativedelta

## For country encoding
from dataprep.clean import clean_country

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
## Univar Tools
def data_stats(df, cols = None):
    cols = df.columns if cols is None else cols
    return pd.DataFrame({"Mean": df[cols].mean, "Med": df[cols].median(), "STD": df[cols].std, 
                         "Min": df[cols].min(), "Max": df[cols].max()})
              
def hist_plotter(df, cols = None, range_x = None, n_std = 1, size = None, nbin = 100):
    cols = df.select_dtypes(include=np.number).columns if cols is None else cols
    for col in cols:
        range_ = [df[col].min() + n_std * df[col].std(), df[col].max() - n_std * df[col].std()] if range_x is None else range_x
        fig, ax = plt.subplots(1,1)
        df[col].plot(kind = "hist", range = range_, edgecolor = "blue", alpha = 1, bins = nbin, density = 1, ax = ax, figsize = size)
        plt.xlabel(col)
        plt.show()
    
def box_plotter(df, cols = None):
    cols = df.columns if cols is None else cols
    fig, ax = plt.subplots(1, 1)
    df[cols].boxplot(ax=ax)
    plt.xticks(rotation = 90)
    plt.show()
    
## Bivar Tools
def data_corr(df, size = None, cols = None):
    cols = df.columns if cols is None else cols 
    size = (len(cols), len(cols)) if size is None else size
    plt.figure(figsize = size)
    sns.heatmap(df.corr(), cmap = "coolwarm", square = True, vmin = -1, vmax = 1, annot=True)
    plt.show()
        
## For date conversion
month_lib = {
    "jan": 0,
    "feb": 1,    
    "mar": 2,
    "apr": 3,    
    "may": 4,
    "jun": 5,    
    "jul": 6,
    "aug": 7,    
    "sep": 8,
    "oct": 9,    
    "nov": 10,
    "dec": 11
}

In [None]:
df = pd.read_csv("datasets_hi4/train-data.csv", sep=";")

date = df["Date"].str.split(n=1, expand=True)
df["Year"] = date[1]
df["Trisem"] = pd.to_numeric(date[0].str.slice(stop=3).replace(month_lib))//3
df["Trisem"] = df["Trisem"].astype(str)

df.drop_duplicates(inplace=True)

for col in ["Month 1", "Month 2", "Month 3", "Month 4"]:
    df[col] = pd.to_numeric(df[col].str.replace(" ", ""))
       
#df['Month 1'] = df.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))
df["Product Life cycel status"] = df["Product Life cycel status"].fillna("ACT")

In [None]:
df_GSCPI = pd.read_csv("datasets_hi4/extra-dataset/GSCPI_data.csv")

df_GSCPI["Year"] = df_GSCPI["Year-Month"].str.slice(stop=4)
df_GSCPI["Trisem"] = pd.to_numeric(df_GSCPI["Year-Month"].str.slice(start=5))//3
df_GSCPI["Trisem"] = df_GSCPI["Trisem"].astype(str)

df_GSCPI = df_GSCPI.groupby(["Year", "Trisem"], as_index=False)["GSCPI"].mean()

In [None]:
df_LPI = pd.read_csv("datasets_hi4/extra-dataset/LPIextend.csv")

cols = []
for col in df_LPI.columns:
    if "Score" in col:
            cols.append(col)
            
df_LPI["LogPerf"] = df_LPI[cols].mean(axis=1)
df_LPI = df_LPI.replace('TC<rkiye',"Turkey")
df_LPI = clean_country(df_LPI, "Country", output_format="alpha-2", inplace=True)
df_LPI["Country_clean"] = df_LPI["Country_clean"].fillna('NA')

df_LPI = df_LPI[["Country_clean", "LogPerf"]].rename(columns={"Country_clean": "Country"}) 
df_LPI["LogPerf"] = df_LPI["LogPerf"].transform(lambda x: x.fillna(x.mean() - (1/5) * x.std()))

In [None]:
df_inf = pd.read_csv("datasets_hi4/extra-dataset/worldbank_inflation_data.csv")

inf_years = np.array(list(map(lambda x: x.split('-')[0], df_inf["Year-Month"].to_list())))
inf_months = np.array(list(map(lambda x: int(x.split('-')[1]), df_inf["Year-Month"].to_list())))

df_inf["Year"] = inf_years
df_inf["Trisem"] = list(map(str, (inf_months - 1)//4))

df_inf = df_inf.replace('SÃ£o TomÃ© and Principe',"Sao Tome and Principe")
df_inf = clean_country(df_inf, "Country", output_format="alpha-2")
df_inf.drop(columns = ['Country', 'Year-Month'], inplace = True)
df_inf.rename(columns = {"Country_clean": "Country"}, inplace = True)
df_inf["Country"] = df_inf["Country"].fillna('NA')

df_inf["Energy Price Index"] = df_inf["Energy Price Index"].transform(lambda x: x.fillna(x.mean()))
df_inf["Headline Consumer Price Index"] = df_inf["Headline Consumer Price Index"].transform(lambda x: x.fillna(x.mean()))

dfi_grouped = df_inf.groupby(["Year", "Trisem", "Country"])["Energy Price Index", "Headline Consumer Price Index"].mean()

In [None]:
df_epi = pd.read_csv("datasets_hi4/extra-dataset/epi2022results05302022.csv")

df_epi = clean_country(df_epi, "country", output_format="alpha-2", inplace=True)
df_epi = df_epi[["country_clean", "SDA.new", "NXA.new", "CDA.new", "CHA.new", "NDA.new",
                 "BCA.new", "GIB.new", "GHP.new"]]
df_epi = df_epi.rename(columns = {"country_clean": "Country"})
df_epi["EmAv"] = df_epi[["NXA.new", "CDA.new", "SDA.new", "BCA.new", "NDA.new"]].mean(axis=1)
df_epi = df_epi.drop(["NXA.new","CDA.new","SDA.new","BCA.new","NDA.new"],axis=1)

df_epi["Country"] = df_epi["Country"].fillna('NA')

In [None]:
df_econ = pd.read_csv("datasets_hi4/extra-dataset/worldbank_economic_data.csv")

df_econ = df_econ[df_econ["Country"].str.contains("Macao")==False ]

df_econ = df_econ.replace('Turkiye',"Turkey")
df_econ = clean_country(df_econ, "Country", output_format="alpha-2")
df_econ = df_econ.dropna(subset = "Country_clean")
df_econ.drop(columns = ['Country'], inplace = True)
df_econ.rename(columns = {"Country_clean": "Country"}, inplace = True)

df_econ["Year"] = df_econ["Year"].astype(str)

df_econ = df_econ[['Country', "Year", 
                   'Final consumption expenditure (annual % growth)', 
                   'GDP (current US$)', 
                   'Imports of goods and services (annual % growth)']].drop_duplicates()

df_econ["Final consumption expenditure (annual % growth)"] = df_econ["Final consumption expenditure (annual % growth)"].transform(lambda x: x.fillna(x.mean()))
df_econ["Imports of goods and services (annual % growth)"] = df_econ["Imports of goods and services (annual % growth)"].transform(lambda x: x.fillna(x.mean()))
df_econ["GDP (current US$)"] = df_econ["GDP (current US$)"].transform(lambda x: x.fillna(x.mean()))

TW_rows = pd.DataFrame([["TW", str(2020 + i), 
                         df_econ["Final consumption expenditure (annual % growth)"].mean(), 
                         df_econ["GDP (current US$)"].mean(),
                         df_econ["Imports of goods and services (annual % growth)"].mean()] for i in range(4)], 
                       columns = df_econ.columns)

df_econ = df_econ.append(TW_rows, ignore_index = True)

In [None]:
mdf = pd.merge(df, df_GSCPI, on=["Year", "Trisem"])
mdf = pd.merge(mdf, df_LPI, on =["Country"])
mdf = pd.merge(mdf, df_epi, on=["Country"])
mdf = pd.merge(mdf, df_econ, on = ['Country', 'Year'])
mdf_train = pd.merge(mdf, dfi_grouped, how = "left", on = ["Year", "Trisem", "Country"])

In [101]:
mdf_train = pd.read_csv("datasets_hi4/out.csv")

In [102]:
dataX, datay = mdf_train.drop(columns = ["Month 4"]), mdf_train["Month 4"]

X_train, X_test, y_train, y_test = train_test_split(dataX, datay, test_size = .2, random_state=42)
X_train['Month 1'] = X_train.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))
X_test['Month 1'] = X_test.groupby('Strategic Product Family proxy')['Month 1'].transform(lambda x: x.fillna(x.mean()))

# Encoding and Preprocessing

In [103]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.layers import CategoryEncoding, StringLookup

In [104]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

In [105]:
X_train_clone = X_train.copy(deep = True)
X_test_clone = X_test.copy(deep = True)
y_train_clone = y_train.copy(deep = True)
y_test_clone = y_test.copy(deep = True)

In [106]:
irrelevant_features = [ 
                         "Strategic Product Family proxy",
                         "Operations",
#                         "Zone",
#                         "Cluster",
#                         "Reference proxy",
                         "Site",
                         "Country",
#                         "Division proxy",
#                         "Customer Persona proxy",
                         "Date",
                         'Energy Price Index',
                         'Headline Consumer Price Index'
]

X_train.drop(columns = irrelevant_features, inplace = True)
X_test.drop(columns = irrelevant_features, inplace = True)

In [107]:
X_train["Year"] = X_train["Year"].astype(int)
X_train["Trisem"] = X_train["Trisem"].astype(int)

X_test["Year"] = X_test["Year"].astype(int)
X_test["Trisem"] = X_test["Trisem"].astype(int)

In [108]:
categorical_cols_string = ['Region', 'Product Life cycel status', 'Product  Line proxy', 
                          "Zone", "Cluster", "Reference proxy", "Division proxy", "Customer Persona proxy"]
categorical_cols_int = ['index', 'id_product', 'Year', "Trisem"]
numerical_cols = list(set(X_train.columns) - set(categorical_cols_string) - set(categorical_cols_int))

In [109]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
                                                    test_size=0.1, random_state=42)

In [110]:
def df_to_dataset(dataframe_X, dataframe_y, shuffle=True, batch_size=32):
    df_X = dataframe_X.copy()
    df_y = dataframe_y.copy()
    labels = df_y
    df = {key: value[:,tf.newaxis] for key, value in dataframe_X.items()}
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe_X))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

In [111]:
for column in X_test.columns:
    if column in numerical_cols: 
        X_test[column] = np.asarray(X_test[column]).astype('float32')
        X_train[column] = np.asarray(X_train[column]).astype('float32')

In [112]:
def get_normalization_layer(name, dataset):
    # Create a Normalization layer for the feature.
    normalizer = layers.Normalization(axis=None)
  
    # Prepare a Dataset that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])
  
    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
  
    return normalizer

In [113]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices.
    if dtype == 'string':
        index = layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integer values into integer indices.
    else:
        index = layers.IntegerLookup(max_tokens=max_tokens)
  
    # Prepare a `tf.data.Dataset` that only yields the feature.
    feature_ds = dataset.map(lambda x, y: x[name])
  
    # Learn the set of possible values and assign them a fixed integer index.
    index.adapt(feature_ds)
  
    # Encode the integer indices.
    encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())
  
    # Apply multi-hot encoding to the indices. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoder(index(feature))

In [114]:
batch_size = 1024
train_ds = df_to_dataset(X_train, y_train, batch_size=batch_size)
val_ds = df_to_dataset(X_val, y_val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(X_test, y_test, shuffle=False, batch_size=batch_size)

  df = {key: value[:,tf.newaxis] for key, value in dataframe_X.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe_X.items()}
  df = {key: value[:,tf.newaxis] for key, value in dataframe_X.items()}


In [18]:
all_inputs = []
encoded_features = []

# Numerical features.
for header in numerical_cols:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

2023-12-03 09:11:40.615981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_6}}]]
2023-12-03 09:11:40.616420: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_4}}]]
2023-12-03 09:11:56.419294: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype float and shap

In [19]:
for header in categorical_cols_string:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(name=header,
                                                 dataset=train_ds,
                                                 dtype='string',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

2023-12-03 09:15:05.954842: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_8' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_8}}]]
2023-12-03 09:15:05.955260: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_22' with dtype string and shape [1363338,1]
	 [[{{node Placeholder/_22}}]]
2023-12-03 09:15:21.643074: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype float and s

In [20]:
for header in categorical_cols_int:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='int64')
    encoding_layer = get_category_encoding_layer(name=header,
                                                 dataset=train_ds,
                                                 dtype='int64',
                                                 max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

2023-12-03 09:17:25.879070: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_20}}]]
2023-12-03 09:17:25.881377: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_5' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_5}}]]
2023-12-03 09:17:43.890633: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_8' with dtype float and sh

# Neural network

In [115]:
all_features = layers.concatenate(encoded_features)
model = layers.Dense(32, activation='tanh')(all_features)
model = layers.BatchNormalization()(model)
model = layers.Dense(64, activation='tanh')(model)
#model = layers.BatchNormalization()(model)
model = layers.Dense(128, activation='tanh')(model)
#model = layers.BatchNormalization()(model)
model = layers.Dense(64, activation='tanh')(model)
#model = layers.BatchNormalization()(model)
model = layers.Dense(32, activation='tanh')(model)
model = layers.BatchNormalization()(model)
output = layers.Dense(1, activation='relu')(model)

final_model = tf.keras.Model(all_inputs, output)

In [116]:
init_lr = 10**(-2.8)

In [117]:
final_model.compile(optimizer=keras.optimizers.Adam(learning_rate=init_lr),
              loss=tf.keras.losses.MeanSquaredError(reduction="sum_over_batch_size", name="mean_squared_error"),
              metrics=[keras.metrics.RootMeanSquaredError(name="root_mean_squared_error", dtype=None)
])

In [118]:
# Use `rankdir='LR'` to make the graph horizontal.
tf.keras.utils.plot_model(final_model, show_shapes=True, rankdir="LR")

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [119]:
final_model.fit(train_ds, epochs=6, validation_data=val_ds)

Epoch 1/6


2023-12-03 11:08:46.568881: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_0}}]]
2023-12-03 11:08:46.569332: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_5' with dtype float and shape [1363338,1]
	 [[{{node Placeholder/_5}}]]




2023-12-03 11:09:05.821092: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [151482,1]
	 [[{{node Placeholder/_6}}]]
2023-12-03 11:09:05.821936: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_6' with dtype double and shape [151482,1]
	 [[{{node Placeholder/_6}}]]


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f0e524a6b50>

In [120]:
loss, accuracy = final_model.evaluate(test_ds)
print("Accuracy", accuracy)

2023-12-03 11:10:47.748398: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_2' with dtype string and shape [378706,1]
	 [[{{node Placeholder/_2}}]]
2023-12-03 11:10:47.748826: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_13' with dtype float and shape [378706,1]
	 [[{{node Placeholder/_13}}]]


Accuracy 713.2530517578125


In [81]:
columns = numerical_cols + categorical_cols_string + categorical_cols_int

sample_key = {}
for key in columns:
    print(key)
    sample_key[key] = tf.convert_to_tensor(X_train[key].values.tolist())


Month 1
Month 2
LogPerf
GHP.new
GDP (current US$)
GSCPI
GIB.new
Unnamed: 0
Month 3
Imports of goods and services (annual % growth)
CHA.new
EmAv
Final consumption expenditure (annual % growth)
Region
Product Life cycel status
Product  Line proxy
Zone
Cluster
Reference proxy
Division proxy
Customer Persona proxy
index
id_product
Year
Trisem


In [84]:
final_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Region (InputLayer)            [(None, 1)]          0           []                               
                                                                                                  
 Product Life cycel status (Inp  [(None, 1)]         0           []                               
 utLayer)                                                                                         
                                                                                                  
 Product  Line proxy (InputLaye  [(None, 1)]         0           []                               
 r)                                                                                               
                                                                                            

In [83]:
final_model.predict(sample_key)

 3288/42605 [=>............................] - ETA: 32:48

KeyboardInterrupt: 

In [60]:
for i in range(len(X_train)):
    sample = X_train[numerical_cols + categorical_cols_string + categorical_cols_int].iloc[1].to_dict()
    input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
    final_predictions.append(final_model.predict(input_dict))



KeyboardInterrupt: 

In [61]:
len(X_train)

1363338

##### 