In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

### Load datasets

In [None]:
# Load the training table. Later cells read its Date and Country columns, the
# feature columns listed in `features_to_keep`, and the 'Month 4' target.
df = pd.read_csv("datasets/train_prev_months.csv")
# Bare expression: the frame is rendered as this cell's output.
df

## Extra Datasets

In [None]:
# Read the GSCPI table and turn its 'Year-Month' key into two integer columns
# (Year, Month) so it can be filtered by period later on.
gspci = pd.read_csv("datasets/extra-dataset/GSCPI_data.csv", sep=",")
year_month_parts = gspci['Year-Month'].apply(lambda value: value.split('-'))
gspci['Year'] = year_month_parts.apply(lambda parts: parts[0]).astype(int)
gspci['Month'] = year_month_parts.apply(lambda parts: parts[1]).astype(int)
gspci = gspci.drop(columns=['Year-Month'])
gspci.head(3)

In [None]:
# Read the LPI table, discard the two identifier columns, and shorten the
# 'Taiwan, China' label to 'Taiwan'.
lpi = pd.read_csv("datasets/extra-dataset/LPIextend.csv")
lpi = lpi.drop(columns=["Unnamed: 0", "ID"])
is_taiwan_row = lpi['Country'] == 'Taiwan, China'
lpi.loc[is_taiwan_row, 'Country'] = 'Taiwan'
lpi.head(3)

In [None]:
# Load the World Bank economic table; a Country_code column is added to it
# further down via insert_code().
wb_eco = pd.read_csv("datasets/extra-dataset/worldbank_economic_data.csv")
# Preview the first three rows.
wb_eco.head(3)

In [None]:
# Load the World Bank inflation table and split its 'Year-Month' key into
# Year / Month columns.
wb_inf = pd.read_csv("datasets/extra-dataset/worldbank_inflation_data.csv")
# Cast to int so Year/Month have the same dtype as the matching columns of
# `gspci`; left as strings they would never compare equal to integer years/months.
wb_inf['Year'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[0]).astype(int)
wb_inf['Month'] = wb_inf['Year-Month'].apply(lambda x: x.split('-')[1]).astype(int)
wb_inf.drop(columns=['Year-Month'], inplace=True)
wb_inf.head(3)

## Add features from the extra datasets

In [None]:
def get_mean_gspci(date):
    """Return the mean GSCPI over the months covered by a period label.

    Parameters
    ----------
    date : str
        Label of the form "<period> <year>", e.g. "may-aug 2021". Recognised
        periods are 'jan-apr', 'may-aug', 'may-jul' and 'sep-dec'.

    Returns
    -------
    float
        Mean of the `GSCPI` column of the module-level `gspci` frame for the
        matching year and months. An unrecognised period selects no rows, so
        the result is NaN.
    """
    period_months = {
        'jan-apr': [1, 2, 3, 4],
        'may-aug': [5, 6, 7, 8],
        'may-jul': [5, 6, 7],
        'sep-dec': [9, 10, 11, 12],
    }

    months, year = date.split(" ")
    year = int(year)
    gscpi_months = period_months.get(months, [])

    in_period = (gspci.Year == year) & (gspci.Month.isin(gscpi_months))
    return gspci.loc[in_period].GSCPI.mean()

In [None]:
# Float placeholder: the column is filled with GSCPI means (floats) in the next
# cell, and writing floats into an integer column is deprecated/rejected by
# newer pandas versions.
df['gscpi'] = 0.0

In [None]:
# Every row sharing a period label receives that period's mean GSCPI.
for period_label in df.Date.unique():
    rows_in_period = df.Date == period_label
    df.loc[rows_in_period, 'gscpi'] = get_mean_gspci(period_label)

### Add country codes to extra datasets

In [None]:
# Lookup table with (at least) the columns Name and Code, used by
# get_country_code() to translate country names into codes.
country_codes = pd.read_csv('datasets/country_codes.txt', sep=',')
# NOTE(review): presumably needed because read_csv's default NA handling parses
# the literal string "NA" as a missing value, losing Namibia's code — confirm.
country_codes.loc[country_codes.Name == 'Namibia', 'Code'] = 'NA'
# NOTE(review): 'ID' is rewritten to 'Id' — presumably to match how that code is
# spelled in df.Country; verify against the training data.
country_codes.loc[country_codes.Code == 'ID', 'Code'] = 'Id'
country_codes.head(3)

In [None]:
def get_country_code(country):
    """Look up the code of the first `country_codes` row whose Name contains `country`.

    Parameters
    ----------
    country : str
        Country name (or a fragment of it) to search for in `country_codes.Name`.

    Returns
    -------
    str
        The `Code` of the first matching row, or "NoCode" when nothing matches
        or `country` is not a string (e.g. a missing value).
    """
    if not isinstance(country, str):
        return "NoCode"
    # regex=False: names such as "Congo (Kinshasa)" or "Korea, Rep." contain
    # regex metacharacters and must be matched literally.
    # na=False: rows with a missing Name count as "no match" instead of putting
    # NaN into the boolean mask.
    name_matches = country_codes.Name.str.contains(country, regex=False, na=False)
    df_country = country_codes.loc[name_matches]
    if df_country.empty:
        return "NoCode"
    return df_country.Code.iloc[0]

def insert_code(df, index):
    """Add a "Country_code" column to `df` at column position `index`.

    The codes are obtained by passing every value of `df.Country` through
    get_country_code(). `df` is modified in place; nothing is returned.
    """
    codes = df.Country.apply(get_country_code)
    df.insert(index, "Country_code", codes)

In [None]:
# Add Country_code as the second column of `lpi` (in place). Not idempotent:
# DataFrame.insert raises if the column already exists, so run this cell once.
insert_code(lpi, 1)

In [None]:
# Add Country_code as the second column of `wb_eco` (in place; run once).
insert_code(wb_eco, 1)

In [None]:
# Add Country_code as the second column of `wb_inf` (in place; run once).
insert_code(wb_inf, 1)

## Add LPI

In [None]:
# LPI columns that are copied onto the training frame, one value per country.
lpi_col_to_add = ['Customs Score', 'Logistics Competence and Quality Score', 'International Shipments Score']

In [None]:
# Create the LPI columns with a NaN placeholder. An empty-string placeholder
# would make these score columns object dtype and hide which countries never
# received a value.
for col in lpi_col_to_add:
    df[col] = np.nan

In [None]:
# Copy the LPI scores of each country onto its rows in `df`.
# df.Country is compared against lpi.Country_code, i.e. df.Country holds codes.
for country_code in df.Country.unique():
    lpi_country = lpi.loc[lpi.Country_code == country_code]
    if lpi_country.empty:
        # No LPI row for this code: report it and keep the placeholder values.
        # Any other failure (e.g. a missing column) now surfaces instead of
        # being swallowed by a bare `except`.
        print(country_code)
        continue
    for col in lpi_col_to_add:
        # When several LPI rows share a code, the first one is used.
        df.loc[df.Country == country_code, col] = lpi_country[col].iloc[0]

## Split Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Model inputs, in the order the model will see them. The columns before
# 'gscpi' are target-encoded later; 'gscpi' and everything after it are passed
# through unchanged (see index_fst_not_encoded).
features_to_keep = ['Site', 'Reference proxy', 'Customer Persona proxy', 'Strategic Product Family proxy', 'gscpi', 'id_product', 
                    'Prev.Month 1', 'Prev.Month 2', 'Prev.Month 3', 'Month 1', 'Month 2', 'Month 3']

In [None]:
# Feature matrix: the selected columns, without rows that have a missing value.
# dropna() returns a new frame, so nothing derived from `df` is mutated in
# place and the cell gives the same result when re-run.
X = df[features_to_keep].dropna()
X.shape

In [None]:
# Target restricted to the rows that survived X's dropna(), so X and y have the
# same length and row order; otherwise train_test_split fails (or pairs rows
# with the wrong targets) as soon as any feature row was dropped.
y = df.loc[X.index, 'Month 4']

In [None]:
# Fixed seed so that the train/test partition (and every score derived from it)
# is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Sanity check: report the sizes of the two partitions.
print(f"X_train shape -> {X_train.shape}, X_test shape -> {X_test.shape}")

## Target Encoding

In [None]:
from sklearn.preprocessing import TargetEncoder

In [None]:
# Target encoder configured for a continuous (regression) target. It is applied
# to the feature columns that precede 'gscpi'.
enc = TargetEncoder(target_type='continuous')

In [None]:
# Position of 'gscpi' in X: columns before this index are target-encoded,
# columns from this index onwards are kept as they are.
index_fst_not_encoded = list(X.columns).index('gscpi')

In [None]:
# Fit the target encoder on the leading training columns and glue the encoded
# block to the untouched remaining columns. X_train becomes a NumPy array here,
# so this cell cannot be re-run without re-creating the split.
encoded_cols = X_train.columns[:index_fst_not_encoded]
passthrough_cols = X_train.columns[index_fst_not_encoded:]
X_train_not_encoded = X_train[passthrough_cols]
X_train_encoded = enc.fit_transform(X_train[encoded_cols], y_train)
X_train = np.hstack((X_train_encoded, X_train_not_encoded))

In [None]:
# Encode the held-out rows with the encoder already fitted on the training data.
# Calling fit_transform here would refit the encoder on y_test (target leakage),
# and the submission data encoded later with `enc.transform` would then use
# statistics learned from the held-out split instead of the training split.
X_test_not_encoded = X_test[X_test.columns[index_fst_not_encoded:]]
X_test = enc.transform(X_test[X_test.columns[:index_fst_not_encoded]])
X_test = np.hstack((X_test, X_test_not_encoded))

## MLP

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Reshape, InputLayer, BatchNormalization
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# One feature vector per sample: drop the batch dimension.
input_shape = X_train.shape[1:]

# Fully connected regressor: three ReLU layers of decreasing width, each with an
# L1 weight penalty and followed by dropout, then a single linear output unit.
# The InputLayer alone fixes the input shape, so the first Dense layer no longer
# repeats `input_shape`.
model = Sequential([
    InputLayer(input_shape=input_shape),

    Dense(4096, activation='relu', kernel_regularizer=regularizers.l1(0.1)),
    Dropout(0.5),

    Dense(512, activation='relu', kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.4),

    Dense(64, activation='relu', kernel_regularizer=regularizers.l1(0.01)),
    Dropout(0.3),

    Dense(1, activation='linear'),
])

In [None]:
# Print the layer-by-layer overview of the network defined above.
model.summary()

In [None]:
from tensorflow.keras import backend as K

# The model is defined above; it is compiled in a later cell using the metrics and loss defined below.

# Define a custom Pearson correlation coefficient metric
def pearson_correlation(y_true, y_pred):
    """Pearson correlation coefficient between targets and predictions.

    Both inputs are centred on their own mean; the result is the sum of the
    products of the centred values divided by the square root of the product of
    their sums of squares.

    K.epsilon() is added to the denominator (as r2_score does) so that a batch
    with constant targets or constant predictions yields 0 instead of NaN.
    """
    x = y_true - K.mean(y_true)
    y = y_pred - K.mean(y_pred)
    pearson_num = K.sum(x * y)
    pearson_den = K.sqrt(K.sum(K.square(x)) * K.sum(K.square(y)))
    pearson_corr = pearson_num / (pearson_den + K.epsilon())
    return pearson_corr

def r2_score(y_true, y_pred):
    """Coefficient of determination: 1 - SS_res / (SS_tot + epsilon).

    SS_res is the sum of squared differences between targets and predictions;
    SS_tot is the sum of squared deviations of the targets from their mean.
    K.epsilon() keeps the division defined when the targets are constant.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    unexplained_ratio = residual_ss / (total_ss + K.epsilon())
    return 1 - unexplained_ratio

def rmse_loss(y_true, y_pred):
    """Root-mean-squared error between targets (cast to float32) and predictions."""
    error = tf.cast(y_true, tf.float32) - y_pred
    mean_squared_error = tf.reduce_mean(tf.square(error))
    return tf.sqrt(mean_squared_error)

In [None]:
# Compile with the custom RMSE loss and Adam at a learning rate of 1e-5; MSE,
# R^2 and Pearson correlation are tracked as additional metrics.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=rmse_loss,
    metrics=['mse', r2_score, pearson_correlation],
)

In [None]:
# Execute the companion notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines the PlotLearning callback used
# in the next cell (it is not defined anywhere in this notebook) — confirm.
%run PlotEpochs.ipynb

In [None]:
# Training callbacks: stop once val_loss has not improved for 8 epochs, keep
# only the weights with the lowest val_loss on disk, and plot progress.
filepath = 'model.h5'

checkpoint = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

es = EarlyStopping(monitor='val_loss', patience=8, verbose=1)

callbacks = [es, checkpoint, PlotLearning()]

In [None]:
# Train for at most 100 epochs in batches of 512, holding out 20% of the
# training data for validation; the callbacks handle early stopping,
# checkpointing and plotting.
model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=100,
    validation_split=0.2,
    callbacks=callbacks,
)

## Predict on the submission test set

In [None]:
# Reload the checkpointed model; the custom loss and metrics have to be passed
# by name so the saved model can be rebuilt.
model = tf.keras.models.load_model(
    'model.h5',
    custom_objects={fn.__name__: fn for fn in (pearson_correlation, r2_score, rmse_loss)},
)

In [None]:
# Load the table to predict on; it goes through the same gscpi enrichment and
# feature selection as the training frame in the cells below.
test = pd.read_csv("datasets/test_prev_months.csv")

In [None]:
# Float placeholder (not int): the column receives GSCPI means, and writing
# floats into an integer column is deprecated/rejected by newer pandas versions.
test['gscpi'] = 0.0
# Every row sharing a period label receives that period's mean GSCPI.
for date in test.Date.unique():
    test.loc[test.Date == date, 'gscpi'] = get_mean_gspci(date)

In [None]:
# Same list (same order) as the one used to build the training matrix X; it is
# repeated here so the prediction section can be read on its own. Keep the two
# definitions in sync.
features_to_keep = ['Site', 'Reference proxy', 'Customer Persona proxy', 'Strategic Product Family proxy', 'gscpi', 'id_product', 
                    'Prev.Month 1', 'Prev.Month 2', 'Prev.Month 3', 'Month 1', 'Month 2', 'Month 3']

In [None]:
# Feature matrix for the submission rows. Unlike the training matrix, no rows
# are dropped here, so there is one prediction per row of `test`.
test_X = test[features_to_keep]
test_X.shape

In [None]:
# Position of 'gscpi' in test_X: columns before it are target-encoded, the rest
# are passed through unchanged.
index_fst_not_encoded = list(test_X.columns).index('gscpi')
index_fst_not_encoded

In [None]:
# Apply the already-fitted encoder to the leading columns and append the
# remaining columns unchanged; test_X becomes a NumPy array.
columns_to_encode = test_X.iloc[:, :index_fst_not_encoded]
test_not_encoded = test_X.iloc[:, index_fst_not_encoded:]
test_X = np.hstack((enc.transform(columns_to_encode), test_not_encoded))

In [None]:
# Predict and flatten the model output to a 1-D array (one value per row).
pred = model.predict(test_X).flatten()

In [None]:
# Pair each prediction with the row identifier read from X_test_working.csv.
submission_index = pd.read_csv("datasets/X_test_working.csv")['index'].values
real_test_results = pd.DataFrame({'index': submission_index, 'Month 4': pred})

In [None]:
# Display the result table (columns 'index' and 'Month 4') before writing it out.
real_test_results

In [None]:
# Write the predictions as a semicolon-separated CSV without the DataFrame index.
real_test_results.to_csv("datasets/real_test_results_submit3.csv", index=False, sep=";")