<img src="../images/BDG_LOGO.png" alt="drawing" align="right" width="200"/>

# H2020 RIA BigDataGrapes - Predictive Data Analytics (T4.3)

### This pilot is described in the deliverable D4.3 (Pilot 5). 

The specific goal of the price prediction pilot is to develop a software module that predicts the future price of specific goods in the grape and wine supply chain. Starting from past observations of the prices of different agri-food items, we build a machine learning pipeline that lets us experiment with several prediction solutions.



In [None]:
!pip install --user pandas tqdm seaborn sklearn tensorflow keras plotly

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Collecting tensorflow
  Downloading tensorflow-2.1.0-cp27-cp27mu-manylinux2010_x86_64.whl (421.8 MB)
[K     |██████████████▎                 | 188.8 MB 2.3 MB/s eta 0:01:4384

In [None]:
import math
import numpy as np
from random import sample   
import pandas as pd
import datetime
from tqdm import tqdm_notebook
# from tqdm import tqdm

# import warnings
# warnings.filterwarnings('ignore')
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense
from keras.callbacks import EarlyStopping

import plotly.express as px


# for plotting
import matplotlib.pyplot as plt
sns.set(rc={'figure.figsize':(14,8)})
# Adjusting the size of matplotlib

import pickle
import json

## Neural Network Parameters

In [None]:
# fraction of the items reserved for test
test_size = 0.3

# choose a number of time steps
n_steps = 7

# metric
loss_function="mse"

# epochs & batches
epochs = 1500
batch_size = 32
optimizer = 'adam'
neurons = 150


# settings of the experiment
# (created BEFORE any section is filled in, otherwise the assignments below
#  raise a NameError on a fresh kernel or get wiped by the re-creation)
dict_settings = {
    "nn_configurations": {},
    "constraints": {},
    "product_name_id_mapping": {},
}

# neural network configuration stored with the experiment settings
dict_settings['nn_configurations'] = {
    "n_steps":n_steps
    , "loss_function":loss_function
    , "epochs":epochs
    , "batch_size":batch_size
    , "optimizer":optimizer
    , "neurons":neurons
}

# lst of columns to report
lst_columns_to_report = ['model_name', 'mean_squared_error', 'root_mean_squared_error', 'mean_absolute_error', 'epochs', "neurons", "test_size", "n_steps", "max_sequential_nan", "minimum_temporal_points"]
datetime_column_name = "date"


## General Data Constraints
# largest accepted gap (in days) between two consecutive observations
max_sequential_nan = 7
# a product is kept only with more than this number of distinct dates
minimum_temporal_points = 50

dict_settings['constraints'] = {
        "max_sequential_nan": max_sequential_nan,
        "minimum_temporal_points":minimum_temporal_points
} 


### Load the Dataset

We build our datasets starting from the open data published by the governments. The data used are collected from the Hellenic Food Market, the European Commission and the Food and Agriculture Organization of the United Nations. The dataset consists of a collection of daily observations of prices for a variety of products available in different countries.

In [None]:
def fill_foward_missing_dates(df_product):
    """Return a daily version of a product frame with the gaps forward filled.

    The column named by ``datetime_column_name`` is parsed as datetime and
    used as index, the frame is re-sampled to a daily frequency and every
    newly created row receives the values of the last known observation.

    Parameters
    ----------
    df_product : pandas.DataFrame
        Observations of a single product. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with one row per calendar day and the date back as a
        regular column.
    """
    # work on a copy: the caller usually passes a filtered slice of a bigger
    # frame and must not see its date column converted behind its back
    df_product = df_product.copy()

    # reindexing as datetime
    df_product[datetime_column_name] = pd.to_datetime(df_product[datetime_column_name])
    df_product = df_product.set_index(datetime_column_name, drop=True)

    # adding the missing datetime intervals
    df_product = df_product.asfreq('D')

    # forward filling
    df_product = df_product.ffill()

    # reset index: avoid datetime index
    return df_product.reset_index()


def check_product_time_series(df_product):
    """Tell whether the observations of a product are close enough in time.

    Returns True when no two consecutive rows are more than
    ``max_sequential_nan`` days apart (according to the column named by
    ``datetime_column_name``), False otherwise.
    """
    dates = df_product[datetime_column_name]

    for current_date, previous_date in zip(dates, dates.shift(1)):
        # the first row (or a row following a missing date) has no interval
        if pd.isna(previous_date):
            continue
        if (current_date - previous_date).days > max_sequential_nan:
            return False

    return True

def filter_out_products_without_min_temporal(df):
    """Keep only the products observed on enough distinct dates.

    A product survives when the number of distinct ``priceStringDate``
    values it appears with is strictly greater than
    ``minimum_temporal_points``.
    """
    # one row per (product, date) pair, then count the rows of each product
    distinct_observations = df.drop_duplicates(subset=["product", "priceStringDate"])
    points_per_product = distinct_observations.groupby("product").size()

    # names of the products with a good amount of data over time
    kept_products = points_per_product[points_per_product > minimum_temporal_points].index

    return df[df["product"].isin(kept_products)]


def group_price_by_date_and_product(df_data):
    """Aggregate the observations per (date, product) pair.

    Every remaining column is summarised with its mean, min and max. The
    resulting two-level column names are flattened; for names containing
    "price" the two levels are joined with an underscore
    (e.g. ``price_mean``).
    """
    grouped = df_data.groupby(by=[datetime_column_name, "product"])
    df_data = grouped.agg(["mean", "min", "max"]).reset_index()

    # flatten the (column, statistic) tuples into plain strings
    flat_names = []
    for column_levels in df_data.columns.values:
        flat_name = ' '.join(column_levels).strip()
        if "price" in flat_name:
            flat_name = flat_name.replace(' ', '_')
        flat_names.append(flat_name)
    df_data.columns = flat_names

    return df_data

df_temporal_points = None
df_debug = None

def load_data():
    """Load the multi-product price dataset and prepare it for modelling.

    Steps: read ``../datasets/food_dataset.csv``, drop the products without
    enough temporal points, aggregate the prices per (date, product), drop
    the products whose time series has too large gaps and forward fill the
    missing days of the remaining ones.

    Side effect: the aggregated frame (before the gap check) is stored in
    the global ``df_debug``.

    Returns
    -------
    pandas.DataFrame
        The forward-filled observations of the accepted products, in order
        of first appearance of each product. ``reset_index`` keeps the
        previous per-product row position in an ``index`` column.
    """
    global df_temporal_points, df_debug

    # reading the whole dataset (multi-product)
    df = pd.read_csv("../datasets/food_dataset.csv", parse_dates=["priceStringDate"])

    print("all products", len(df['product'].unique()))
    df = filter_out_products_without_min_temporal(df)

    # rename to the default date column
    df = df.rename(columns={"priceStringDate": datetime_column_name})

    # sort the data by time
    df = df.sort_values(by=datetime_column_name, ascending=True)

    # fill missing country information
    # (plain assignment: an in-place fillna on a selected column may operate
    #  on a temporary copy and leave the frame untouched)
    df['country'] = df['country'].fillna('ND')

    # group the data
    df = group_price_by_date_and_product(df)
    df_debug = df

    lst_products = df['product'].unique()
    print("after min points filter out", len(lst_products))

    # check products time series; the accepted ones are collected and
    # concatenated once at the end instead of rebuilding the whole frame
    # for every product
    lst_filled_products = []
    for product_name in lst_products:
        # filter by product
        df_product = df[df['product'] == product_name]

        # products with too large gaps are dropped
        if not check_product_time_series(df_product):
            continue

        # fill forward the missing days
        lst_filled_products.append(fill_foward_missing_dates(df_product))

    if lst_filled_products:
        df = pd.concat(lst_filled_products)
    else:
        # no product passed the check: keep the columns, drop all the rows
        df = df.iloc[0:0]

    print("after missed dates", len(df['product'].unique()))

    # reset the index
    df = df.reset_index()
    # return the dataframe
    return df


df_data = load_data()
df_data.to_csv("../datasets/df_data_forward_agg.csv", index=False)


In [None]:
# display five randomly chosen rows of the prepared dataset
df_data.sample(5)

In [None]:
# give every product a numeric id: 1, 2, 3, ... in order of first appearance
dict_map_name_id = {
    product: product_id
    for product_id, product in enumerate(df_data['product'].unique(), start=1)
}

# keep the mapping together with the other experiment settings
dict_settings['product_name_id_mapping'] = dict_map_name_id
dict_settings

### Auxiliary methods

In [None]:
def split_sequence(sequence, n_steps):
    """Split a univariate sequence into supervised learning samples.

    Every sample is a window of ``n_steps`` consecutive values and its
    target is the value that immediately follows the window.

    Returns a tuple ``(X, y)`` of numpy arrays holding the windows and the
    targets. Both are empty when the sequence is too short for one sample.
    """
    last_index = len(sequence) - 1

    # start positions whose target still falls inside the sequence
    starts = [start for start in range(len(sequence)) if start + n_steps <= last_index]

    windows = [sequence[start:start + n_steps] for start in starts]
    targets = [sequence[start + n_steps] for start in starts]

    return np.array(windows), np.array(targets)


def plot_price_by_country(df_product, product_name=None):
    """Show an interactive line chart of the mean price of a product.

    Parameters
    ----------
    df_product : pandas.DataFrame
        Observations to plot; needs the date column named by
        ``datetime_column_name`` and a ``price_mean`` column.
    product_name : str, optional
        Name shown in the chart title. When omitted it is read from the
        ``product`` column of ``df_product`` (previously the title relied on
        a global ``product_name`` that might be undefined or might belong to
        another product).
    """
    if product_name is None:
        if "product" in df_product.columns:
            product_name = ", ".join(str(name) for name in df_product["product"].unique())
        else:
            product_name = ""

    # setup the plot
    fig = px.line(df_product, x=datetime_column_name, y="price_mean",width=800, height=400).update_traces(mode='lines+markers')
    fig.update_layout(title_text='Product Price by Country: {0}'.format(product_name),
                      xaxis_rangeslider_visible=True)
    # display
    fig.show()
    

### LSTM Model

We employ time series, i.e., sequences of per-product price observations, to train a machine learning system that predicts the future price of a product given a historical time window. Time series prediction is a well-known task that is commonly addressed using neural networks. We employ Long Short-Term Memory (LSTM) networks to address this task. LSTM is a powerful RNN architecture with important applications in time series prediction. 


In [None]:
def create_model():
    """Build the univariate LSTM network and its early-stopping callback.

    The network is one LSTM layer (``neurons`` units, ReLU activation,
    input windows of ``n_steps`` values with a single feature) followed by
    a one-unit dense output layer. It is compiled with the module-level
    ``optimizer`` and ``loss_function``.

    Returns
    -------
    tuple
        ``(model, early_stopping)``: the compiled model and an
        ``EarlyStopping`` callback monitoring ``val_loss`` with a patience
        of 100 epochs.
    """
    n_features = 1

    recurrent_layer = LSTM(neurons,
                           activation='relu',
                           input_shape=(n_steps, n_features),
                           return_sequences=False)
    output_layer = Dense(1)

    network = Sequential([recurrent_layer, output_layer])
    network.compile(optimizer=optimizer, loss=loss_function)

    # patient early stopping
    early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)

    return network, early_stopping


# evaluation helpers: error metrics of a forecast and reporting of the results

def compute_scores(y_train, y_true, y_pred, reshape=True):
    """Compute the error metrics of a forecast and pack them with the data.

    Parameters
    ----------
    y_train, y_true, y_pred
        Training values, expected test values and predicted test values.
        They must offer ``tolist()`` (and ``reshape`` when ``reshape`` is
        True).
    reshape : bool
        When True every series is reshaped to a single row before being
        converted to a list.

    Returns
    -------
    dict
        MSE, RMSE and MAE of ``y_pred`` against ``y_true`` plus the three
        series as lists.
    """
    def to_list(values):
        if reshape:
            return values.reshape(1,-1).tolist()
        return values.tolist()

    mse = mean_squared_error(y_true, y_pred)

    return {
        "mean_squared_error": mse,
        "root_mean_squared_error": math.sqrt(mse),
        "mean_absolute_error": mean_absolute_error(y_true, y_pred),
        "y_train": to_list(y_train),
        "y_true": to_list(y_true),
        "y_pred": to_list(y_pred),
    }


# report metric results
def get_dataframe_results(lst_results):
    """Build the report table from the list of per-product results.

    Parameters
    ----------
    lst_results : list of dict
        One dictionary per evaluated product. The keys of the first one
        define the columns of the report.

    Returns
    -------
    pandas.DataFrame
        One row per result, sorted by ``root_mean_squared_error`` (best
        first). An empty frame is returned for an empty list.

    Raises
    ------
    KeyError
        If the results have no ``root_mean_squared_error`` entry.
    """
    if not lst_results:
        return pd.DataFrame()

    # the first result defines the columns and their order
    columns = list(lst_results[0].keys())
    df_results = pd.DataFrame(lst_results, columns=columns)

    # sort_values returns a new frame: the result must be kept, otherwise
    # the report stays unsorted
    df_results = df_results.sort_values(by=['root_mean_squared_error'])
    return df_results.reset_index(drop=True)



def univariate_lstm_with_test_set(df_data, col_value):
    """Train the LSTM on the first part of a series and score it on the rest.

    The last ``test_size`` part of the series (a fraction when below 1, a
    number of observations otherwise) is held out. The scaler is fitted on
    the remaining observations only, the model is trained only on the
    training windows, and the predictions for the held-out windows are
    compared with the raw values.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Observations of a single product, sorted by time.
    col_value : str
        Name of the column with the values to forecast.

    Returns
    -------
    dict
        The output of ``compute_scores`` (error metrics and the series).

    Raises
    ------
    ValueError
        If the series is too short to give a test set and at least two
        training windows.
    """
    # define input sequence as a column vector
    raw_seq = np.array(df_data[col_value].tolist()).reshape(-1, 1)

    # test size
    if test_size < 1:
        test_seq_len = int(len(raw_seq) * test_size)
    else:
        test_seq_len = int(test_size)

    # a zero test length would turn the negative slices below into
    # "everything"/"nothing"; too few windows cannot be split for validation
    n_train_samples = len(raw_seq) - n_steps - test_seq_len
    if test_seq_len < 1 or n_train_samples < 2:
        raise ValueError(
            "series of {0} observations is too short for n_steps={1} and test_size={2}".format(
                len(raw_seq), n_steps, test_size))

    # scale for the training data
    raw_seq_seen_in_training = raw_seq[:-test_seq_len]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(raw_seq_seen_in_training)

    # transform the whole data (including the test set)
    scaled_seq = scaler.transform(raw_seq)

    # split into samples
    X, y = split_sequence(scaled_seq, n_steps)

    # reshape from [samples, timesteps] into [samples, timesteps, features]
    n_features = 1
    X = X.reshape((X.shape[0], X.shape[1], n_features))

    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_seq_len, shuffle=False)

    # create the model
    model, es = create_model()

    # fit model on the training windows only: fitting on the whole X, y
    # would let the model see the test windows it is scored on. The
    # validation loss used by early stopping comes from the last 10% of the
    # training windows, so the test set stays untouched.
    model.fit(X_train
              , y_train
              , epochs=epochs
              , batch_size=batch_size
              , verbose=1
              , shuffle=False
              , validation_split=0.1
              , callbacks=[es]
             )

    # predict
    y_pred = model.predict(X_test)

    # inverse transform
    y_train = scaler.inverse_transform(y_train)
    y_pred = scaler.inverse_transform(y_pred)
    # the expected values are taken from the raw (unscaled) series
    y_test = raw_seq[-test_seq_len:]

    # evaluate the model
    scores = compute_scores(y_train, y_test, y_pred)

    return scores


def log_test_result(model_name, product_name, scores, features="TimeSeries"):
    """Describe one experiment run as a flat dictionary.

    The record starts with the model and product names and the module-level
    experiment parameters, and is then completed with every entry of
    ``scores`` (an entry with the same key replaces the parameter).
    ``drop_out`` is always recorded as an empty string.
    """
    experiment_setup = [
        ("model_name", model_name),
        ("product_name", product_name),
        ("test_size", test_size),
        ("n_steps", n_steps),
        ("features", features),
        ("neurons", neurons),
        ("drop_out", ""),
        ("epochs", epochs),
        ("max_sequential_nan", max_sequential_nan),
        ("minimum_temporal_points", minimum_temporal_points),
    ]

    record = dict(experiment_setup)
    record.update(scores)
    return record


# Figure size (width, height) used by the forecast plots
resolution = (14,6)

def plot_forecast(model_name, dict_results):
    """Plot the observed prices together with the forecast of a model.

    ``dict_results`` must hold the ``y_train``, ``y_true`` and ``y_pred``
    entries; the first element of each is plotted. The forecast line is left
    empty over the training period and starts where the test values start.
    """
    train_values, test_values, predicted_values = (
        dict_results[key][0] for key in ('y_train', 'y_true', 'y_pred')
    )
    n_train = len(train_values)

    # no forecast is available for the training period
    forecast_column = [np.nan] * n_train + list(predicted_values)
    price_column = list(np.reshape(train_values, n_train)) + list(test_values)
    df_plot = pd.DataFrame({'forecast': forecast_column, 'price': price_column})

    chart_title = 'Price ({0})'.format(model_name)
    df_plot['price'].plot(figsize=resolution, title=chart_title, grid=True)
    df_plot['forecast'].plot()

    plt.legend(loc=1)
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.show()

### Prediction performance

In [None]:
%time

doExperiment = True
if doExperiment:
    # load the dataset
    df_data = load_data()

    # list of products 
    lst_all_products = list(df_data['product'].unique())

    # prediction results list
    lst_results = []

    for product_name in tqdm_notebook(lst_all_products): 
        # model name
        model_name = "lstm_fforward"

        # filter by product
        df_product = df_data[df_data['product']==product_name]

        # perform the prediction
        print("product_name", product_name)
        scores = univariate_lstm_with_test_set(df_product, "price_mean")
        result_by_product = log_test_result(model_name, product_name, scores)

        # plot the results
        # plot_forecast(model_name, result_by_product)

        # log the results
        lst_results.append(result_by_product)

    # save results
    df_results = get_dataframe_results(lst_results)
    df_results.to_csv("../results/results_lstm_fforward.csv", index=False, encoding = 'utf-8')
    df_results

## 3. Train and Save the models for the API

In [None]:

def univariate_lstm_save_models(df_data, col_value):
    """Train the LSTM on the whole series of a product.

    Unlike the evaluation routine no test set is held out: the scaler and
    the model are fitted on every observation, so that they can be saved and
    served afterwards.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Observations of a single product, sorted by time.
    col_value : str
        Name of the column with the values to forecast.

    Returns
    -------
    tuple
        ``(scaler, model)``: the fitted ``MinMaxScaler`` and the trained
        model.
    """
    # define input sequence as a column vector
    raw_seq = np.array(df_data[col_value].tolist()).reshape(-1, 1)

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaler = scaler.fit(raw_seq)

    # transform the whole data
    scaled_seq = scaler.transform(raw_seq)

    # split into samples
    X, y = split_sequence(scaled_seq, n_steps)

    # reshape from [samples, timesteps] into [samples, timesteps, features]
    n_features = 1
    X = X.reshape((X.shape[0], X.shape[1], n_features))

    # define the model; the returned objects must be kept, otherwise
    # `model` and `es` below are undefined names
    model, es = create_model()

    # fit model
    # NOTE(review): the callback monitors 'val_loss' but no validation data
    # is given here, so early stopping presumably never triggers and all the
    # epochs run - confirm whether a validation split is wanted.
    model.fit(X
              , y
              , epochs=epochs
              , batch_size=batch_size
              , verbose=0
              , shuffle=False
              , callbacks=[es])

    # return the scaler and the model
    return scaler, model


In [None]:
%time
# load the dataset
df_data = load_data()

# prediction results list
dict_models = {}

# list of products 
lst_all_products = list(df_data['product'].unique())

for product_name in tqdm_notebook(lst_all_products):
    # model name
    model_name = "lstm_fforward"
    
    # filter by product
    df_product = df_data[df_data['product']==product_name]
    
    # perform the prediction
    # print("product_name", product_name)
    scaler, model = univariate_lstm_save_models(df_product, "price_mean")
    
    mapping_name_id = dict_settings["product_name_id_mapping"]
    product_id = mapping_name_id[product_name]
    
    # save model
    model.save("models/product_id_{0}.h5".format(product_id))
    # pickle.dump(model, open("models/product_id_{0}.pkl".format(product_id), 'wb'))
    
    # save scaler
    pickle.dump(scaler, open('scalers/product_id_{0}.pkl'.format(product_id), 'wb'))

    
with open('settings.json', 'w', encoding='utf8') as outfile:
    json.dump(dict_settings, outfile, ensure_ascii=False)


## 4 Baselines

### Baseline Average Window

In [None]:

def baseline_moving_average(df_product, col_value):
    """Score a moving-average forecast on the last part of a series.

    The forecast of each test day is the mean of the (up to) 15 observations
    that precede it. The last ``test_size`` part of the series (a fraction
    when below 1, a number of observations otherwise) is used as test set.

    Parameters
    ----------
    df_product : pandas.DataFrame
        Observations of a single product, sorted by time.
    col_value : str
        Name of the column with the values to forecast.

    Returns
    -------
    dict
        The output of ``compute_scores`` (error metrics and the series).

    Raises
    ------
    ValueError
        If the test set would be empty or would cover the whole series.
    """
    # size of the averaging window (local: independent of the LSTM n_steps)
    n_steps = 15

    x = pd.Series(df_product[col_value])

    if test_size < 1:
        test_seq_len = int(test_size * len(x))
    else:
        test_seq_len = int(test_size)

    # with a zero length the negative slices below would select everything
    if test_seq_len < 1 or test_seq_len >= len(x):
        raise ValueError(
            "cannot hold out {0} of {1} observations for testing".format(test_seq_len, len(x)))

    y_test = x.iloc[-test_seq_len:]
    y_train = x.iloc[:-test_seq_len]

    # shift(1) makes the window end on the day BEFORE the forecast day;
    # without it the average contains the very value it has to predict.
    # min_periods=1 uses the available history when it is shorter than the
    # window.
    previous_days_mean = x.rolling(window=n_steps, min_periods=1).mean().shift(1)
    y_pred = previous_days_mean.iloc[-test_seq_len:].values

    # evaluate the forecast
    scores = compute_scores(y_train, y_test, y_pred, reshape=False)

    return scores


In [None]:
%time
load the dataset
df_data = load_data()

# prediction results list
lst_results_bas_mavg = []

# list of products 
lst_all_products = list(df_data['product'].unique())

for product_name in tqdm_notebook(lst_all_products): 
    # model name
    model_name = "moving_avg"
    
    # filter by product
    df_product = df_data[df_data['product']==product_name]
    
    # perform the prediction
    scores = baseline_moving_average(df_product, "price_mean")
    result_by_product = log_test_result(model_name, product_name, scores)
    
    # log the results
    lst_results_bas_mavg.append(result_by_product)
    


In [None]:
# build the report table of the moving-average baseline
df_bas2 = get_dataframe_results(lst_results_bas_mavg)
# store the report next to the other results
df_bas2.to_csv("../results/results_bas_moving_avg.csv", index=False, encoding = 'utf-8')
# display the report
df_bas2