# Modelling

### Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.arima.model import ARIMA

### Data

In [2]:
# Read the monthly sentiment series from the additive-decomposition CSV.
# The first CSV column becomes the index and is parsed as dates.
sentiment_per_month = pd.read_csv(
    'features/sentiment_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)

# Display the loaded frame as the cell output.
sentiment_per_month

Unnamed: 0_level_0,sentiment,trend,seasonal,residual
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-05-31,1.000000,,0.017662,
2005-06-30,1.000000,,-0.021048,
2005-07-31,1.000000,,-0.035035,
2005-08-31,0.500000,0.714286,0.009044,-0.223330
2005-09-30,0.500000,0.714286,0.014301,-0.228586
...,...,...,...,...
2021-09-30,0.447143,0.483185,0.017662,-0.053704
2021-10-31,0.512363,0.517819,-0.021048,0.015592
2021-11-30,0.537600,,-0.035035,
2021-12-31,0.488746,,0.009044,


In [None]:
# Read the per-category sentiment CSVs (restaurants, nightlife, breakfast & brunch).
# Each file is loaded with its first column as a date-parsed index.
# NOTE(review): the same three files are read again in a later cell of this notebook.

sentiment_restaurants_per_month_additive = pd.read_csv(
    'features/sentiment_restaurants_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)
sentiment_nightlife_per_month_additive = pd.read_csv(
    'features/sentiment_nightlife_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)
sentiment_breakfast_brunch_per_month_additive = pd.read_csv(
    'features/sentiment_breakfast_brunch_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)

In [53]:
# Expanding-window (rolling-origin) cross-validation for a one-step-ahead forecaster

def cross_validation_predictions(data, model, min_train_size=12):
    """Collect one-step-ahead predictions over an expanding training window.

    For every position ``i`` from ``min_train_size`` to ``len(data) - 1`` the
    model is called on ``data.iloc[:i]`` and its output is compared against the
    observation at position ``i``.

    Parameters
    ----------
    data : pandas.Series
        Series to forecast, ordered in time (assumed to be a Series: it is
        sliced with ``.iloc`` and a single observation is read per step).
    model : callable
        Called as ``model(train)``. It may return a scalar or any array-like /
        Series holding the one-step forecast; only the first value is used.
    min_train_size : int, default 12
        Number of leading observations in the first training window.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` (index label of the predicted observation),
        ``actual`` and ``prediction``, one row per evaluated step, indexed by
        the predicted observation's label. Empty (columns only) when there are
        no steps to evaluate.
    """
    columns = ['date', 'actual', 'prediction']
    records = []
    for i in range(min_train_size, len(data)):
        train = data.iloc[:i]
        # Reduce the forecast to a plain float so its own index (if any) can
        # never misalign with the test row and create extra NaN rows.
        prediction = np.asarray(model(train), dtype=float).ravel()[0]
        records.append({
            'date': data.index[i],
            'actual': data.iloc[i],
            'prediction': prediction,
        })

    if not records:
        return pd.DataFrame(columns=columns)

    # Build the frame once instead of concatenating inside the loop.
    row_labels = pd.Index([record['date'] for record in records], name=data.index.name)
    return pd.DataFrame(records, columns=columns, index=row_labels)

# Evaluation Function (RMSE, MAE, MAPE; MASE is not computed yet)
# TODO: Look into the Information Criteria and Residual Diagnostic Tests
# TODO: Add MASE

def evaluate_model(data, model, min_train_size=12):
    """Score a forecaster on its cross-validated one-step-ahead predictions.

    Runs ``cross_validation_predictions`` with the given arguments and compares
    the resulting ``actual`` and ``prediction`` columns.

    Returns a single-row DataFrame (index ``0``) with columns ``rmse``, ``mae``
    and ``mape``.
    """
    cv_frame = cross_validation_predictions(data, model, min_train_size=min_train_size)
    y_true = cv_frame['actual']
    y_pred = cv_frame['prediction']

    scores = {
        # RMSE is the square root of the mean squared error.
        'rmse': np.sqrt(mean_squared_error(y_true, y_pred)),
        'mae': mean_absolute_error(y_true, y_pred),
        'mape': mean_absolute_percentage_error(y_true, y_pred),
    }
    return pd.DataFrame(scores, index=[0])

In [54]:
# Baseline Model Function
def baseline_model(data, method='mean'):
    """Return a single-value baseline forecast computed from ``data``.

    Parameters
    ----------
    data : pandas.Series
        Training observations (must support ``.mean()``, ``.median()`` and
        ``.iloc``).
    method : {'mean', 'median', 'naive'}, default 'mean'
        ``'mean'`` and ``'median'`` return that statistic of ``data``;
        ``'naive'`` returns the last observation.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'mean':
        return data.mean()
    elif method == 'median':
        return data.median()
    elif method == 'naive':
        # Positional access: ``data[-1]`` is a label lookup on a Series and
        # fails on an integer index (and is deprecated for other index types).
        return data.iloc[-1]
    else:
        raise ValueError('Method not supported')

In [55]:
# Settings forwarded to the statsmodels forecasters below.
FREQUENCY = 'M'        # passed as `freq`
SEASONAL_PERIOD = 7    # passed as `seasonal_periods`

# Every entry maps a model name to a callable that takes the training series
# and returns its forecast for the next step.

# Baselines only: one entry per strategy supported by `baseline_model`.
models_with_decomposition = {
    f'baseline_{method}': (lambda x, method=method: baseline_model(x, method))
    for method in ('mean', 'median', 'naive')
}

# The same baselines followed by the statsmodels forecasters.
models_without_decomposition = {
    **models_with_decomposition,
    'holt': lambda x: (
        ExponentialSmoothing(x, trend='add', freq=FREQUENCY, seasonal_periods=SEASONAL_PERIOD)
        .fit()
        .forecast(1)
    ),
    'holt_winters': lambda x: (
        ExponentialSmoothing(x, trend='add', seasonal='add', freq=FREQUENCY, seasonal_periods=SEASONAL_PERIOD)
        .fit()
        .forecast(1)
    ),
    'arima': lambda x: (
        ARIMA(x, order=(1, 1, 1), freq=FREQUENCY)
        .fit()
        .forecast(1)
    ),
}

In [56]:
# Evaluate every configured model on the trend component of the overall
# sentiment series (rows where the trend is NaN are dropped first).
trend_data = sentiment_per_month['trend'].dropna()

# Collect the single-row score frames and concatenate once at the end, rather
# than growing a DataFrame inside the loop.
result_frames = []
model_groups = (
    ('', models_without_decomposition),
    ('_decomposition', models_with_decomposition),
)
for name_suffix, models in model_groups:
    for model_name, model in models.items():
        results = evaluate_model(trend_data, model, min_train_size=24)
        results['name'] = model_name + name_suffix
        result_frames.append(results)

# `ignore_index=True` gives each model its own row label instead of every row
# being labelled 0. `mase` is kept as a column; evaluate_model does not fill it.
all_models_results = (
    pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
).reindex(columns=['name', 'rmse', 'mae', 'mape', 'mase'])

all_models_results

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

Unnamed: 0,name,rmse,mae,mape,mase
0,baseline_mean,0.136492,0.117234,0.280201,
0,baseline_median,0.111963,0.091693,0.216692,
0,baseline_naive,0.020167,0.014906,0.033082,
0,holt,0.021111,0.015183,0.033503,
0,holt_winters,0.023304,0.016227,0.035523,
0,arima,0.019941,0.014322,0.03186,
0,baseline_mean_decomposition,0.136492,0.117234,0.280201,
0,baseline_median_decomposition,0.111963,0.091693,0.216692,
0,baseline_naive_decomposition,0.020167,0.014906,0.033082,


# Evaluating the models on the category sentiment time series

In [57]:
# Read the per-category sentiment CSVs used by the evaluation below.
# Each file is loaded with its first column as a date-parsed index.
sentiment_restaurants_per_month_additive = pd.read_csv(
    'features/sentiment_restaurants_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)
sentiment_nightlife_per_month_additive = pd.read_csv(
    'features/sentiment_nightlife_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)
sentiment_breakfast_brunch_per_month_additive = pd.read_csv(
    'features/sentiment_breakfast_brunch_per_month_additive.csv',
    index_col=0,
    parse_dates=True,
)

In [58]:
# Category series to evaluate: each entry pairs a label with its loaded frame.
cat_sentiments_dict = [{'name': 'restaurants', 'data': sentiment_restaurants_per_month_additive},
                       {'name': 'nightlife', 'data': sentiment_nightlife_per_month_additive},
                       {'name': 'breakfast_brunch', 'data': sentiment_breakfast_brunch_per_month_additive}]

# Collect the single-row score frames and concatenate once at the end, rather
# than growing a DataFrame inside the loops.
result_frames = []
model_groups = (
    ('', models_without_decomposition),
    ('_decomposition', models_with_decomposition),
)
for cat_sentiment in cat_sentiments_dict:
    # Models are scored on the category's trend component, NaN rows removed.
    trend_data = cat_sentiment['data']['trend'].dropna()
    for name_suffix, models in model_groups:
        for model_name, model in models.items():
            results = evaluate_model(trend_data, model, min_train_size=24)
            results['name'] = cat_sentiment['name'] + '_' + model_name + name_suffix
            result_frames.append(results)

# NOTE(review): this rebinds `all_models_results`, replacing any earlier value
# held under that name.
# `ignore_index=True` gives each row its own label instead of every row being
# labelled 0. `mase` is kept as a column; evaluate_model does not fill it.
all_models_results = (
    pd.concat(result_frames, ignore_index=True) if result_frames else pd.DataFrame()
).reindex(columns=['name', 'rmse', 'mae', 'mape', 'mase'])

all_models_results

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._

Unnamed: 0,name,rmse,mae,mape,mase
0,restaurants_baseline_mean,0.08876,0.07761,0.163263,
0,restaurants_baseline_median,0.069844,0.060644,0.125874,
0,restaurants_baseline_naive,0.010572,0.007256,0.014294,
0,restaurants_holt,0.010356,0.006652,0.01304,
0,restaurants_holt_winters,0.01301,0.009122,0.018111,
0,restaurants_arima,0.00726,0.004975,0.009799,
0,restaurants_baseline_mean_decomposition,0.08876,0.07761,0.163263,
0,restaurants_baseline_median_decomposition,0.069844,0.060644,0.125874,
0,restaurants_baseline_naive_decomposition,0.010572,0.007256,0.014294,
0,nightlife_baseline_mean,0.060578,0.049066,0.098387,
