In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pmdarima import auto_arima
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from matplotlib.backends.backend_pdf import PdfPages
from statsmodels.tsa.arima.model import ARIMAResults
from pmdarima.arima import AutoARIMA
from statsmodels.tsa.seasonal import STL
from autots import AutoTS
from IPython.display import Image
import sys
import logging
from autots.models.model_list import model_lists
import json
from contextlib import contextmanager
import os
from helpers.constants import START_FORECASTING_YEAR, FORECASTING_LENGTH

# 1. Forecasting values

<b> Important Assumption:</b>

1. For models where testing was impossible due to:
- Insufficient data for testing.
- Insufficient data for training.
  
2. For models where the prediction was constant (due to errors in forecasting):

In both cases it is assumed that the model could not be tested, and therefore no model can be chosen for forecasting. This assumption holds even if sufficient training data is now available (since we are no longer partitioning the data for testing), or if the prediction would not have been constant under normal forecasting conditions.

<b> Action: </b>

In such cases, the LAST AVAILABLE VALUE will be chosen for forecasting.

### Read all data

In [15]:
# Load the cleaned FAO animal dataset from its CSV export.
csv_file = 'data/AOSTAT_animal_2022_cleaned.csv'
data_df = pd.read_csv(csv_file)

# Number of rows available for every (Area, Item) pair.
unique_combinations = (
    data_df
    .groupby(['Area', 'Item'])
    .size()
    .reset_index(name='Count')
)

# Show the pairs together with their row counts.
print(unique_combinations)

             Area          Item  Count
0     Afghanistan         Asses     62
1     Afghanistan        Camels     62
2     Afghanistan        Cattle     62
3     Afghanistan         Goats     62
4     Afghanistan        Horses     62
...           ...           ...    ...
1163     Zimbabwe        Cattle     62
1164     Zimbabwe         Goats     62
1165     Zimbabwe        Horses     62
1166     Zimbabwe         Sheep     62
1167     Zimbabwe  Swine / pigs     62

[1168 rows x 3 columns]


In [26]:
# Load the AutoARIMA evaluation workbook (path is relative to the working directory).
excel_file = 'ARIMA_evaluation_results_all.xlsx'
df = pd.read_excel(excel_file)

# Keep AutoARIMA rows only when they ended in a model-based prediction; rows carrying
# any other flag (e.g. not enough data, no training data) are kept unconditionally.
is_arima = df['Flag results'] == 'AutoARIMA'
predicted_by_model = df['Predictions'] == 'MODEL'
df_filtered = df[(is_arima & predicted_by_model) | ~is_arima]

# Restrict to the columns shared by all result tables.
result_columns = ['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']
arima_results_df = df_filtered[result_columns]

# How many rows fall under each 'Flag results' value.
flag_results_counts = arima_results_df['Flag results'].value_counts()
print(flag_results_counts)


Flag results
AutoARIMA     589
NoTestData     48
No Data        45
constant        1
Name: count, dtype: int64


In [46]:

# Load the AutoTS evaluation workbook (path is relative to the working directory).
excel_file = 'AUTOTS_evaluation_results_all.xlsx'
df = pd.read_excel(excel_file)

# Keep only the rows that AutoTS handled and that ended in a model-based prediction.
handled_by_autots = df['Flag results'] == 'AutoTS'
predicted_by_model = df['Predictions'] == 'MODEL'
df_filtered = df[handled_by_autots & predicted_by_model]

# Restrict to the columns shared by all result tables.
result_columns = ['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']
autoTS_results_df = df_filtered[result_columns]

# How many rows fall under each 'Flag results' value.
flag_results_counts = autoTS_results_df['Flag results'].value_counts()
print(flag_results_counts)


Flag results
AutoTS    198
Name: count, dtype: int64


In [48]:

# Load the AutoTS-with-ensemble evaluation workbook (path is relative to the working directory).
excel_file = 'AUTOTS_evaluation_results_ensemble_all.xlsx'
results_autots_ensemble_df = pd.read_excel(excel_file)

# Every ensemble row is kept, failed or not, because no further model family is tried afterwards.
result_columns = ['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']
autoTS_esemble_results_df = results_autots_ensemble_df[result_columns]

# How many rows fall under each 'Flag results' value.
flag_results_counts = autoTS_esemble_results_df['Flag results'].value_counts()
print(flag_results_counts)


Flag results
AutoTS-ensemble    281
Unknown Model        6
Name: count, dtype: int64


### Merge all result dataframes

In [58]:
# Stack the three result tables (ensemble first, then plain AutoTS, then ARIMA)
# under a fresh 0..n-1 index.
frames_in_order = [autoTS_esemble_results_df, autoTS_results_df, arima_results_df]
results_df = pd.concat(frames_in_order, ignore_index=True)

# Persist the combined table to an Excel workbook, without the index column.
excel_file = 'final_results_df_concatenated.xlsx'
results_df.to_excel(excel_file, index=False)

# Display the combined table as the cell output.
results_df

Unnamed: 0,Country,Animal Type,Flag results,Model_type,Flag,Predictions
0,Benin,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
1,Kyrgyzstan,Horses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
2,Greece,Goats,AutoTS-ensemble,,AUTOTS-InsideCINotMet,LASTVALUE
3,Togo,Asses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
4,Honduras,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
...,...,...,...,...,...,...
1163,Yugoslav SFR,Buffalo,NoTestData,,NoTestData,LASTVALUE
1164,Yugoslav SFR,Cattle,NoTestData,,NoTestData,LASTVALUE
1165,Yugoslav SFR,Horses,NoTestData,,NoTestData,LASTVALUE
1166,Yugoslav SFR,Sheep,NoTestData,,NoTestData,LASTVALUE


#### Count the proportion of combinations for which we could not obtain a "MODEL" prediction

In [51]:
# Frequency of each distinct 'Predictions' value.
predictions_counts = results_df['Predictions'].value_counts()

# Express every frequency as a percentage of all rows in results_df.
total_rows = len(results_df)
percentage_counts = predictions_counts.div(total_rows).mul(100)

# Show the percentages.
print(percentage_counts)

Predictions
MODEL        72.003425
LASTVALUE    27.996575
Name: count, dtype: float64


In [55]:
# Frequency of each distinct 'Flag results' value
# (the variable keeps the name `predictions_counts` used by the neighbouring cells).
predictions_counts = results_df['Flag results'].value_counts()

# Express every frequency as a percentage of all rows in results_df.
total_rows = len(results_df)
percentage_counts = predictions_counts.div(total_rows).mul(100)

# Show the percentages.
print(percentage_counts)

Flag results
AutoARIMA          50.428082
AutoTS-ensemble    24.058219
AutoTS             16.952055
NoTestData          4.109589
No Data             3.852740
Unknown Model       0.513699
constant            0.085616
Name: count, dtype: float64


In [56]:
# Frequency of each distinct 'Flag' value
# (the variable keeps the name `predictions_counts` used by the neighbouring cells).
predictions_counts = results_df['Flag'].value_counts()

# Express every frequency as a percentage of all rows in results_df.
total_rows = len(results_df)
percentage_counts = predictions_counts.div(total_rows).mul(100)

# Show the percentages.
print(percentage_counts)

Flag
AutoARIMA                50.428082
AUTOTS-InsideCINotMet    15.667808
AutoTS-ARIMA              8.647260
AutoTS-ETS                7.791096
AutoTS-BestN              4.708904
NoTestData                4.109589
No Data                   3.852740
AutoTS-nan                3.681507
Unknown Model-nan         0.513699
AutoTS-FBProphet          0.513699
constant                  0.085616
Name: count, dtype: float64


### Perform forecasting

In [60]:

# Partition the (Country, Animal Type) pairs by how their forecast will be produced:
# through a fitted model ('MODEL') or by carrying the last observation forward ('LASTVALUE').
pair_columns = ['Country', 'Animal Type']
uses_model = results_df['Predictions'] == 'MODEL'
uses_last_value = results_df['Predictions'] == 'LASTVALUE'

unique_combinations_modelling = results_df.loc[uses_model, pair_columns].drop_duplicates()
unique_combinations_last_value = results_df.loc[uses_last_value, pair_columns].drop_duplicates()

Unnamed: 0,Country,Animal Type
0,Benin,Swine / pigs
1,Kyrgyzstan,Horses
2,Greece,Goats
3,Togo,Asses
4,Honduras,Swine / pigs
...,...,...
1163,Yugoslav SFR,Buffalo
1164,Yugoslav SFR,Cattle
1165,Yugoslav SFR,Horses
1166,Yugoslav SFR,Sheep


In [71]:
# Spot-check a single combination: was any AutoTS model type recorded for Kyrgyzstan / Horses?
results_filtered_df = results_df[(results_df['Country'] == 'Kyrgyzstan') & (results_df['Animal Type'] == 'Horses')]

# The original cell ended in an `if ...:` without a body, which is a syntax error.
# The check is now the cell's last expression so its boolean result is displayed.
# `na=False` makes rows with a missing Model_type count as "no match".
results_filtered_df['Model_type'].str.contains('AutoTS', na=False).any()
  

In [None]:
# Forecast a random sample of modelled (Country, Animal Type) combinations, collect the
# forecasts in `forecast_df`, and write one plot per combination to 'forecasted_values.pdf'.
#
# NOTE(review): this cell relies on names that are not defined above it in the notebook:
#   - `evaluate_AUTOTS_model` is defined in a later cell (run that cell first);
#   - `forecast_arima`, `df_autots` and `df_arima` are not defined anywhere in the visible
#     notebook and must be provided before running.
# NOTE(review): `evaluate_AUTOTS_model` splits at its test year and predicts the test period;
#   its predictions are labelled here with the forecast years below. Confirm this is intended
#   or replace it with a true out-of-sample forecast.

# Define the number of combinations to select
num_combinations = 5

# Ensemble mode passed to AutoTS for combinations flagged 'AutoTS-ensemble'.
# NOTE(review): 'simple' is the only literal value the original cell passed — confirm it
# matches the setting used in the evaluation run.
ensemble_method = 'simple'

# Forecast horizon: 2024 through 2034 inclusive.
# NOTE(review): START_FORECASTING_YEAR / FORECASTING_LENGTH are imported at the top of the
# notebook and presumably should define this range — confirm their values before switching.
forecast_years = list(range(2024, 2035))

forecast_columns = ['Country', 'Animal Type', 'Year', 'Forecasted Value', 'Lower CI', 'Upper CI', 'Model Type', 'Prediction', 'Flag']


def _mentions(results_rows, label):
    """Return True when `label` occurs in the 'Model_type' or 'Flag results' column of `results_rows`."""
    # Both columns are checked because the concatenated results carry 'AutoTS-ensemble' in
    # 'Flag results' while 'Model_type' holds values such as 'AutoTS BestN' or NaN.
    for column in ('Model_type', 'Flag results'):
        if results_rows[column].astype(str).str.contains(label, regex=False).any():
            return True
    return False


# Randomly select a subset of unique country-animal combinations
random_combinations = unique_combinations_modelling.sample(num_combinations, replace=False, random_state=123)
evaluation_results = {}

# Initialize a dictionary to store error information
error_log = {}

# Forecast rows are gathered in a plain list and turned into a DataFrame once, after the loop
# (DataFrame.append no longer exists in pandas 2.x).
forecast_records = []

# Iterate over the sampled combinations
with PdfPages('forecasted_values.pdf') as pdf:
    for _, row in random_combinations.iterrows():
        title_color = 'red'
        country = row['Country']
        animal_type = row['Animal Type']

        # The FAO frame is keyed by 'Area' / 'Item'; the results frame by 'Country' / 'Animal Type'.
        filtered_df = data_df[(data_df['Area'] == country) & (data_df['Item'] == animal_type)]
        results_filtered_df = results_df[(results_df['Country'] == country) & (results_df['Animal Type'] == animal_type)]

        try:
            # Pick the model family from the evaluation results of this combination.
            if _mentions(results_filtered_df, 'AutoTS'):
                autots_kwargs = {}
                if _mentions(results_filtered_df, 'AutoTS-ensemble'):
                    autots_kwargs['ensemble'] = ensemble_method
                # NOTE(review): evaluate_AUTOTS_model has no parameter to pin the model chosen
                # during evaluation, so AutoTS repeats its model search here.
                results = evaluate_AUTOTS_model(df_autots, country, animal_type, **autots_kwargs)
                (model_type, selected_model, constant_prediction, test_values,
                 test_years, predictions, conf_int, proportion_in_ci) = results
                evaluation_results[(country, animal_type)] = (model_type, selected_model, constant_prediction, proportion_in_ci)

            elif _mentions(results_filtered_df, 'AutoARIMA'):
                model_type, constant_prediction, predictions, conf_int = forecast_arima(df_arima, country, animal_type)
                # The ARIMA path has no separate "selected model" label; reuse the model type
                # so the plot title never shows a stale or undefined value.
                selected_model = model_type
            else:
                raise ValueError("No valid model type found.")

            if predictions is None or conf_int is None:
                raise ValueError("Model returned no predictions or no confidence interval.")

            # AutoTS returns its forecast as a DataFrame with a 'Value' column; iterating a
            # DataFrame yields column labels, so reduce everything to flat numeric arrays first.
            if isinstance(predictions, pd.DataFrame):
                predictions = predictions['Value']
            forecast_values = np.asarray(predictions, dtype=float).ravel()
            conf_int = np.asarray(conf_int, dtype=float)
            lower_ci = conf_int[:, 0]
            upper_ci = conf_int[:, 1]

            # Use only as many points as every sequence can provide (mirrors zip truncation).
            n_points = min(len(forecast_years), len(forecast_values), len(lower_ci), len(upper_ci))
            plot_years = forecast_years[:n_points]
            plot_values = forecast_values[:n_points]

            # Record the forecasted values
            forecast_records.extend(
                {
                    'Country': country,
                    'Animal Type': animal_type,
                    'Year': year,
                    'Forecasted Value': forecast_val,
                    'Lower CI': lc,
                    'Upper CI': uc,
                    'Model Type': model_type,
                    'Prediction': 'MODEL',
                    'Flag': 'Forecasted'
                }
                for year, forecast_val, lc, uc in zip(plot_years, plot_values, lower_ci, upper_ci)
            )

            # Plot actual vs. forecasted values; the figure is always closed, even on failure.
            fig, ax = plt.subplots(figsize=(8, 4))
            try:
                ax.plot(filtered_df['Year'], filtered_df['Value'], label='Actual Data')
                ax.plot(plot_years, plot_values, label='Forecasted Data')
                ax.set_xlabel('Year')
                ax.set_ylabel('Population')
                title_text_obj = ax.set_title(f'{selected_model} prediction for {country} - {animal_type}')
                title_text_obj.set_color(title_color)
                ax.legend()
                ax.grid(True)

                # Save the current plot to the PDF
                pdf.savefig(fig)
            finally:
                plt.close(fig)

        except Exception as e:
            # Best effort: log the error (with its type, since e.g. KeyError messages are terse)
            # and continue with the next combination.
            error_log[(country, animal_type)] = f"{type(e).__name__}: {e}"
            continue

# Build the forecast table in one go
forecast_df = pd.DataFrame(forecast_records, columns=forecast_columns)

# After the loop, you can inspect the forecast_df containing the forecasted values and details
print("Forecast DataFrame:")
print(forecast_df)

# After the loop, you can inspect the error_log if needed
print("\nErrors encountered:")
for key, error_message in error_log.items():
    print(f"For {key}: {error_message}")

In [None]:
def fit_autots_model(df, country, animal_type, data_length = MIN_TRAINING_SAMPLES, forecast_length = 1, ensemble = None ):
    """
    Run an AutoTS model search on one time series.

    Parameters:
    -----------
    df : pd.DataFrame
        Series to fit. The columns 'Value' and 'Year_datetime' are read.
    country : str
        Country the series belongs to (not read by the function body).
    animal_type : str
        Animal type the series belongs to (not read by the function body).
    data_length : int, optional
        Minimum number of rows required before fitting. Defaults to MIN_TRAINING_SAMPLES.
    forecast_length : int, optional
        Number of periods AutoTS is configured to forecast. Default 1.
    ensemble : str or None, optional
        Ensemble setting forwarded to AutoTS
        ('simple', 'horizontal', 'vertical', 'stacked', 'all'). Default None.

    Returns:
    --------
    dict
        {'model': <fitted AutoTS or None>, 'type': <str>} where 'type' is
        'AutoTS' after a successful fit, 'No Data' for an empty or too-short series,
        'constant' when 'Value' has a single distinct value, and 'Error' when a
        ValueError is raised while checking or fitting.
    """

    # Candidate model families for the search
    candidate_models = ['ARIMA', 'FBProphet', 'ETS']

    # AutoTS configuration, built before any data check takes place
    autots_settings = dict(
        forecast_length=forecast_length,           # periods to forecast
        frequency='infer',                         # let AutoTS detect the frequency
        prediction_interval=CONFIDENCE_INTERVAL,   # level of the prediction interval
        ensemble=ensemble,                         # ensemble mode, if any
        transformer_list="fast",
        max_generations=4,                         # generations of the model search
        num_validations=2,                         # number of validation rounds
        validation_method='backwards',
        model_list=candidate_models,
        no_negatives=True,                         # forecasts are kept non-negative
        n_jobs='auto',
    )
    searcher = AutoTS(**autots_settings)

    try:
        series = df

        # Nothing to fit: empty frame or fewer rows than required
        if series.empty or len(series) < data_length:
            return {'model': None, 'type': 'No Data'}

        # A series with one distinct value is reported as constant, without fitting
        if series['Value'].nunique() == 1:
            return {'model': None, 'type': 'constant'}

        # Fit while silencing AutoTS console output
        with suppress_stdout():
            fitted = searcher.fit(series, date_col='Year_datetime', value_col='Value')

        return {'model': fitted, 'type': 'AutoTS'}

    except ValueError:
        return {'model': None, 'type': 'Error'}

In [None]:
def evaluate_AUTOTS_model(df, country, animal_type, test_start_year = TEST_YEAR, ensemble = None):
    """
    Evaluate an AutoTS model for a specific country and animal type on a held-out period.

    The series is split at `test_start_year`: earlier rows train the model (via
    `fit_autots_model`), and rows from that year onwards are the test set the
    predictions are compared against.

    Parameters:
    -----------
    df : pandas.DataFrame
        Time series data with columns 'Year', 'Area', 'Item' and 'Value'
        (`fit_autots_model` additionally reads 'Year_datetime' from the training rows).
    country : str
        The country ('Area') for which the model is to be evaluated.
    animal_type : str
        The type of animal ('Item') for which the model is to be evaluated.
    test_start_year : int
        First year of the test set. Defaults to TEST_YEAR.
    ensemble : str or None
        Ensemble setting forwarded to AutoTS
        ('simple', 'horizontal', 'vertical', 'stacked', 'all'). Default None.

    Returns:
    --------
    tuple
        An 8-tuple containing:
        - model_type : str
            'AutoTS' on success; 'No Data' when there is no test data; 'Unknown Model' when
            no model could be fitted (whatever the underlying reason) or the type is unrecognised.
        - selected_model : str or None
            Name of the model AutoTS selected, or None if no model is available.
        - constant_prediction : bool
            True when all predicted values are identical.
        - actual_values : pandas.Series
            'Value' column of the test set (filled with None when no model was evaluated).
        - test_years : pandas.Series
            'Year' column of the test set (filled with None when no model was evaluated).
        - predictions : pandas.DataFrame, numpy.ndarray or None
            Predicted values for the test period, or None if no model is available.
        - conf_int : numpy.ndarray or None
            Two-column array (lower, upper) of prediction bounds, or None if no model is
            available or the prediction is constant.
        - proportion_in_ci : float or None
            Result of `calculate_percentage_within_ci` for the test values, or None when
            `conf_int` is None.
    """

    # Split the requested series into training and test periods
    is_series = (df['Area'] == country) & (df['Item'] == animal_type)
    train_data = df[is_series & (df['Year'] < test_start_year)].copy()
    test_data = df[is_series & (df['Year'] >= test_start_year)].copy()
    forecast_length = len(test_data)

    if test_data.empty:
        # No test data available
        return _unevaluated_autots_result(test_data, 'No Data')

    # Fit the model on the training data
    model_info = fit_autots_model(train_data, country, animal_type, forecast_length = forecast_length, ensemble = ensemble)
    model = model_info['model']
    model_type = model_info['type']

    if model is None:
        # Model could not be fitted; the specific reason reported by fit_autots_model
        # is deliberately collapsed into 'Unknown Model', as before.
        return _unevaluated_autots_result(test_data, 'Unknown Model')

    # The selected model's name is stored differently for ensemble and non-ensemble runs
    if not model.ensemble:
        selected_model = model.best_model['Model'].values[0]
    else:
        def extract_model_name(json_str):
            """Return 'model_name' from a JSON parameter string, or None if it cannot be parsed."""
            try:
                return json.loads(json_str).get('model_name')
            except (json.JSONDecodeError, TypeError):
                # TypeError covers non-string parameters (e.g. NaN)
                return None

        model_names = model.best_model['ModelParameters'].apply(extract_model_name)
        selected_model = model_names.iloc[0] if not model_names.empty else None

    # NOTE: with fit_autots_model as defined in this notebook, a non-None model always has
    # type 'AutoTS'; the other two branches are kept as a defensive fallback.
    if model_type == 'constant':
        # Constant prediction model: repeat the training mean
        predictions = np.full(len(test_data), train_data['Value'].mean())
        conf_int = None
        # These predictions are identical by construction, so the flag is True
        # (it was previously set to False).
        constant_prediction = True

    elif model_type == 'AutoTS':
        # Forecast over the length of the test data
        prediction = model.predict()
        predictions = prediction.forecast
        lower_bounds = prediction.lower_forecast[['Value']].values
        upper_bounds = prediction.upper_forecast[['Value']].values
        conf_int = np.hstack((lower_bounds, upper_bounds))

        # Flag forecasts whose values are all the same. `.iloc[0]` selects by position;
        # plain `[0]` on a label-indexed Series relies on a deprecated fallback.
        forecast_values = predictions['Value']
        constant_prediction = bool((forecast_values == forecast_values.iloc[0]).all())
        if constant_prediction:
            conf_int = None

    else:
        # Unknown model type
        model_type = 'Unknown Model'
        selected_model = None
        constant_prediction = False
        predictions = None
        conf_int = None

    # Share of actual values that fall inside the prediction bounds
    if conf_int is not None:
        proportion_in_ci = calculate_percentage_within_ci(test_data['Value'], conf_int)
    else:
        proportion_in_ci = None

    return model_type, selected_model, constant_prediction, test_data['Value'], test_data['Year'], predictions, conf_int, proportion_in_ci


def _unevaluated_autots_result(test_data, model_type):
    """Build the 8-tuple returned by evaluate_AUTOTS_model when no model could be evaluated."""
    # The actual values and years are blanked out in place, matching the original behaviour.
    test_data['Value'] = None
    test_data['Year'] = None
    return model_type, None, False, test_data['Value'], test_data['Year'], None, None, None