In [47]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pmdarima import auto_arima
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error, mean_squared_error
import random
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
from matplotlib.backends.backend_pdf import PdfPages
from statsmodels.tsa.arima.model import ARIMAResults
from pmdarima.arima import AutoARIMA
from statsmodels.tsa.seasonal import STL
from autots import AutoTS
from IPython.display import Image
import sys
import logging
from autots.models.model_list import model_lists
import json
from contextlib import contextmanager
import os
from helpers.constants import START_FORECASTING_YEAR, FORECASTING_LENGTH
from helpers.auto_arima_modelling import forecast_arima
from helpers.autots_modelling import forecast_auto_ts

# 1. Forecasting values

<b> Important Assumption:</b>

1. For models where testing was impossible due to:
- Insufficient data for testing.
- Insufficient data for training.
  
2. For models where the prediction was constant (due to errors in forecasting):

It is assumed that we couldn't test a model and therefore a models cannot be chosen for forecasting. This assumption persists even if there is now sufficient training data available (since we are not partitioning for testing), or if the prediction would not have been constant under normal forecasting conditions.

<b> Action: </b>

In such cases, the LAST AVAILABLE VALUE will be chosen for forecasting.

### Read all data

In [8]:
#Read FAO data

# Define the path to the CSV file
csv_file = 'data/AOSTAT_animal_2022_cleaned.csv'

# Load the CSV file into a DataFrame
data_df = pd.read_csv(csv_file)

# Count unique combinations of 'Area' and 'Item'
unique_combinations = data_df.groupby(['Area', 'Item']).size().reset_index(name='Count')

# Display the unique combinations and their counts
print(unique_combinations)

             Area          Item  Count
0     Afghanistan         Asses     62
1     Afghanistan        Camels     62
2     Afghanistan        Cattle     62
3     Afghanistan         Goats     62
4     Afghanistan        Horses     62
...           ...           ...    ...
1163     Zimbabwe        Cattle     62
1164     Zimbabwe         Goats     62
1165     Zimbabwe        Horses     62
1166     Zimbabwe         Sheep     62
1167     Zimbabwe  Swine / pigs     62

[1168 rows x 3 columns]


In [9]:
# Read all excel outputfiles

# Assuming your CSV file is in the current directory or you provide the full path
excel_file = 'ARIMA_evaluation_results_all.xlsx'
#Read the CSV file into a DataFrame
df = pd.read_excel(excel_file)

#obtain only the results where it could be modelled with an AutoARIMA and also where the data had an error like not enough data, not training data...
df_filtered = df[((df['Flag results'] == 'AutoARIMA') & (df['Predictions'] == 'MODEL')) | (df['Flag results'] != 'AutoARIMA')]

arima_results_df = df_filtered[['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']]

# Count the occurrences of each unique value in 'Flag results'
flag_results_counts = arima_results_df['Flag results'].value_counts()

# Display the counts
print(flag_results_counts)


Flag results
AutoARIMA     599
NoTestData     48
No Data        45
constant        1
Name: count, dtype: int64


In [10]:

# Assuming your CSV file is in the current directory or you provide the full path
excel_file = 'AUTOTS_evaluation_results_all.xlsx'
#Read the CSV file into a DataFrame
df = pd.read_excel(excel_file)
df_filtered = df[((df['Flag results'] == 'AutoTS') & (df['Predictions'] == 'MODEL'))]

autoTS_results_df = df_filtered[['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']]

# Count the occurrences of each unique value in 'Flag results'
flag_results_counts = autoTS_results_df['Flag results'].value_counts()

# Display the counts
print(flag_results_counts)


Flag results
AutoTS    190
Name: count, dtype: int64


In [11]:

# Assuming your CSV file is in the current directory or you provide the full path
excel_file = 'AUTOTS_evaluation_results_ensemble_all.xlsx'
#Read the CSV file into a DataFrame
results_autots_ensemble_df = pd.read_excel(excel_file)

#for the autoTS-with esemble we keep all the results, the ones that failed and nto as we are not going to try more models
autoTS_esemble_results_df = results_autots_ensemble_df[['Country', 'Animal Type', 'Flag results', 'Model_type', 'Flag', 'Predictions']]

# Count the occurrences of each unique value in 'Flag results'
flag_results_counts = autoTS_esemble_results_df['Flag results'].value_counts()

# Display the counts
print(flag_results_counts)


Flag results
AutoTS-ensemble    281
Unknown Model        6
Name: count, dtype: int64


### Merge all result dataframes

In [12]:
# Concatenate the DataFrames
results_df= pd.concat([autoTS_esemble_results_df, autoTS_results_df, arima_results_df], ignore_index=True)
# Specify the path where you want to save the Excel file
excel_file = 'final_results_df_concatenated.xlsx'

# Save the DataFrame to Excel
results_df.to_excel(excel_file, index=False)
results_df

Unnamed: 0,Country,Animal Type,Flag results,Model_type,Flag,Predictions
0,Benin,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
1,Kyrgyzstan,Horses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
2,Greece,Goats,AutoTS-ensemble,,AUTOTS-InsideCINotMet,LASTVALUE
3,Togo,Asses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
4,Honduras,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
...,...,...,...,...,...,...
1165,Yugoslav SFR,Buffalo,NoTestData,,NoTestData,LASTVALUE
1166,Yugoslav SFR,Cattle,NoTestData,,NoTestData,LASTVALUE
1167,Yugoslav SFR,Horses,NoTestData,,NoTestData,LASTVALUE
1168,Yugoslav SFR,Sheep,NoTestData,,NoTestData,LASTVALUE


#### Count the porportion of models  where we couldn't get a "model"prediction

In [13]:
# Count the occurrences of each unique value in 'Predictions'
predictions_counts = results_df['Predictions'].value_counts()

# Calculate the percentage of each unique value
percentage_counts = (predictions_counts / len(results_df)) * 100

# Display the percentage counts
print(percentage_counts)

Predictions
MODEL        72.051282
LASTVALUE    27.948718
Name: count, dtype: float64


In [14]:
# Count the occurrences of each unique value in 'Predictions'
predictions_counts = results_df['Flag results'].value_counts()

# Calculate the percentage of each unique value
percentage_counts = (predictions_counts / len(results_df)) * 100

# Display the percentage counts
print(percentage_counts)

Flag results
AutoARIMA          51.196581
AutoTS-ensemble    24.017094
AutoTS             16.239316
NoTestData          4.102564
No Data             3.846154
Unknown Model       0.512821
constant            0.085470
Name: count, dtype: float64


In [15]:
# Count the occurrences of each unique value in 'Predictions'
predictions_counts = results_df['Flag'].value_counts()

# Calculate the percentage of each unique value
percentage_counts = (predictions_counts / len(results_df)) * 100

# Display the percentage counts
print(percentage_counts)

Flag
AutoARIMA                51.196581
AUTOTS-InsideCINotMet    15.641026
AutoTS-ARIMA              8.376068
AutoTS-ETS                7.350427
AutoTS-BestN              4.700855
NoTestData                4.102564
No Data                   3.846154
AutoTS-nan                3.675214
Unknown Model-nan         0.512821
AutoTS-FBProphet          0.512821
constant                  0.085470
Name: count, dtype: float64


### Perform forecasting

In [16]:

# Split into two DataFrames based on 'Predictions'
unique_combinations_modelling = results_df[results_df['Predictions'] == 'MODEL'][['Country', 'Animal Type']].drop_duplicates()
unique_combinations_last_value = results_df[results_df['Predictions'] == 'LASTVALUE'][['Country', 'Animal Type']].drop_duplicates()

In [38]:
results_df
results_filtered_df = results_df[(results_df['Country'] == 'Canada') & (results_df['Animal Type'] == 'Sheep')]
results_df



Unnamed: 0,Country,Animal Type,Flag results,Model_type,Flag,Predictions
0,Benin,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
1,Kyrgyzstan,Horses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
2,Greece,Goats,AutoTS-ensemble,,AUTOTS-InsideCINotMet,LASTVALUE
3,Togo,Asses,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
4,Honduras,Swine / pigs,AutoTS-ensemble,AutoTS BestN,AUTOTS-InsideCINotMet,LASTVALUE
...,...,...,...,...,...,...
1165,Yugoslav SFR,Buffalo,NoTestData,,NoTestData,LASTVALUE
1166,Yugoslav SFR,Cattle,NoTestData,,NoTestData,LASTVALUE
1167,Yugoslav SFR,Horses,NoTestData,,NoTestData,LASTVALUE
1168,Yugoslav SFR,Sheep,NoTestData,,NoTestData,LASTVALUE


In [56]:
# Define the number of combinations to select
num_combinations = 4
unique_combinations = results_df[['Country', 'Animal Type']].drop_duplicates()

# Randomly select a subset of unique country-animal combinations
random_combinations = unique_combinations.sample(num_combinations, replace=False, random_state=123)
evaluation_results = {}

# Initialize a dictionary to store error information
error_log = {}

# Initialize an empty DataFrame to store forecasted results
#forecast_df = pd.DataFrame(columns=['Country', 'Animal Type', 'Year', 'Forecasted Value', 'Lower CI', 'Upper CI', 'Model Type', 'Prediction', 'Flag'])
forecast_data = []

# Iterate over unique_combinations
with PdfPages('forecasted_values.pdf') as pdf:
    for index, row in random_combinations.iterrows():
        title_color = 'red'
        country = row['Country']
        animal_type = row['Animal Type']
        
        # Filter the DataFrame for the specific country and animal type combination
        filtered_df = data_df[(data_df['Area'] == country) & (data_df['Item'] == animal_type)]
        results_filtered_df = results_df[(results_df['Country'] == country) & (results_df['Animal Type'] == animal_type)]
        forecast_years = range(START_FORECASTING_YEAR, START_FORECASTING_YEAR + FORECASTING_LENGTH)

        
        try:     
            if results_filtered_df['Predictions'].item() == 'MODEL':
                 
                # Evaluate models based on model_type
                if 'AutoTS' in results_filtered_df['Model_type'].item():
                    print('AutoTS')
                    
                    if 'AutoTS-ensemble' in results_filtered_df['Model_type'].item():
                        model_type, selected_model, constant_prediction, proportion_in_ci = forecast_auto_ts(data_df, country, animal_type, ensemble='simple')
                        model_type += '-ensemble'  # Append '-ensemble' to model_type
                    else:
                        model_name = results_filtered_df['Model_type'].item().split(' ')[1]
                        model_type, selected_model, constant_prediction, proportion_in_ci = forecast_auto_ts(data_df, country, animal_type, model_list=[model_name], ensemble=None)
                    
                elif 'AutoARIMA' in results_filtered_df['Model_type'].item():
                    print('AutoARIMA')
                    model_type, selected_model, constant_prediction, predictions, conf_int = forecast_arima(data_df, country, animal_type)
                else:
                    raise ValueError("No valid model type found.")
                    
            elif results_filtered_df['Predictions'].item() == 'LASTVALUE':
                last_value = filtered_df.loc[filtered_df['Year'].idxmax(), 'Value']  # Get the last value available
                predictions = [last_value] * FORECASTING_LENGTH
                conf_int = None
                model_type = None
                selected_model = results_filtered_df['Flag results'].item()
                constant_prediction = True

            # Prepare data for forecasted values from 2024 to 2034
           
            forecast_values = predictions
            if conf_int is not None:
                lower_ci = conf_int[:, 0]
                upper_ci = conf_int[:, 1]
            else:
                lower_ci = [None] * FORECASTING_LENGTH  # Set None for lower CI
                upper_ci = [None] * FORECASTING_LENGTH  # Set None for upper CI

            # Append forecasted values to forecast_data
            for year, forecast_val, lc, uc in zip(forecast_years, forecast_values, lower_ci, upper_ci):
                forecast_data.append({
                    'Country': country,
                    'Animal Type': animal_type,
                    'Year': year,
                    'Forecasted Value': forecast_val,
                    'Lower CI': lc,
                    'Upper CI': uc,
                    'Model Type': selected_model,
                    'Prediction': 'MODEL',
                    'Flag': model_type,
                })
        
            # Plot actual vs. predicted values (example plot)
            plt.figure(figsize=(8, 4))
            plt.plot(filtered_df['Year'], filtered_df['Value'], label='Actual Data')

            if predictions is not None:
                plt.plot(forecast_years, forecast_values, label='Forecasted Data')
                # Plot confidence intervals ensuring first that they are not None
                if conf_int is not None and not constant_prediction:
                    title_color = 'green'
                    plt.fill_between(forecast_years, conf_int[:, 0], conf_int[:, 1], color='gray', alpha=0.2, label='Confidence Interval')

            plt.xlabel('Year')
            plt.ylabel('Population')
            title_text_obj = plt.title(f'{selected_model} prediction for {country} - {animal_type}')
            title_text_obj.set_color(title_color)
            plt.legend()
            plt.grid(True)

            # Save the current plot to the PDF
            pdf.savefig()
            plt.close()
                
        except Exception as e:
            # Log the error and continue to the next iteration
            error_log[(country, animal_type)] = str(e)
             # Plot actual vs. predicted values
            plt.figure(figsize=(8, 4))
            plt.plot(filtered_df['Year'], filtered_df['Value'], label='Actual Data')
            plt.xlabel('Year')
            plt.ylabel('Population')
            title_text_obj = plt.title(f'{selected_model} prediction for {country} - {animal_type}')
            title_text_obj.set_color(title_color)
            plt.legend()
            plt.grid(True)

            # Save the current plot to the PDF
            pdf.savefig()
            plt.close()
            continue
# Convert forecast_data list to DataFrame
forecast_df = pd.DataFrame(forecast_data)
# After the loop, you can inspect the error_log if needed
print("\nErrors encountered:")
for key, error_message in error_log.items():
    print(f"For {key}: {error_message}")



AutoARIMA
AutoTS
Using 3 cpus for n_jobs.
AutoARIMA

Errors encountered:
For ('Eswatini', 'Horses'): Model String 'BestN' not a recognized model type


In [58]:
# Specify the file path where you want to save the Excel file
file_path = 'forecast_data.xlsx'

# Save the DataFrame to Excel
forecast_df.to_excel(file_path, index=False)
forecast_df

Unnamed: 0,Country,Animal Type,Year,Forecasted Value,Lower CI,Upper CI,Model Type,Prediction,Flag
0,Oman,Cattle,2024,436545.1,419970.948568,453119.203605,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
1,Oman,Cattle,2025,442258.4,416265.4031,468251.309535,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
2,Oman,Cattle,2026,447971.6,415160.578489,480782.694607,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
3,Oman,Cattle,2027,453684.9,415246.687827,492123.14573,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
4,Oman,Cattle,2028,459398.2,416057.346085,502739.047934,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
5,Oman,Cattle,2029,465111.5,417368.820497,512854.133983,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
6,Oman,Cattle,2030,470824.8,419053.209427,522596.305514,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
7,Oman,Cattle,2031,476538.0,421029.253704,532046.821698,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
8,Oman,Cattle,2032,482251.3,423241.513536,541261.122327,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
9,Oman,Cattle,2033,487964.6,425650.162204,550279.03412,"ARIMA(0,1,1)(0,0,0)[0] intercept",MODEL,AutoARIMA
