In [41]:
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt
import warnings

warnings.filterwarnings("ignore")

In [35]:
data_2019 = pd.read_csv('2019.csv')
data_2020 = pd.read_csv('2020.csv')
data_2021 = pd.read_csv('2021.csv')
data_2022 = pd.read_csv('2022.csv')
data_2023 = pd.read_csv('2023.csv')

In [50]:
# Concatenate all years' data into one DataFrame
all_data = pd.concat([data_2019, data_2020, data_2021, data_2022])

# Data preprocessing: Remove null values and set index
all_data.dropna(inplace=True)
all_data.set_index(['Província', 'Distrito'], inplace=True)

In [53]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 159 entries, ('Niassa', 'Lago') to ('Nampula', 'Mogovolas')
Data columns (total 53 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   W1      159 non-null    int64  
 1   W2      159 non-null    int64  
 2   W3      159 non-null    int64  
 3   W4      159 non-null    int64  
 4   W5      159 non-null    int64  
 5   W6      159 non-null    int64  
 6   W7      159 non-null    int64  
 7   W8      159 non-null    int64  
 8   W9      159 non-null    int64  
 9   W10     159 non-null    int64  
 10  W11     159 non-null    int64  
 11  W12     159 non-null    float64
 12  W13     159 non-null    int64  
 13  W14     159 non-null    int64  
 14  W15     159 non-null    int64  
 15  W16     159 non-null    int64  
 16  W17     159 non-null    int64  
 17  W18     159 non-null    int64  
 18  W19     159 non-null    int64  
 19  W20     159 non-null    int64  
 20  W21     159 non-null    int64  
 21  W

In [51]:
# Function to evaluate ARIMA model on a given province and district
def evaluate_arima_model(train_data, test_data):
    # Define the model
    model = ARIMA(train_data, order=(5,1,0))
    # Fit the model
    model_fit = model.fit()
    # Make predictions
    predictions = model_fit.forecast(steps=len(test_data))[0]
    # Calculate RMSE
    rmse = sqrt(mean_squared_error(test_data, predictions))
    return rmse

In [52]:
# Iterate over each province and district to train and test ARIMA model
total_rmse = 0
num_records = 0
for (province, district), group in all_data.groupby(level=[0, 1]):
    # Aggregate the data by taking the mean across the weeks
    train_data_aggregated = group.iloc[:, 2:-1].mean(axis=1)  # Exclude the first two columns (Province and District)
    test_data_aggregated = group.iloc[:, -1]

    # Convert to univariate time series (pandas Series)
    train_data_aggregated = train_data_aggregated.squeeze()
    test_data_aggregated = test_data_aggregated.squeeze()

    try:
        # Evaluate ARIMA model
        rmse = evaluate_arima_model(train_data_aggregated, test_data_aggregated)
        print(f'Province: {province}, District: {district}, RMSE: {rmse}')
        
        total_rmse += rmse
        num_records += 1
    except Exception as e:
        print(f"Error occurred for Province: {province}, District: {district}: {str(e)}")


Error occurred for Province: Cabo Delgado, District: Ancuabe: too many indices for array
Error occurred for Province: Cabo Delgado, District: Chiure: too many indices for array
Error occurred for Province: Cabo Delgado, District: Ibo: too many indices for array
Error occurred for Province: Cabo Delgado, District: Macomia: object of type 'numpy.float64' has no len()
Error occurred for Province: Cabo Delgado, District: Mecufi: object of type 'numpy.float64' has no len()
Error occurred for Province: Cabo Delgado, District: Meluco: too many indices for array
Error occurred for Province: Cabo Delgado, District: Metuge: too many indices for array
Error occurred for Province: Cabo Delgado, District: Mocimboa da Praia: too many indices for array
Error occurred for Province: Cabo Delgado, District: Montepuez: too many indices for array
Error occurred for Province: Cabo Delgado, District: Mueda: object of type 'numpy.float64' has no len()
Error occurred for Province: Cabo Delgado, District: Namu