In [1]:
import pandas as pd

df_2019 = pd.read_csv('2019.csv')
df_2020 = pd.read_csv('2020.csv')
df_2021 = pd.read_csv('2021.csv')
df_2022 = pd.read_csv('2022.csv')
df_2023 = pd.read_csv('2023.csv')

In [24]:
df_2023

Unnamed: 0,Província,Distrito,W1,W2,W3,W4,W5,W6,W7,W8,...,W37,W38,W39,W40,W41,W42,W43,W44,W45,W46
0,Niassa,Lago,11,24,22,28,35,56,58.0,47,...,0,0,0,0,0,0,0,0,0,0
1,Niassa,Lichinga,105,185,281,195,161,153,153.0,100,...,0,0,0,0,0,0,0,0,0,0
2,Niassa,Mecanhelas,42,58,67,31,34,28,56.0,27,...,0,0,0,0,0,0,0,0,0,0
3,Niassa,Sanga,31,13,17,17,23,16,5.0,3,...,0,0,0,0,0,0,0,0,0,0
4,Niassa,Mandimba*,0,11,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,Nampula,Meconta,0,0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
69,Nampula,Erati,0,0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
70,Maputo Cidade,KaTembe,0,0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
71,Maputo Cidade,KaMubukwana,0,0,0,0,0,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
df_train = pd.concat([df_2019, df_2020, df_2021, df_2022], ignore_index=True)
df_train.fillna(method='ffill', inplace=True)
df_2023.fillna(method='ffill', inplace=True)

In [7]:
# Assuming we have corrected the missing weeks issue earlier
X_train = df_train.drop(columns=['Província', 'Distrito'])
X_test = df_2023.drop(columns=['Província', 'Distrito'])

In [9]:
# Identify missing week columns that are in the training data but not in the testing data
missing_weeks = [col for col in X_train.columns if col not in X_test.columns]

# Add these missing week columns to the testing dataset with a default value of 0
for week in missing_weeks:
    X_test[week] = 0

# Ensure the order of columns in the testing dataset matches that of the training dataset
X_test = X_test[X_train.columns]

# Now proceed to define targets and train the model
y_train = X_train.sum(axis=1)  # This assumes a regression task aiming to predict the sum of weekly data
y_test = X_test.sum(axis=1)  # Ensure this matches the actual 2023 data structure if summing weekly data

from sklearn.linear_model import LinearRegression

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Ensure the code correction above is applied before proceeding to model training and prediction

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
predictions = model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, predictions)
rmse = mean_squared_error(y_test, predictions, squared=False)
r2 = r2_score(y_test, predictions)

In [11]:
missing_weeks = set(X_train.columns) - set(X_test.columns)
for week in missing_weeks:
    X_test[week] = 0
X_test = X_test[X_train.columns]


In [13]:
mae

7.1945857559999835

In [14]:
rmse

22.559556986924665

In [15]:
r2

0.9997631685528082

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

In [17]:
def load_and_preprocess_data():
    # Adjust the paths according to your files' locations
    df_2019 = pd.read_csv('2019.csv')
    df_2020 = pd.read_csv('2020.csv')
    df_2021 = pd.read_csv('2021.csv')
    df_2022 = pd.read_csv('2022.csv')
    df_2023 = pd.read_csv('2023.csv')
    
    # Combine training data and handle missing values if any
    df_train = pd.concat([df_2019, df_2020, df_2021, df_2022]).fillna(method='ffill')
    df_test = df_2023.fillna(method='ffill')
    
    # Assuming the weekly data starts from the 3rd column
    X_train = df_train.iloc[:, 2:]
    y_train = X_train.sum(axis=1)
    X_test = df_test.iloc[:, 2:]
    y_test = X_test.sum(axis=1)
    
    return X_train, y_train, X_test, y_test

In [18]:
def train_model(X_train, y_train):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model

In [19]:
def predict_and_evaluate(model, X_test, y_test):
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = mean_squared_error(y_test, predictions, squared=False)
    r2 = r2_score(y_test, predictions)
    return predictions, mae, rmse, r2

In [20]:
def plot_actual_vs_predicted(y_test, predictions):
    plt.figure(figsize=(10, 6))
    plt.plot(y_test.values, label='Actual', alpha=0.7)
    plt.plot(predictions, label='Predicted', alpha=0.7)
    plt.title('Actual vs. Predicted Totals for 2023')
    plt.xlabel('Index')
    plt.ylabel('Total Count')
    plt.legend()
    plt.show()

In [21]:
def generate_model_report(mae, rmse, r2):
    print(f"Model Report:\nMAE: {mae}\nRMSE: {rmse}\nR² Score: {r2}")

In [22]:
def save_results_to_csv(predictions, filename='result_2023.csv'):
    result_df = pd.DataFrame(predictions, columns=['Predicted Total'])
    result_df.to_csv(filename, index=False)
    print(f"Saved the predictions to {filename}")

In [23]:
X_train, y_train, X_test, y_test = load_and_preprocess_data()
model = train_model(X_train, y_train)
predictions, mae, rmse, r2 = predict_and_evaluate(model, X_test, y_test)
plot_actual_vs_predicted(y_test, predictions)
generate_model_report(mae, rmse, r2)
save_results_to_csv(predictions)

ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 53 is different from 46)