In [None]:
# Install TensorFlow into the environment of the running kernel (%pip, unlike !pip, targets the kernel's environment)
%pip install tensorflow

In [111]:
# Importing necessary libraries
import numpy as np
from numpy import concatenate
import pandas as pd
from pandas import read_csv, concat, DataFrame
from math import sqrt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set the Seaborn context to 'talk' and the style to 'white'
sns.set_context('talk')
sns.set_style('white')


In [112]:
# Read the source Excel workbook into a DataFrame, using openpyxl as the reader engine
excel_file = 'data/wlfdata.xlsx'
df = pd.read_excel(excel_file, engine='openpyxl')

In [None]:
# Save `df` to a CSV file; index=False keeps the row index out of the file
csv_file = 'data/wlfdata.csv'
df.to_csv(csv_file, index=False)

In [None]:
# Load the CSV into `dataset`, the frame every later cell works on,
# then show the first rows and the dtype pandas inferred for each column.
dataset = read_csv('data/wlfdata.csv')
print(dataset.head())
print(dataset.dtypes)

In [None]:
# Weekly case curves, one line per year (2018-2022), overlaid on a single axis
fig, ax = plt.subplots(figsize=(8, 8))

# NOTE(review): when several rows share the same (year, epiweek), seaborn's lineplot
# aggregates them (mean by default) rather than summing — confirm this matches the
# "Total" wording in the title.
years = range(2018, 2023)
for year in years:
    sns.lineplot(data=dataset[dataset.year == year],
                 x="epiweek", y='cases', ax=ax, label=year)

# Axis cosmetics do not depend on the year, so they are applied once, after the loop
ax.get_xaxis().set_ticks([])
ax.set_xlabel('Time')
ax.set_ylabel('Cases')
ax.set_title('Total Lassa Cases on a Yearly Basis')
ax.legend(bbox_to_anchor=(1.15, 1), loc="upper right")

# Transparent figure background for the exported image
fig.patch.set_alpha(0)
# bbox_inches='tight' grows the saved canvas to include the legend, which is
# anchored outside the axes and would otherwise risk being cut off.
fig.savefig('cases_by_years.png', dpi=300, bbox_inches='tight')
print("File saved successfully as 'cases_by_years.png'")

In [None]:
# One panel per state: cases by epiweek, with one line (hue) per year.
# Uses the 'state', 'year', 'epiweek' and 'cases' columns of `dataset`.

states = dataset['state'].unique()

# Size the grid from the number of states instead of assuming exactly four:
# a fixed 2x2 grid raises IndexError with more states and leaves blank panels with fewer.
n_cols = 2
n_rows = max(1, -(-len(states) // n_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(12, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()

# Plot the data for each state
for ax, state in zip(flat_axes, states):
    state_data = dataset[dataset['state'] == state]
    sns.lineplot(data=state_data, x='epiweek', y='cases', hue='year', ax=ax)
    ax.set_title(f'State: {state}')
    ax.set_xlabel('Epiweek')
    ax.set_ylabel('Cases')
    ax.legend(title='Year')
    ax.tick_params(axis='x', labelrotation=90)  # Rotate x-axis labels vertically

# Hide any panels left over when the number of states does not fill the grid
for unused_ax in flat_axes[len(states):]:
    unused_ax.set_visible(False)

# Adjust layout and display the plot
plt.tight_layout()
plt.show()

In [None]:
# Use the 'datetime' column as the index.
# Guarded and non in-place so the cell is idempotent: once the column has been
# moved into the index, re-running this cell is a no-op instead of a KeyError.
if 'datetime' in dataset.columns:
    dataset = dataset.set_index('datetime')


In [None]:
# Drop the calendar/location columns that are not used as model features.
# errors='ignore' plus reassignment keeps the cell re-runnable after the columns are already gone.
dataset = dataset.drop(columns=['year', 'epiweek', 'state'], errors='ignore')

In [None]:
# Preview the first rows of `dataset` in its current state
print(dataset.head())

In [None]:
# Heat map of the absolute Spearman correlation between all numeric variables, including the target.
# Non-numeric columns are excluded explicitly: recent pandas versions raise instead of
# silently skipping them inside corr().
numeric_data = dataset.select_dtypes(include='number')
corr = numeric_data.corr(method='spearman').abs()
fig, ax = plt.subplots(figsize=(8, 8))
# Boolean mask covering the upper triangle (diagonal included), so only the lower
# triangle is drawn. Built from ones rather than from `corr` itself because
# np.triu(corr) masks by the truthiness of the values and would leave any
# exactly-zero correlation in the upper triangle visible.
matrix = np.triu(np.ones_like(corr, dtype=bool))
cbar_kws = {"label": "Correlation", "shrink": 1}
heatmap = sns.heatmap(data=corr, linewidths=1, square=False, cmap='Reds', ax=ax,
                      annot=True, annot_kws={"size": 10}, mask=matrix, fmt=".2f",
                      cbar_kws=cbar_kws)
fig.suptitle('Heatmap of Correlation Between Data Features', fontsize=18, y=.94, x=.43);



In [None]:
# Draw each selected column of `dataset` as its own panel, stacked vertically
values = dataset.values
groups = [0, 1, 2, 3, 4, 5, 6]  # positional indices of the columns to plot
n_panels = len(groups)

plt.figure(figsize=(10, 15))
for position, column_idx in enumerate(groups, start=1):
    # subplot positions are 1-based, hence enumerate(..., start=1)
    plt.subplot(n_panels, 1, position)
    plt.plot(values[:, column_idx])
    plt.title(dataset.columns[column_idx], y=0.5, loc='right')
plt.tight_layout()
plt.show()

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.

    Every column of ``data`` is repeated once per lag step (``t-n_in`` ... ``t-1``)
    and once per forecast step (``t`` ... ``t+n_out-1``). Each copy is shifted so
    that one output row holds the past and future values around time ``t``.
    Output columns are ordered lag blocks first (oldest lag first), then forecast
    blocks, and are named ``<column>(t-k)``, ``<column>(t)`` or ``<column>(t+k)``.

    Arguments:
        data: Observations as a list, NumPy array or Pandas DataFrame.
            One-dimensional input is treated as a single variable.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values
            (the rows at either end that the shifting leaves incomplete).

    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    # Build the frame first and read the width from it: a plain list (or any 1-D
    # input) has no usable `.shape[1]`, but DataFrame() normalises it to columns.
    df = pd.DataFrame(data)
    n_vars = df.shape[1]
    cols, names = [], []

    # input sequence (t-n, ... t-1)
    for lag in range(n_in, 0, -1):
        cols.append(df.shift(lag))
        names += [(str(df.columns[j]) + '(t-%d)' % lag) for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n_out-1)
    for step in range(0, n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += [(str(df.columns[j]) + '(t)') for j in range(n_vars)]
        else:
            names += [(str(df.columns[j]) + '(t+%d)' % step) for j in range(n_vars)]

    agg = pd.concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg = agg.dropna()
    return agg


In [None]:
# Prepare data for LSTM
CATEGORICAL_COL = 6  # positional index of the column that gets label-encoded

# Work on an explicit copy: `dataset.values` can be a view of the frame's own data
# (read-only under pandas copy-on-write), so assigning into it would either mutate
# `dataset` behind the reader's back or raise.
values = dataset.to_numpy(copy=True)

# Replace the categorical column with integer codes so the whole matrix can be cast to float
encoder = LabelEncoder()
values[:, CATEGORICAL_COL] = encoder.fit_transform(values[:, CATEGORICAL_COL])
values = values.astype('float32')

# Scale every column to [0, 1].
# NOTE(review): the scaler is fitted on all rows, including the ones that later land
# in the test split, so test-set min/max leak into training — consider fitting on the
# training rows only.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)

# Frame as supervised learning: one lag step (t-1) as input, one step (t) as output
reframed = series_to_supervised(scaled, 1, 1)

print(reframed.head())

In [None]:
# Split into train and test sets (chronological 80/20 split, no shuffling)
values = reframed.values
n_train_weeks = int(len(values) * 0.8)
train = values[:n_train_weeks, :]
test = values[n_train_weeks:, :]

# `reframed` was built with one lag and one forecast step, so its columns are laid out as
# [every variable at t-1 | every variable at t].
# Inputs are the lagged block only; the target is variable 0 at time t, which is the
# column the inverse-scaling step of this notebook treats as the prediction.
# Taking "all columns but the last" as X and the last column as y would feed time-t
# values of the other variables into the model and train it on variable 6 instead.
# NOTE(review): confirm that column 0 of the scaled matrix is the intended target.
n_features = scaled.shape[1]
target_col = n_features  # first column of the time-t block
train_X, train_y = train[:, :n_features], train[:, target_col]
test_X, test_y = test[:, :n_features], test[:, target_col]

# Reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print("Shape of train_X:", train_X.shape)
print("Shape of train_y:", train_y.shape)
print("Shape of test_X:", test_X.shape)
print("Shape of test_y:", test_y.shape)

In [None]:
# Seed NumPy and TensorFlow so weight initialisation, and with it the loss curves,
# are repeatable between runs (as far as the backend allows).
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Design network: a single 50-unit LSTM layer feeding one linear output unit
model = Sequential()
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

# Fit network. shuffle=False keeps the samples in their original order.
# NOTE(review): the test split doubles as validation data here, so 'val_loss' is not
# an independent estimate if it is used to pick epochs or hyper-parameters.
history = model.fit(train_X, train_y, epochs=100, batch_size=52,
                    validation_data=(test_X, test_y), verbose=2, shuffle=False)

# Plot history: training vs. validation loss per epoch
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss (MAE)')
ax.legend()
plt.show()

In [None]:
# Make a prediction.
# A failure here is allowed to propagate: catching it and carrying on would leave
# `yhat` undefined (or stale from an earlier run) for everything below.
print("Shape of test_X before prediction:", test_X.shape)
yhat = model.predict(test_X)

# Collapse the single-timestep axis so test_X is 2D [samples, features] again
test_X_flat = test_X.reshape((test_X.shape[0], test_X.shape[2]))

print("Shape of yhat:", yhat.shape)
print("Shape of test_X after reshape:", test_X_flat.shape)

# The scaler was fitted on a matrix with `n_features` columns, so inverse_transform
# needs that many columns back. The quantity of interest goes into column 0 and
# columns 1.. are filled with the matching (t-1) features. MinMax scaling is
# per-column, so the filler never affects the column-0 result; it is aligned
# column-for-column only to keep the intermediate array meaningful.
n_features = scaled.shape[1]
filler = test_X_flat[:, 1:n_features]


def _invert_column0(scaled_column):
    """Return `scaled_column` mapped back to the original units of scaler column 0."""
    padded = np.zeros((len(scaled_column), n_features))
    padded[:, 0] = scaled_column
    padded[:, 1:] = filler
    return scaler.inverse_transform(padded)[:, 0]


# NOTE(review): both arrays are un-scaled with column 0's min/max, which is only
# meaningful if yhat/test_y refer to column 0 of the scaled matrix — confirm against
# how the target column is selected in the train/test split.
# Invert scaling for forecast
inv_yhat = _invert_column0(yhat[:, 0])

# Invert scaling for actual
inv_y = _invert_column0(test_y)

# Calculate RMSE in the original units
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
