In [39]:
# This notebook builds a machine learning model for kala-azar (KA) prediction.
# The KA data were collected from the Lankien Hospital (MSF).
# Climate variables were collected from the climate data stores.

In [None]:
# Load libraries 

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import textwrap

In [3]:

# Location of the KA line-list workbook.
# NOTE(review): this is an absolute, machine-specific path; point it at a
# project-relative data directory before sharing the notebook.
file_path = "/Users/berhe/Desktop/ka-project/KA_linelist/ll_ka.xlsx"

# Open the workbook in a context manager so the file handle is released as
# soon as the sheet has been read (a bare pd.ExcelFile would stay open).
with pd.ExcelFile(file_path) as xls:
    # Load the line list from the sheet named "Sheet1"
    df = pd.read_excel(xls, sheet_name="Sheet1")

# Print column dtypes and non-null counts. info() prints and returns None,
# so it is called as its own statement instead of being packed into a tuple.
df.info()

# Preview the first rows as the cell's rich output
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5425 entries, 0 to 5424
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Patient No.  5425 non-null   object        
 1   Lab No.      5425 non-null   object        
 2   date         5425 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 127.3+ KB


(None,
   Patient No. Lab No.       date
 0   LA1281/15   K1338 2015-11-02
 1   LA1306/15   K1409 2015-11-13
 2   LA1323/15   K1452 2015-11-23
 3   LA1326/15   K1460 2015-11-25
 4   LA1338/15   K1493 2015-12-01)

In [None]:

# Clean the hospital KA line list and aggregate it to weekly case counts

# Make sure the 'date' column is datetime (a no-op when it already is)
df["date"] = pd.to_datetime(df["date"])

# Count line-list rows (presumably one row per case - TODO confirm) in weekly
# bins. "W-MON" is the canonical spelling of the Monday-anchored weekly alias:
# each bin is labelled with the Monday that closes it, and weeks without any
# rows are kept with a count of 0.
df_weekly = df.resample("W-MON", on="date").size().reset_index(name="case_count")

# Every row must land in exactly one weekly bin; a mismatch means rows were
# silently dropped during aggregation (e.g. because of missing dates).
assert df_weekly["case_count"].sum() == len(df), (
    "weekly counts do not add up to the number of line-list rows"
)

# Display the first rows of the aggregated dataset
df_weekly.head()


In [None]:
# Plot the weekly kala-azar (leishmaniasis) case counts

# Derive the year range shown in the title from the data itself, so the title
# cannot drift away from what is actually plotted.
first_year = df_weekly["date"].min().year
last_year = df_weekly["date"].max().year

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(
    df_weekly["date"],
    df_weekly["case_count"],
    marker="o",
    linestyle="-",
    color="b",
    label="Weekly Cases",
)

# Formatting
ax.set_xlabel("Date")
ax.set_ylabel("Number of Cases")
ax.set_title(f"Weekly Leishmaniasis Cases in Lankien, South Sudan, {first_year}-{last_year}")
ax.grid(True, linestyle="--", alpha=0.6)
ax.tick_params(axis="x", labelrotation=45)
ax.legend()

# Show the plot
plt.show()


In [None]:
import statsmodels.api as sm

# Move the weekly 'date' column into the index. The guard keeps this cell
# idempotent: on a re-run the column is already the index and is left alone.
if "date" in df_weekly.columns:
    df_weekly = df_weekly.assign(date=pd.to_datetime(df_weekly["date"])).set_index("date")

# Pin an explicit weekly frequency. The anchor has to match the Monday-anchored
# weekly aggregation: a plain "W" means "W-SUN" and would reindex the series
# onto Sundays, turning every value into NaN.
df_weekly = df_weekly.asfreq("W-MON")

# Fill any gaps by linear interpolation. The result is assigned back instead of
# calling interpolate(inplace=True) on a column selection, which is chained
# in-place assignment and may act on a temporary copy.
df_weekly["case_count"] = df_weekly["case_count"].interpolate(method="linear")

# Additive seasonal decomposition using moving averages;
# period=52 assumes yearly seasonality in the weekly series.
decomposition = sm.tsa.seasonal_decompose(df_weekly["case_count"], model="additive", period=52)

# decomposition.plot() creates and returns its own figure, so the size and the
# title are applied to that figure rather than to a separately opened one.
fig = decomposition.plot()
fig.set_size_inches(12, 8)
fig.suptitle("Seasonal Decomposition of Weekly Leishmaniasis Cases", fontsize=14)
fig.tight_layout(rect=(0, 0, 1, 0.96))  # leave headroom for the suptitle
plt.show()


In [None]:
# Approximate the trend and seasonal components with rolling averages
# (a fallback that does not need statsmodels)

# 52-week centred rolling mean: approximate yearly trend
df_weekly["trend"] = df_weekly["case_count"].rolling(window=52, center=True, min_periods=1).mean()
df_weekly["detrended"] = df_weekly["case_count"] - df_weekly["trend"]
# 4-week centred rolling mean of the detrended series: rough seasonal component
df_weekly["seasonal"] = df_weekly["detrended"].rolling(window=4, center=True, min_periods=1).mean()

# The week dates are either still a 'date' column (straight after the weekly
# aggregation) or already the index (after re-indexing by date); accept both.
# There is no "Date of admission" column in this frame.
week_dates = df_weekly["date"] if "date" in df_weekly.columns else df_weekly.index

# One panel per component: (column, legend label, line colour, panel title)
panels = [
    ("case_count", "Original Data", "blue", "Weekly Leishmaniasis Cases"),
    ("trend", "Trend", "red", "Trend Component"),
    ("seasonal", "Seasonality", "green", "Seasonal Component (Approximate)"),
]

# Plot the original data, trend, and seasonality
fig, axes = plt.subplots(3, 1, figsize=(12, 8))
for ax, (column, label, color, title) in zip(axes, panels):
    ax.plot(week_dates, df_weekly[column], label=label, color=color)
    ax.set_title(title)
    ax.legend()

fig.tight_layout()
plt.show()


In [None]:
# The red trend line shows long-term fluctuations in leishmaniasis cases.
# There are periods of increase and decline, suggesting potential long-term drivers (e.g., environmental changes, population movement, interventions).


# The green seasonal component suggests recurring short-term patterns.
# If these patterns align with specific months or seasons, it indicates climate or environmental factors might be influencing case counts.
# However, because this decomposition is only approximate, the pattern should be confirmed with statistical methods such as autocorrelation analysis or a Fourier transform.

In [None]:
from statsmodels.graphics.tsaplots import plot_acf

# Autocorrelation plot. plot_acf opens a new figure unless it is handed an
# Axes, so the axes are created first and passed in; otherwise the requested
# figure size is ignored and an empty figure is left behind.
fig, ax = plt.subplots(figsize=(12, 5))
plot_acf(df_weekly["case_count"], lags=52, alpha=0.05, ax=ax)  # Checking up to one year of weekly lags
ax.set_title("Autocorrelation of Weekly Leishmaniasis Cases")
plt.show()

# Fourier Transform to detect dominant frequencies (seasonal patterns)
case_counts = df_weekly["case_count"]
fft_vals = np.fft.fft(case_counts - case_counts.mean())  # Remove mean so the zero-frequency term does not dominate
fft_freqs = np.fft.fftfreq(len(df_weekly), d=1)  # Frequencies in cycles per week

# Keep the positive, non-zero frequencies and convert them to periods (1/freq).
# Power is the squared magnitude of the FFT coefficients, so that the plotted
# quantity matches the "Power" axis label.
half = len(fft_freqs) // 2
periods = 1 / fft_freqs[1:half]
power = np.abs(fft_vals[1:half]) ** 2

# Plot the power spectrum
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(periods, power)
ax.set_xlabel("Period (weeks)")
ax.set_ylabel("Power")
ax.set_title("Fourier Transform: Detecting Seasonal Patterns in Weekly Cases")
ax.grid(True)
plt.show()


In [None]:
# Results of Autocorrelation and Fourier Analysis
# Autocorrelation (ACF Plot)
  # Shows significant positive autocorrelation at lags around 52 weeks, suggesting a yearly seasonal cycle.
  # Peaks at 52 weeks indicate that case numbers tend to repeat on a yearly basis, hinting at climate-driven seasonality.

# Fourier Transform (Frequency Analysis)

  # The power spectrum shows a strong peak around 52 weeks, further confirming annual seasonality.
  # Other smaller peaks may indicate sub-seasonal variations, possibly linked to short-term environmental fluctuations (e.g., rainy/dry seasons).
  
#Conclusion
  # ✅ There is clear evidence of a yearly seasonal pattern in leishmaniasis cases.
  # 🌍 This strongly suggests that climate factors (e.g., rainfall, humidity, temperature) may play a key role in transmission.

In [None]:
# Manual autocorrelation function (ACF) without statsmodels
def autocorrelation(series, lag):
    """Return the lag-``lag`` autocorrelation of ``series``.

    The value is the Pearson correlation between the series and a copy of
    itself shifted by ``lag`` positions.
    """
    shifted = series.shift(lag)
    return series.corr(shifted)

# Compute autocorrelation for lags up to 52 weeks (1 year)
lags = np.arange(1, 53)
acf_values = [autocorrelation(df_weekly["case_count"], lag) for lag in lags]

# Approximate 95% significance bound for a white-noise series of N
# observations: +/- 1.96 / sqrt(N). This replaces fixed +/- 0.2 guide lines
# and corresponds to the alpha=0.05 level used for the statsmodels ACF plot.
conf_bound = 1.96 / np.sqrt(df_weekly["case_count"].count())

# Plot autocorrelation manually
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(lags, acf_values, color="blue", alpha=0.7)
ax.axhline(y=0, color="black", linewidth=1)
ax.axhline(y=conf_bound, linestyle="--", color="gray")
ax.axhline(y=-conf_bound, linestyle="--", color="gray")
ax.set_xlabel("Lag (weeks)")
ax.set_ylabel("Autocorrelation")
ax.set_title("Autocorrelation of Weekly Leishmaniasis Cases (Manual)")
ax.grid(True, linestyle="--", alpha=0.6)
plt.show()

# Fourier Transform for seasonality detection
case_counts = df_weekly["case_count"]
fft_vals = np.fft.fft(case_counts - case_counts.mean())  # Remove mean so the zero-frequency term does not dominate
fft_freqs = np.fft.fftfreq(len(df_weekly), d=1)  # Frequencies in cycles per week

# Keep the positive, non-zero frequencies and convert them to periods (1/freq).
# Power is the squared magnitude of the FFT coefficients, so that the plotted
# quantity matches the "Power" axis label.
half = len(fft_freqs) // 2
periods = 1 / fft_freqs[1:half]
power = np.abs(fft_vals[1:half]) ** 2

# Plot Fourier transform spectrum
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(periods, power)
ax.set_xlabel("Period (weeks)")
ax.set_ylabel("Power")
ax.set_title("Fourier Transform: Detecting Seasonal Patterns in Weekly Cases")
ax.grid(True)
plt.show()