# INTRODUCTION

- What are the objectives of the time series regression in this project?
- How will you arrive at the results?
- Why did you use Python?
- What is the way forward after the regression analysis?

**Load the required libraries** 

In [None]:
import pandas as pd 
import numpy as np
from matplotlib import pyplot as plt 
import seaborn as sns 
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose 
import warnings

# Silence every warning for the rest of the session. Because no category is
# given, this also hides deprecation/FutureWarning messages raised by the
# libraries used below, so remove it when debugging.
warnings.filterwarnings("ignore")

In [None]:
# Read the appointments export (tab-separated, UTF-16 encoded) into a DataFrame
# and preview the first rows.
appointment_df = pd.read_csv(
    "../data/Appointments List.csv",
    sep="\t",
    encoding="UTF-16",
)
appointment_df.head()

## Data Wrangling 

What is data wrangling? 



In [None]:
# Prepare the raw appointment records and build the daily appointment counts.

# Drop the first row of the data frame and renumber the remaining rows
appointment_df.drop(index=0, inplace=True)
appointment_df.reset_index(drop=True, inplace=True)

# Parse the service date; values that cannot be parsed become NaT (errors="coerce")
appointment_df["Date"] = pd.to_datetime(appointment_df["Date Of Service"], errors="coerce")

# Relabel "Made" as "Seen" and "Deleted" as "Cancelled".
# The result is assigned back to the column: calling replace(inplace=True) on
# appointment_df[...] is a chained in-place edit that pandas warns about and
# that leaves the frame unchanged when Copy-on-Write is enabled.
appointment_df["Appt Status Description"] = appointment_df["Appt Status Description"].replace(
    {"Made": "Seen", "Deleted": "Cancelled"}
)

# Work on an independent copy of the two columns needed for the daily counts
appointment_ts = appointment_df[["Date", "Appointment UID"]].copy()

# Per-column count of missing values before cleaning
print("Rows with null values: ", appointment_ts.isna().sum())

# Number of fully duplicated (Date, Appointment UID) rows
print("Duplicated records: ", appointment_ts.duplicated().sum())

# Remove rows with a missing date or UID and renumber the rows
# (the results are rebound because dropna/reset_index return new frames)
appointment_ts = appointment_ts.dropna(how="any").reset_index(drop=True)

# Count the null values again to confirm the operation
print("Rows with null values after cleaning: ", appointment_ts.isna().sum())

# Count the appointments recorded for each date
daily_appointments_ts = (
    appointment_ts.groupby("Date")
    .count()
    .reset_index()
    .rename(columns={"Appointment UID": "appointment_count"})
)

daily_appointments_ts.head()

Here is what I did to make the appointment data ready for analysis:

* Dropped the first row since it had irrelevant values.
* Converted the date column to a datetime format that Python recognizes, for better time series analysis.
* Renamed all the appointment statuses labelled "Made" and "Deleted" to "Seen" and "Cancelled" respectively.
* Selected the Date and Appointment UID columns.
* Counted and dropped all the records with null values in the selected data.
* Calculated the number of appointments by date.

**Note:** More data wrangling will be performed based on the status of the analysis. 

In [None]:
# Number of distinct (non-missing) service dates in the cleaned records
appointment_ts["Date"].nunique(dropna=True)

## Time Series Regression
### Regression Analysis

In [None]:
# Fit a linear trend (OLS) of the daily appointment count against elapsed days.

# Ensure the rows are in chronological order
daily_appointments_ts = daily_appointments_ts.sort_values("Date").reset_index(drop=True)

# Remove the erroneous year-2000 record: no appointment could have taken place
# in 2000, and the next one is in 2017. Filtering by date rather than by row
# position keeps this cell safe to re-run (a positional drop would remove
# another valid day on every execution).
daily_appointments_ts = daily_appointments_ts[
    daily_appointments_ts["Date"].dt.year > 2000
].reset_index(drop=True)

# Add a numeric time variable that counts days since the first remaining date
daily_appointments_ts["Time"] = (
    daily_appointments_ts["Date"] - daily_appointments_ts["Date"].min()
).dt.days

# Define the independent variable (X, elapsed days) and the dependent
# variable (y, appointments per day)
X = sm.add_constant(daily_appointments_ts["Time"])  # adds the intercept column
y = daily_appointments_ts["appointment_count"]

# Create the model and report the fit
model = sm.OLS(y, X).fit()
print(model.summary())

# Fitted values of the regression line for every observed date
predicted_counts = model.predict(X)

In [None]:
%matplotlib inline 
sns.set()

# Plot the results 
plt.figure(figsize = (12, 7))
plt.plot(daily_appointments_ts["Date"], daily_appointments_ts["appointment_count"],
        label = "Actual Counts", color = "blue")
plt.plot(daily_appointments_ts["Date"], predicted_counts,
        label = "Regression line", color = "red")
plt.xlabel("Date")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.show()

### Time Series Decomposition

This was done to refine the analysis and uncover how the appointments behave over time, for example to:

* Calculate seasonal trends
* Visualize the annual cycles 
* Visualize the daily fluctuations in the number of appointments 

In [None]:
# Use the Date column as the index of the daily counts
daily_appointments_ts.set_index("Date", inplace=True)

# Additive decomposition of the daily counts with a cycle length of 7.
# NOTE(review): period=7 is a number of observations; it corresponds to one
# calendar week only if the index has a row for every day - confirm.
appointment_counts = daily_appointments_ts["appointment_count"]
result = seasonal_decompose(appointment_counts, period=7, model="additive")

# Draw the decomposition panels
result.plot()
plt.show()

Findings:

* There is an upward trend in the daily number of appointments.
* Daily fluctuations in the number of appointments range from 0 to 25.

### Moving average

Let's smooth the curves by calculating 7-day, 30-day and 90-day moving averages. 

What is the importance of moving averages? 


In [None]:
# 7-day moving average of the daily appointment counts.
# NOTE(review): the window counts rows, so it spans 7 calendar days only if
# every date has a row - confirm.
daily_appointments_ts["7_day_MA"] = (
    daily_appointments_ts["appointment_count"].rolling(window=7).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(daily_appointments_ts.index, daily_appointments_ts["appointment_count"],
         label="Daily Count", color="white")
plt.plot(daily_appointments_ts.index, daily_appointments_ts["7_day_MA"],
         label="7-day Moving average", color="blue")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("7-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 30-day moving average of the daily appointment counts.
# NOTE(review): the window counts rows, so it spans 30 calendar days only if
# every date has a row - confirm.
daily_appointments_ts["30_day_MA"] = (
    daily_appointments_ts["appointment_count"].rolling(window=30).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(daily_appointments_ts.index, daily_appointments_ts["appointment_count"],
         label="Daily Count", color="white")
plt.plot(daily_appointments_ts.index, daily_appointments_ts["30_day_MA"],
         label="30-day Moving average", color="blue")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("30-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 90-day moving average of the daily appointment counts.
# NOTE(review): the window counts rows, so it spans 90 calendar days only if
# every date has a row - confirm.
daily_appointments_ts["90_day_MA"] = (
    daily_appointments_ts["appointment_count"].rolling(window=90).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(daily_appointments_ts.index, daily_appointments_ts["appointment_count"],
         label="Daily Count", color="white")
plt.plot(daily_appointments_ts.index, daily_appointments_ts["90_day_MA"],
         label="90-day Moving average", color="blue")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("90-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

## Cambridge Time Series
I will narrow down to the successful MH Infusion appointments at the Cambridge facility across the whole period and visualize the trend. 


First I prepared the data by: 

* Filtering to get the appointments that were successful, involved MH infusion and were held in the Cambridge facility. 
* Dropping the first record, which had an appointment dated in the year 2000 and was irrelevant to the analysis (this might be an error made during data collection).

In [None]:
# Build the daily count of successful MH Infusion appointments in Cambridge.

# Filter the data: attended ("Seen") MH Infusion appointments held in Cambridge
is_cambridge_mh_seen = (
    (appointment_df["Appt Status Description"] == "Seen")
    & (appointment_df["Facility City"] == "CAMBRIDGE")
    & (appointment_df["Appt Type Group"] == "MH Infusion")
)

# Clean the data
## Keep the required columns and remove rows with null values.
## Chaining on the selection returns a new frame instead of editing a slice
## of appointment_df in place.
cambridge_mh_success = (
    appointment_df.loc[is_cambridge_mh_seen, ["Date", "Appointment UID"]]
    .dropna(how="any")
)

## Group the data by date. groupby sorts the group keys by default, so the
## result is already in chronological order and needs no separate sort.
cambridge_mh_success = (
    cambridge_mh_success.groupby("Date")
    .count()
    .reset_index()
    .rename(columns={"Appointment UID": "appointment_count"})
)

## Drop the erroneous year-2000 record by date rather than by position, so a
## valid first day is kept when this subset does not contain that record.
cambridge_mh_success = cambridge_mh_success[
    cambridge_mh_success["Date"].dt.year > 2000
]

## Set date as index
cambridge_mh_success = cambridge_mh_success.set_index("Date")

cambridge_mh_success.head()

In [None]:
# 7-day moving average of the Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 7 calendar days only if
# every date has a row - confirm.
cambridge_mh_success["7_day_MA"] = (
    cambridge_mh_success["appointment_count"].rolling(window=7).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success.index, cambridge_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success.index, cambridge_mh_success["7_day_MA"],
         label="7-day Moving average", color="green")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("7-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 30-day moving average of the Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 30 calendar days only if
# every date has a row - confirm.
cambridge_mh_success["30_day_MA"] = (
    cambridge_mh_success["appointment_count"].rolling(window=30).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success.index, cambridge_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success.index, cambridge_mh_success["30_day_MA"],
         label="30-day Moving average", color="green")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("30-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 90-day moving average of the Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 90 calendar days only if
# every date has a row - confirm.
cambridge_mh_success["90_day_MA"] = (
    cambridge_mh_success["appointment_count"].rolling(window=90).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success.index, cambridge_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success.index, cambridge_mh_success["90_day_MA"],
         label="90-day Moving average", color="green")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("90-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

From the analysis of the number of daily appointments that involved MH infusion in the Cambridge facility, it is evident that:

* There is a general rise in the number of appointments from the year 2018 to early 2023.
* After mid-2023, the number of appointments starts to drop.

## Other locations Moving Averages 

I will group all the facilities in locations other than Cambridge, analyze the successful appointments that involve MH infusion, and see how they compare with Cambridge. This is what I will do:

* Filter out Cambridge facilities. 
* Get the successful appointments that involve MH infusion.
* Clean and organize the data as required.

In [None]:
# Build the daily count of successful MH Infusion appointments held outside
# Cambridge.

# Filter the data: attended ("Seen") MH Infusion appointments whose facility
# city is not Cambridge
is_other_mh_seen = (
    (appointment_df["Appt Status Description"] == "Seen")
    & (appointment_df["Facility City"] != "CAMBRIDGE")
    & (appointment_df["Appt Type Group"] == "MH Infusion")
)

# Clean the data
## Keep the required columns and remove rows with null values. Facility City
## takes part in the null check (rows without a city are discarded) and is
## dropped afterwards because the counts are not split by city.
## Chaining on the selection returns a new frame instead of editing a slice
## of appointment_df in place.
other_mh_success = (
    appointment_df.loc[is_other_mh_seen, ["Date", "Appointment UID", "Facility City"]]
    .dropna(how="any")
    .drop(columns="Facility City")
)

## Group the data by date. groupby sorts the group keys by default, so the
## result is already in chronological order and needs no separate sort.
other_mh_success = (
    other_mh_success.groupby("Date")
    .count()
    .reset_index()
    .rename(columns={"Appointment UID": "appointment_count"})
)

## Drop the erroneous year-2000 record by date rather than by position, so a
## valid first day is kept when this subset does not contain that record.
other_mh_success = other_mh_success[other_mh_success["Date"].dt.year > 2000]

## Set date as index
other_mh_success = other_mh_success.set_index("Date")

other_mh_success.head()

In [None]:
# 7-day moving average of the non-Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 7 calendar days only if
# every date has a row - confirm.
other_mh_success["7_day_MA"] = (
    other_mh_success["appointment_count"].rolling(window=7).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(other_mh_success.index, other_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(other_mh_success.index, other_mh_success["7_day_MA"],
         label="7-day Moving average", color="purple")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("7-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 30-day moving average of the non-Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 30 calendar days only if
# every date has a row - confirm.
other_mh_success["30_day_MA"] = (
    other_mh_success["appointment_count"].rolling(window=30).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(other_mh_success.index, other_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(other_mh_success.index, other_mh_success["30_day_MA"],
         label="30-day Moving average", color="purple")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("30-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 90-day moving average of the non-Cambridge MH Infusion daily counts.
# NOTE(review): the window counts rows, so it spans 90 calendar days only if
# every date has a row - confirm.
other_mh_success["90_day_MA"] = (
    other_mh_success["appointment_count"].rolling(window=90).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(other_mh_success.index, other_mh_success["appointment_count"],
         label="Daily Count", color="white")
plt.plot(other_mh_success.index, other_mh_success["90_day_MA"],
         label="90-day Moving average", color="purple")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("90-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

From the analysis above, which covers all the facilities other than Cambridge, it is evident that 

### Successful MH Infusion - mid 2023 onwards
Since there has been a drop in MH infusion appointments from mid-2023 to 2024 in the Cambridge facility, we will explore the data from mid-2023 onwards to find exactly where the fall began. 

In [None]:
# Keep the Cambridge daily counts dated on or after 1 June 2023.
decided_date = pd.to_datetime("2023-06-01")

# Take an explicit copy: new columns are assigned to this subset afterwards,
# and writing to a filtered selection of another frame is what pandas flags
# as setting values on a copy.
cambridge_mh_success_2023 = cambridge_mh_success.loc[
    cambridge_mh_success.index >= decided_date
].copy()
cambridge_mh_success_2023.head()

In [None]:
# 7-day moving average of the Cambridge counts from the cutoff date onwards.
# NOTE(review): the window counts rows, so it spans 7 calendar days only if
# every date has a row - confirm.
cambridge_mh_success_2023["7_day_MA"] = (
    cambridge_mh_success_2023["appointment_count"].rolling(window=7).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["7_day_MA"],
         label="7-day Moving average", color="black")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("7-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 30-day moving average of the Cambridge counts from the cutoff date onwards.
# NOTE(review): the window counts rows, so it spans 30 calendar days only if
# every date has a row - confirm.
cambridge_mh_success_2023["30_day_MA"] = (
    cambridge_mh_success_2023["appointment_count"].rolling(window=30).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["30_day_MA"],
         label="30-day Moving average", color="black")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("30-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# 90-day moving average of the Cambridge counts from the cutoff date onwards.
# NOTE(review): the window counts rows, so it spans 90 calendar days only if
# every date has a row - confirm.
cambridge_mh_success_2023["90_day_MA"] = (
    cambridge_mh_success_2023["appointment_count"].rolling(window=90).mean()
)

# Plot the raw daily counts together with the smoothed series
plt.figure(figsize=(12, 7))
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["appointment_count"],
         label="Daily Count", color="white")
plt.plot(cambridge_mh_success_2023.index, cambridge_mh_success_2023["90_day_MA"],
         label="90-day Moving average", color="black")
plt.xlabel("Date")
plt.ylabel("Count")

plt.title("90-day Moving Averages")
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state,
# which switches the grid off when the active style already draws one.
plt.grid(True)
plt.show()

In [None]:
# Spread between the largest and the smallest daily appointment count in the
# filtered Cambridge data
daily_counts_2023 = cambridge_mh_success_2023["appointment_count"]
daily_counts_2023.max() - daily_counts_2023.min()

In [None]:
# Spread (maximum minus minimum) of each moving-average column in the
# filtered Cambridge data.

## 7 day MA
ma_7_day = cambridge_mh_success_2023["7_day_MA"]
range1 = ma_7_day.max() - ma_7_day.min()
print(f"Cambridge 7 day moving average range: {range1}")

## 30 day MA
ma_30_day = cambridge_mh_success_2023["30_day_MA"]
range2 = ma_30_day.max() - ma_30_day.min()
print(f"Cambridge 30 day moving average range: {range2}")

## 90 day MA
ma_90_day = cambridge_mh_success_2023["90_day_MA"]
range3 = ma_90_day.max() - ma_90_day.min()
print(f"Cambridge 90 day moving average range: {range3}")