## Packages

In [1]:
import pandas as pd
import numpy as np
## import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import sqlalchemy 
from sqlalchemy import create_engine, text

import sys
import os

## Add the path of the functions folder
## (the sibling 'Functions' directory, resolved relative to the kernel's working directory)
current_dir = os.getcwd()  ## Gets the current working directory
sub_dir = os.path.abspath(os.path.join(current_dir, '..', 'Functions'))

## Guarded so that re-running this cell does not keep appending duplicate entries
if sub_dir not in sys.path:
    sys.path.append(sub_dir)

# Now you can import functions
from db_secrets import SQL_107

#from visualisations import plot_prediction_error, plot_prediction_density_subplots

from helpers import aggregate_sites

In [2]:
# scikit-survival
from sksurv.preprocessing import OneHotEncoder
from sksurv.linear_model import CoxPHSurvivalAnalysis

from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from datetime import datetime

## Connection

In [3]:
## Load the SQL template and substitute the start-date placeholder in one step
with open("../Exploratory_Analysis/111_sql.sql", "r") as file:
    query_text = file.read().replace('REPLACE START DATE', '2022-01-01')

In [4]:
## Create an engine
engine = create_engine(SQL_107())

## Return data
## The connection is opened only for the duration of the query: the context
## manager closes it as soon as the result has been loaded (or if the query
## raises), so no database connection is held open during the long model fit.
## NOTE: `conn` is therefore closed after this cell; open a new one with
## `engine.connect()` if another query is needed later.
with engine.connect() as conn:
    df_raw = pd.read_sql(query_text, conn)

## Wrangle

In [5]:
## Work on an independent (deep) copy so df_raw stays untouched and the
## wrangling cells can be re-run without querying the database again
df = df_raw.copy(deep=True)

## Optional: uncomment to develop on a reproducible subsample
#df = df.sample(n=100000, random_state=42)

In [None]:
## List columns
## (displayed as the cell output; used to pick the columns kept in the next cell)
df.columns

In [7]:
## Columns retained for the analysis; everything else from the query is dropped
model_columns = [
    'Call Connect Time',
    'Outcome Location Name',
    'Bank Holiday',
    'In_Out_Hours',
    'Sub ICB Name',
    'Outcome Type',
    'Outcome Datetime',
]
df = df[model_columns].copy()

In [8]:
## Event indicator: True for every outcome type except 'No UEC Contact'
df['Uncensored'] = df['Outcome Type'] != 'No UEC Contact'
df = df.drop(columns=['Outcome Type'])

In [9]:
## Round both timestamps to the nearest minute
for time_col in ('Call Connect Time', 'Outcome Datetime'):
    df[time_col] = df[time_col].dt.round('min')


In [10]:
## Minutes elapsed between the call connecting and the recorded outcome
time_to_outcome = df['Outcome Datetime'] - df['Call Connect Time']
df['Mins to outcome'] = time_to_outcome.dt.total_seconds() / 60

## right censored data upto 24 hours
## Rows with no outcome time, or an outcome later than the cap, are set to the
## cap value and flagged as censored.
censor_mins = 1441  ## minutes in day+1
df['Mins to outcome'] = (df['Mins to outcome']
                         .fillna(censor_mins)
                         .clip(upper=censor_mins))

df.loc[df['Mins to outcome'] == censor_mins, 'Uncensored'] = False

## removes zeros and less than zero
## .copy() makes the filtered frame independent, so the column assignments in
## later cells do not operate on a slice (avoids SettingWithCopyWarning)
df = df[df['Mins to outcome'] > 0].copy()

In [11]:
## Pass every site name through the project helper aggregate_sites
## (per the original note: replaces low frequency sites with 'OTHER SITE')
site_names = df['Outcome Location Name']
df['Outcome Location Name'] = site_names.apply(aggregate_sites)

In [12]:
## Lookup that groups rare sites within each Sub ICB

## Uncensored (attended) calls per Sub ICB / site pair
positive_counts = (
    df.groupby(['Sub ICB Name', 'Outcome Location Name'])['Uncensored']
      .sum()
      .reset_index(name='Attends')
)

## Total attended calls per Sub ICB
total_positives = (
    positive_counts.groupby('Sub ICB Name')['Attends']
                   .sum()
                   .reset_index(name='Total_Attends')
)

## Each site's share (in percent) of its Sub ICB's attended calls
lu_site_agg = positive_counts.merge(total_positives, on='Sub ICB Name')
lu_site_agg['Percentage'] = (lu_site_agg['Attends']
                             .div(lu_site_agg['Total_Attends'])
                             .mul(100))

site_name = lu_site_agg['Outcome Location Name']
is_no_contact = site_name == 'No UEC Contact'

## keep details of sites with > 5% of activity; everything else becomes 'OTHER SITE'
keeps_own_name = (lu_site_agg['Percentage'] > 5) & site_name.ne('No UEC Contact')
site_label = site_name.where(keeps_own_name, 'OTHER SITE')

## 'No UEC Contact' always keeps its own label
lu_site_agg['Location'] = site_label.mask(is_no_contact, 'No UEC Contact')


In [13]:
## Attach the grouped 'Location' label to every call via a left join on
## (Sub ICB Name, Outcome Location Name)
location_lookup = lu_site_agg[['Sub ICB Name', 'Outcome Location Name', 'Location']]
df = df.merge(location_lookup,
              on=['Sub ICB Name', 'Outcome Location Name'],
              how='left')

## The original site name is superseded by 'Location'
df = df.drop(columns='Outcome Location Name')

### date time

In [14]:
## Date time conversion to numeric
## Periodic components are encoded as sin/cos pairs of (value * 2*pi / period),
## so that the end of a cycle sits next to its start.
call_time = df['Call Connect Time']

df['year'] = call_time.dt.year

df['month sin'] = np.sin(call_time.dt.month * (2*np.pi/12))
df['month cos'] = np.cos(call_time.dt.month * (2*np.pi/12))

## NOTE: the period is fixed at 365, so day 366 of a leap year lands just past a full cycle
df['YearDay sin'] = np.sin(call_time.dt.day_of_year * (2*np.pi/365))
df['YearDay cos'] = np.cos(call_time.dt.day_of_year * (2*np.pi/365))

## dt.weekday is Monday=0 ... Sunday=6; shift to 1..7 BEFORE scaling.
## The parentheses matter: `weekday+1 * k` evaluates as `weekday + k`, which
## is not a 7-day cycle.
weekday_number = call_time.dt.weekday + 1
df['weekday sin'] = np.sin(weekday_number * (2*np.pi/7))
df['weekday cos'] = np.cos(weekday_number * (2*np.pi/7))

df['Hour sin'] = np.sin(call_time.dt.hour * (2*np.pi/24))
df['Hour cos'] = np.cos(call_time.dt.hour * (2*np.pi/24))

## The raw timestamps are no longer needed once encoded
df = df.drop(['Call Connect Time', 'Outcome Datetime'], axis=1)

In [15]:
## Cast the discrete predictors to pandas 'category' dtype
for cat_col in ('Location', 'Bank Holiday', 'In_Out_Hours', 'Sub ICB Name'):
    df[cat_col] = df[cat_col].astype('category')

# Cox PH

#### create a new df with one copy of the data per site

In [None]:
## Distinct named sites, i.e. every Location except the two catch-all labels
non_site_labels = ['No UEC Contact', 'OTHER SITE']
is_named_site = ~df['Location'].isin(non_site_labels)
Outcome_Location = df.loc[is_named_site, 'Location'].unique()
print(Outcome_Location)


In [None]:
## Build one full copy of the data per site of interest. In each copy the
## event of interest is "the call ended at this site", so 'Uncensored' is
## recomputed against that site.
## The copies are collected in a list and concatenated once, rather than
## growing a DataFrame inside the loop (which re-copies all earlier rows on
## every iteration and starts from an empty frame).
site_frames = []

#for Location in Outcome_Location:
#for Location in ['UNIVERSITY HOSPITAL OF NORTH DURHAM']:
for Location in ['THE ROYAL VICTORIA INFIRMARY']:
    print(Location)
    temp_df = df.copy()
    temp_df['Site Version'] = Location
    ## Event only if the call ended at this site AND it was not capped at the
    ## 1441-minute censoring value
    temp_df['Uncensored'] = ((temp_df['Location'] == Location)
                             & temp_df['Mins to outcome'].ne(1441))
    site_frames.append(temp_df)

new_df = pd.concat(site_frames, ignore_index=True, sort=False)
new_df['Site Version'] = new_df['Site Version'].astype('category')

## removes the temporary per-site copies
del temp_df, site_frames

In [18]:
## From here on `df` refers to the per-site frame built in the previous cell
df = new_df

## Removes only the extra name `new_df`; the data itself stays alive as `df`.
## NOTE: this cell cannot be re-run on its own afterwards (new_df no longer exists).
del new_df

#### split

In [19]:
outcome_cols = ['Uncensored','Mins to outcome']

## Predictors: everything except the two outcome columns and 'Location'
X = df.drop(columns=outcome_cols + ['Location'])

## Outcome as a NumPy structured array with a boolean event field and a
## float time field, filled column by column
y = np.empty(len(df), dtype=[('Uncensored', '?'), ('Mins to outcome', '<f8')])
y['Uncensored'] = df['Uncensored'].to_numpy()
y['Mins to outcome'] = df['Mins to outcome'].to_numpy()

## 75/25 split, stratified on the event flag so that train and test contain
## equal proportions of events
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y['Uncensored'],
    test_size=0.25,
    random_state=42,
)

### fit the model

In [None]:
## Pipeline: one-hot encode the categorical predictors, then fit a Cox PH model
encoder = OneHotEncoder()
cox_model = CoxPHSurvivalAnalysis()
cph = make_pipeline(encoder, cox_model)

cph.fit(X_train, y_train)  ## takes 30+ mins

## One cumulative-hazard function per test row (callables, not a pre-evaluated array)
cph_chf_funcs = cph.predict_cumulative_hazard_function(X_test, return_array=False)


In [21]:
## Evaluation grid: every 5 minutes from 5 up to (but excluding) 1440
arrival_windows = np.arange(5, 1440, 5)

## Evaluate each test row's cumulative hazard on the grid;
## result has one row per test sample and one column per time point
hazard_rows = [chf(arrival_windows) for chf in cph_chf_funcs]
cph_risk_scores = np.vstack(hazard_rows)

## AUC

In [None]:
## Time-dependent AUC at every point of the evaluation grid, plus its mean
cph_auc, cph_mean_auc = cumulative_dynamic_auc(
    survival_train=y_train,
    survival_test=y_test,
    estimate=cph_risk_scores,
    times=arrival_windows,
)

## AUC over time, with the mean AUC drawn as a dashed reference line
fig, ax = plt.subplots()
ax.plot(arrival_windows, cph_auc, marker="o")
ax.axhline(cph_mean_auc, linestyle="--")
ax.set_xlabel("minutes from call")
ax.set_ylabel("time-dependent AUC")
ax.grid(True)

## Concordance
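The C-index measures the predictive accuracy of a survival model as the proportion of concordant pairs among all comparable pairs in a dataset. A pair of subjects $i$ and $j$ is comparable if the subject with the shorter observed time experienced the event, i.e. given $t_i < t_j$, then $\delta_i = 1$. A comparable pair is concordant when the model's predicted ordering agrees with the observed ordering of event times: the subject with the shorter observed time is predicted to have the higher risk (equivalently, the shorter predicted time).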

In [None]:
## Harrell's concordance on the held-out set, from the observed event flags,
## observed times and the model's predictions for the test rows
test_events = y_test['Uncensored']        # event indicator
test_times = y_test['Mins to outcome']    # time to event
c_index = concordance_index_censored(test_events, test_times, cph.predict(X_test))

## concordance_index_censored returns a tuple; the first element is the index itself
print("Concordance Index:", c_index[0])

In [None]:
## Pipeline predictions for the held-out rows (the same call that is passed to
## concordance_index_censored above), kept under a name for later use
cph_predictions = cph.predict(X_test)