# Setup
## Packages

In [None]:
import pandas as pd
import numpy as np
from math import ceil
import plotly.express as px
## import matplotlib.pyplot as plt
## import sqlalchemy 
from sqlalchemy import create_engine ##, text

import sys
import os

## Add the path of the functions folder so the local modules imported below
## can be found. The path is resolved relative to the current working directory.
current_dir = os.getcwd()  ## Gets the current working directory
sub_dir = os.path.abspath(os.path.join(current_dir, '..'
                                       , 'Functions'))
## Guard the append: re-running this cell should not stack duplicate entries
if sub_dir not in sys.path:
    sys.path.append(sub_dir)

# Import functions
from db_secrets import SQL_107
from helpers import Trauma_Detect,Pregnancy_Detect

In [None]:
## Import machine learning methods

from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.preprocessing import LabelEncoder

#from sklearn.ensemble import AdaBoostClassifier

from sklearn.metrics import auc, roc_curve, RocCurveDisplay, f1_score, \
                            precision_score, recall_score, confusion_matrix, \
                            ConfusionMatrixDisplay, classification_report, \
                            accuracy_score


## Connection

In [None]:
## Read the SQL template from disk, then substitute the start-date placeholder
with open("../Exploratory_Analysis/111_sql.sql") as sql_file:
    sql_template = sql_file.read()

query_text = sql_template.replace('REPLACE START DATE', '2024-01-01')

In [None]:
## Create an engine; the connection string comes from the local db_secrets module
engine = create_engine(SQL_107())

## Return data
## The connection is opened in a `with` block so it is closed (returned to the
## engine's pool) as soon as the query has been read, even if the read fails.
with engine.connect() as conn:
    df_raw = pd.read_sql(query_text,conn)

# Model 1

In [None]:
## Makes working copy so df_raw is left untouched by the wrangling below
df = df_raw.copy()

## Optional: uncomment to work on a fixed 100,000-row random sample instead
#df = df.sample(n=100000, random_state=42)

## Wrangle

In [None]:
## List columns of the working copy (notebook display only, nothing is assigned)
df.columns

In [None]:
## Fields kept for Model 1; 'Outcome Type' is the prediction target
model_columns = [
    'Call Connect Time',
    'Bank Holiday',
    'In_Out_Hours',
    'Sub ICB Name',
    'Outcome Type',
    'GP Practice Code',
    'Symptom_Group',
    'Disposition Group',
    'Disposition',
    'Call_Taker_Triages',
    'Clinical_Triages',
]

## Copy so later column assignments do not act on a view of the full frame
df = df.loc[:, model_columns].copy()

In [None]:
## Date time conversion to numeric
## pd.to_datetime leaves datetime64 input unchanged; it guards against the
## timestamp arriving from SQL as text, which would break the .dt accessor
call_time = pd.to_datetime(df['Call Connect Time'])

## Only one hour column is created: a second, identically-valued 'Hour'
## column would just be a redundant feature for the model
df['year']    = call_time.dt.year
df['month']   = call_time.dt.month
df['day']     = call_time.dt.day
df['hour']    = call_time.dt.hour
df['weekday'] = call_time.dt.weekday  # Monday=0, Sunday=6

## The raw timestamp is replaced by the numeric parts above
df = df.drop('Call Connect Time',axis=1)

In [None]:
## Map the text flag columns onto booleans
bool_mapping = {'Yes': True, 'No': False,
                'In Hours': True, 'Out of Hours': False,
                '1': True, '0': False}

## new boolean column -> raw column it is derived from
flag_sources = {
    'Is Bank Holiday': 'Bank Holiday',
    'In Hours': 'In_Out_Hours',
    'Call_Taker_Triage': 'Call_Taker_Triages',
    'Clinical_Triage': 'Clinical_Triages',
}

for flag_col, raw_col in flag_sources.items():
    ## Series.map returns NaN for any value that is not a key of bool_mapping
    df[flag_col] = df[raw_col].map(bool_mapping)

## The raw text versions are no longer needed
df = df.drop(columns=list(flag_sources.values()))

In [None]:
## Apply trauma and pregnancy functions (from the local helpers module) to
## derive two new columns from the raw symptom group text
df.loc[:,"Trauma_Type"] = df["Symptom_Group"].transform(lambda x: Trauma_Detect(x))
df.loc[:,"Pregnant"] = df["Symptom_Group"].transform(lambda x: Pregnancy_Detect(x))

## Qualifiers to strip from Symptom_Group once they have their own columns.
## Order matters: the longer ', Pregnant, ...' variants must be removed before
## the bare ', Pregnant', otherwise ', Over 20 Weeks' would be left behind.
conditions = [', Blunt'
              ,', Penetrating'
              ,', Pregnant, Over 20 Weeks'
              ,', Pregnant, Under 20 Weeks'
              ,', Pregnant']

for c in conditions:
    ## regex=False: these are literal substrings, not patterns, so nothing in
    ## them should ever be interpreted as a regular expression
    df.loc[:,'Symptom_Group'] = (df['Symptom_Group'].str.replace(c,'', regex=False))

In [None]:
## Category data types
## Each column is listed once (the list previously repeated "Trauma_Type" and
## "Pregnant", which only re-cast already-categorical columns)
category_list = ["Sub ICB Name"
                ,"Outcome Type"
                ,"GP Practice Code"
                ,"Symptom_Group"
                ,"Disposition Group"
                ,"Disposition"
                ,"Trauma_Type"
                ,"Pregnant"]

## Cast all of them in a single astype call
df = df.astype({c: "category" for c in category_list})

In [None]:
## Display the column dtypes after wrangling (notebook display only)
df.dtypes

In [None]:
## Preview the first rows of the wrangled frame (notebook display only)
df.head()

## Split

In [None]:
## 'Outcome Type' is the target; every other remaining column is a feature
target_col = 'Outcome Type'
y = df[target_col]
X = df.drop(columns=[target_col])

## Encode the outcome labels as integers; the fitted encoder is kept so the
## original class names can be used in the classification report
label_encoder_y = LabelEncoder()
y_encoded = label_encoder_y.fit_transform(y)

## 75% train / 25% test, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)

## Fit XGBoost model

In [None]:
## XGBoost classifier; enable_categorical=True is passed because the feature
## frame contains pandas 'category' columns.
## use_label_encoder is deliberately not passed: it is a deprecated argument
## (recent XGBoost releases only warn that it is unused), and the target has
## already been integer-encoded with LabelEncoder in the split step.
model = XGBClassifier(enable_categorical=True
                      ,random_state=42)
model = model.fit(X_train,y_train)

## Predict values

In [None]:
## Predict training and test set labels
## Score both partitions with the fitted model
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

## Accuracy

In [None]:
## Overall accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred_test)

## LabelEncoder codes are 0..n_classes-1, in the same order as classes_.
## Passing them as `labels` keeps target_names aligned even if a rare class is
## absent from both y_test and the predictions; without it
## classification_report raises on the length mismatch.
class_ids = list(range(len(label_encoder_y.classes_)))
report = classification_report(y_test, y_pred_test
                               , labels=class_ids
                               , target_names=label_encoder_y.classes_)


In [None]:
## Print test-set accuracy to 2 decimal places, then the per-class text report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)

# Model 2

In [None]:
## Makes working copy: Model 2 starts again from the untouched raw extract
df = df_raw.copy()

## Optional: uncomment to work on a fixed 100,000-row random sample instead
#df = df.sample(n=100000, random_state=42)

### Functions

In [None]:
# Function to calculate previous col within a specified day window
def previous_col_within(data, col, days_window, name_col,
                        group_col='Pseudo NHS Number'):
    """Count each group's earlier non-null `col` entries in a trailing window.

    For every row, counts how many rows of the same `group_col` value have a
    non-null `col` inside the time window of `days_window` days ending at that
    row's own index timestamp, excluding the row itself. The result is written
    to a new column named ``prev_{name_col}_{days_window}_days``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a DatetimeIndex that is sorted ascending within each group
        (time-based rolling windows require a monotonic index).
    col : str
        Column whose non-null entries are counted.
    days_window : int
        Length of the trailing window in days.
    name_col : str
        Label used to build the new column name.
    group_col : str, default 'Pseudo NHS Number'
        Column identifying the entity the history is counted for.

    Returns
    -------
    pandas.DataFrame
        The same `data` object, modified in place with the new float column.
    """
    # new col name
    new_col = f'prev_{name_col}_{days_window}_days'

    # 1.0 where `col` is populated, 0.0 where it is missing
    has_value = data[col].notna().astype('float64')

    # Trailing per-group total of populated rows (current row included).
    # transform() hands the result back in the original row order, so it does
    # not depend on the (possibly duplicated) timestamp index for alignment.
    in_window = (
        has_value.groupby(data[group_col])
                 .transform(lambda s: s.rolling(f'{days_window}D').sum())
    )

    # Remove the current row's own contribution only if it actually made one.
    # Subtracting a flat 1 undercounts the history of rows whose `col` is
    # null, and makes the feature depend on whether the current row is null.
    # The clip is a safety net only; the difference cannot be negative.
    data[new_col] = (in_window - has_value).clip(lower=0).to_numpy()

    return data

### Wrangle

In [None]:
# Make sure both timestamp columns are real datetimes
for stamp_col in ('Call Connect Time', 'Outcome Datetime'):
    df[stamp_col] = pd.to_datetime(df[stamp_col])

# Index by call time (needed for the time-based rolling windows), then order
# rows by patient and, within a patient, by call time
df = (
    df.set_index('Call Connect Time')
      .sort_values(by=['Pseudo NHS Number', 'Call Connect Time'])
)


In [None]:
## (source column, label used in the new column name)
history_features = (('Start_Location', 'Calls'), ('Outcome ID', 'UEC'))

## Add prev_<label>_<N>_days columns for 7, 30 and 90 day look-back windows
for window_days in (7, 30, 90):
    for source_col, label in history_features:
        df = previous_col_within(df, source_col,
                                 days_window=window_days, name_col=label)



In [None]:
## Fields kept for Model 2: the Model 1 fields plus GP deprivation and the
## look-back history counts
model_columns = [
    'Call Connect Time',
    'Bank Holiday',
    'In_Out_Hours',
    'Sub ICB Name',
    'Outcome Type',
    'GP Practice Code',
    'GP Deprivation',
    'Symptom_Group',
    'Disposition Group',
    'Disposition',
    'Call_Taker_Triages',
    'Clinical_Triages',
    'prev_Calls_7_days',
    'prev_UEC_7_days',
    'prev_Calls_30_days',
    'prev_UEC_30_days',
    'prev_Calls_90_days',
    'prev_UEC_90_days',
]

## reset_index turns the 'Call Connect Time' index back into a normal column
df = df.reset_index().loc[:, model_columns].copy()

In [None]:
## Date time conversion to numeric
call_time = df['Call Connect Time']

## Only one hour column is created: a second, identically-valued 'Hour'
## column would just be a redundant feature for the model
df['year']    = call_time.dt.year
df['month']   = call_time.dt.month
df['day']     = call_time.dt.day
df['hour']    = call_time.dt.hour
df['weekday'] = call_time.dt.weekday  # Monday=0, Sunday=6

## The raw timestamp is replaced by the numeric parts above
df = df.drop('Call Connect Time',axis=1)

In [None]:
## Map the text flag columns onto booleans
bool_mapping = {'Yes': True, 'No': False,
                'In Hours': True, 'Out of Hours': False,
                '1': True, '0': False}

## new boolean column -> raw column it is derived from
flag_sources = {
    'Is Bank Holiday': 'Bank Holiday',
    'In Hours': 'In_Out_Hours',
    'Call_Taker_Triage': 'Call_Taker_Triages',
    'Clinical_Triage': 'Clinical_Triages',
}

for flag_col, raw_col in flag_sources.items():
    ## Series.map returns NaN for any value that is not a key of bool_mapping
    df[flag_col] = df[raw_col].map(bool_mapping)

## The raw text versions are no longer needed
df = df.drop(columns=list(flag_sources.values()))

In [None]:
## Apply trauma and pregnancy functions (from the local helpers module) to
## derive two new columns from the raw symptom group text
df.loc[:,"Trauma_Type"] = df["Symptom_Group"].transform(lambda x: Trauma_Detect(x))
df.loc[:,"Pregnant"] = df["Symptom_Group"].transform(lambda x: Pregnancy_Detect(x))

## Qualifiers to strip from Symptom_Group once they have their own columns.
## Order matters: the longer ', Pregnant, ...' variants must be removed before
## the bare ', Pregnant', otherwise ', Over 20 Weeks' would be left behind.
conditions = [', Blunt'
              ,', Penetrating'
              ,', Pregnant, Over 20 Weeks'
              ,', Pregnant, Under 20 Weeks'
              ,', Pregnant']

for c in conditions:
    ## regex=False: these are literal substrings, not patterns, so nothing in
    ## them should ever be interpreted as a regular expression
    df.loc[:,'Symptom_Group'] = (df['Symptom_Group'].str.replace(c,'', regex=False))

In [None]:
## Category data types
## Each column is listed once (the list previously repeated "Trauma_Type" and
## "Pregnant", which only re-cast already-categorical columns)
category_list = ["Sub ICB Name"
                ,"Outcome Type"
                ,"GP Practice Code"
                ,"Symptom_Group"
                ,"Disposition Group"
                ,"Disposition"
                ,"Trauma_Type"
                ,"Pregnant"]

## Cast all of them in a single astype call
df = df.astype({c: "category" for c in category_list})

### Split

In [None]:
## 'Outcome Type' is the target; every other remaining column is a feature
target_col = 'Outcome Type'
y = df[target_col]
X = df.drop(columns=[target_col])

## Encode the outcome labels as integers; the fitted encoder is kept so the
## original class names can be used in the classification report
label_encoder_y = LabelEncoder()
y_encoded = label_encoder_y.fit_transform(y)

## 75% train / 25% test, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)

### Fit

In [None]:
## XGBoost classifier; enable_categorical=True is passed because the feature
## frame contains pandas 'category' columns.
## use_label_encoder is deliberately not passed: it is a deprecated argument
## (recent XGBoost releases only warn that it is unused), and the target has
## already been integer-encoded with LabelEncoder in the split step.
model = XGBClassifier(enable_categorical=True
                      ,random_state=42)
model = model.fit(X_train,y_train)

### Predict

In [None]:
## Predict training and test set labels
## Score both partitions with the fitted model
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

### Accuracy

In [None]:
## Overall accuracy on the held-out test set
accuracy = accuracy_score(y_test, y_pred_test)

## LabelEncoder codes are 0..n_classes-1, in the same order as classes_.
## Passing them as `labels` keeps target_names aligned even if a rare class is
## absent from both y_test and the predictions; without it
## classification_report raises on the length mismatch.
class_ids = list(range(len(label_encoder_y.classes_)))
report = classification_report(y_test, y_pred_test
                               , labels=class_ids
                               , target_names=label_encoder_y.classes_)


In [None]:
## Print test-set accuracy to 2 decimal places, then the per-class text report
print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:\n', report)