# Regensburg Pediatric Appendicitis

In [None]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mutual_info_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression

import torch
import torch.nn as nn
import torch.optim as optim

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

import warnings

warnings.filterwarnings('ignore')

# Show every column and every row when a frame is displayed in this notebook.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Fetch the dataset (UCI repository id 938).
regensburg_pediatric_appendicitis = fetch_ucirepo(id=938)

# Features and targets are exposed as separate pandas DataFrames.
X = regensburg_pediatric_appendicitis.data.features
y = regensburg_pediatric_appendicitis.data.targets

# Join them column-wise into a single working frame and preview it.
df = pd.concat([X, y], axis=1)
df.head()

In [None]:
# Column dtypes and non-null counts, used below to spot sparsely populated columns.
df.info()

# Data Cleaning + Preparation

We observe from the info that some columns have a majority of null values, so it is best to drop them.

In [None]:
# Keep only the columns in which at most MAX_NULL_PCT percent of the values are missing.
# The missing share is measured against the number of ROWS; dividing the null count by
# the number of columns (as before) does not give a percentage of missing values.
MAX_NULL_PCT = 50
null_pct = df.isna().mean() * 100
keep_cols = null_pct.index[~(null_pct > MAX_NULL_PCT)].tolist()

# Explicit copy so later cleaning steps never write into the raw frame.
df_new = df.loc[:, keep_cols].copy()
df_new.info()

The data looks much better now after dropping columns that are not needed. We will prepare the data before training the model.

In [None]:
# Preview the frame after the sparse columns were dropped.
df_new.head()

For the remaining columns with few missing values, we will either fill them with the mean or the mode, depending on the data type (numerical or categorical).

In [None]:
num_cols = df_new.select_dtypes(include='number').columns
cat_cols = df_new.select_dtypes(exclude='number').columns

# Keep rows that have all three target labels and at least 5 non-null values overall.
has_targets = ~(df_new.Management.isna() | df_new.Severity.isna() | df_new.Diagnosis.isna())
df_new = df_new.loc[has_targets].dropna(thresh=5).copy()

# Numerical columns: fill gaps with the column mean; the listed columns are cast to int.
int_cols = ['Length_of_Stay', 'Thrombocyte_Count', 'CRP', 'US_Number', 'Age']
for col in num_cols:
    df_new[col] = df_new[col].fillna(df_new[col].mean())
    if col in int_cols:
        df_new[col] = df_new[col].astype(int)

# Non-numerical columns: convert to category dtype and fill gaps with the most frequent value.
for col in cat_cols:
    as_category = df_new[col].astype('category')
    df_new[col] = as_category.fillna(as_category.mode()[0])

df_new.info()

In [None]:
# Preview the frame after imputation.
df_new.head()

# EDA

In [None]:
# Correlation heatmap of the numerical columns, showing the lower triangle only.
num_matrix = df_new.select_dtypes(include='number').corr()

# Build the mask from the matrix SHAPE, not from its values: np.triu(num_matrix) leaves a
# zero ("do not mask") wherever an upper-triangle correlation happens to be exactly 0.
upper_triangle = np.triu(np.ones_like(num_matrix, dtype=bool))

plt.figure(figsize=(10,10))
sns.heatmap(num_matrix, annot=True, mask=upper_triangle)
plt.show()

We observe that the column Weight is highly correlated with several other columns, so it would be best to drop those columns and keep Weight, as weight is an important feature when considering pediatrics.

In [None]:
# Mutual information between every categorical feature and each target column.
# Targets are excluded by name instead of by position (cat_cols[:-3]), so the result does
# not depend on the targets being the last three categorical columns.
target_cols = list(y.columns)
feature_cat_cols = [col for col in cat_cols if col not in target_cols]
df_temp = df_new.dropna()

scores = [
    ((col_x, col_y), round(mutual_info_score(df_temp[col_x], df_temp[col_y]), 4))
    for col_x in feature_cat_cols
    for col_y in target_cols
]

# Highest mutual information first.
scores.sort(key=lambda x: x[1], reverse=True)
for (col_x, col_y), mis in scores:
    print(f'MIS of {col_x=} and {col_y=} is {mis=}')


Considering the original problem had Management, Severity and Diagnosis as their target variables, we see we can get more information about Diagnosis and Severity from feature variables most of the time.  

We see columns such as Appendix_on_US, Loss_of_Appetite, and Nausea have considerable MIS which can be used to predict Diagnosis.

In [None]:
# Summary statistics of the numerical columns.
df_new.describe()

In [None]:
# Rows whose Age is 0 (Age was cast to int during imputation).
df_new.loc[df_new.Age == 0]

The above samples indicate information about patients that are babies. Babies with height and weight below average of their age group show complicated case of appendicitis.

In [None]:
# Summary of the non-numerical columns.
df_new.describe(exclude='number')

In [None]:
# Number of rows per Stool value.
df_new.Stool.value_counts()

In [None]:
# Inspect the row(s) recorded with the combined 'constipation, diarrhea' value.
df_new.loc[df_new.Stool == 'constipation, diarrhea']

We will remove this row to avoid confusion. Note that this patient had a complicated case of appendicitis.

In [None]:
# Keep every row except those with the combined 'constipation, diarrhea' Stool value.
df_new = df_new.loc[df_new.Stool != 'constipation, diarrhea', :]

In [None]:
# Number of rows per Management value.
df_new.Management.value_counts()

In [None]:
# Inspect the row(s) managed by 'simultaneous appendectomy'.
df_new.loc[df_new.Management == 'simultaneous appendectomy', :]

It would be best to remove the row as this is an outlier.

In [None]:
# Drop the 'simultaneous appendectomy' outlier. The mask is built from df_new's own column:
# the raw `df` has a different set of rows and only worked here through index alignment.
df_new = df_new.loc[df_new.Management != 'simultaneous appendectomy', :]

In [None]:
# Stacked age histograms: all patients by Diagnosis (left) and appendicitis patients by
# Severity (right). NOTE: both panels read the raw `df`, not the cleaned `df_new`.
fig, (ax_all, ax_appendicitis) = plt.subplots(nrows=1, ncols=2, figsize=(8, 4), sharey=True)

appendicitis_only = df.loc[df.Diagnosis=='appendicitis', :]
sns.histplot(data=df, x='Age', hue='Diagnosis', kde=True, multiple='stack', ax=ax_all)
sns.histplot(data=appendicitis_only, x='Age', hue='Severity', kde=True, multiple='stack', ax=ax_appendicitis)

ax_all.set_title('Age Distribution of all patients', fontsize=10)
ax_appendicitis.set_title('Age Distribution of appendicitis patients', fontsize=10)
fig.suptitle('Age Distribution', fontsize=16)
plt.show()

We observe that 10-12 year olds constitute the largest age group of people in the dataset and while most of them are diagnosed with appendicitis, majority of cases are not complicated.

In [None]:
# Box plots of Length_of_Stay and Weight, split by Diagnosis and by Severity.
fig, axes = plt.subplots(figsize=(10, 8), nrows=2, ncols=2)
panels = [
    ('Length_of_Stay', 'Diagnosis'), ('Length_of_Stay', 'Severity'),
    ('Weight', 'Diagnosis'), ('Weight', 'Severity'),
]

for ax, (feature, outcome) in zip(axes.flat, panels):
    sns.boxplot(x=feature, y=outcome, data=df_new, ax=ax)
    # Keep the tick labels derived from the data and only restyle them. Hard-coded label
    # lists can disagree with the plotted order: for a categorical Severity column
    # 'complicated' sorts before 'uncomplicated', so a fixed
    # ['uncomplicated', 'complicated'] list swaps the two labels.
    ax.tick_params(axis='y', labelrotation=90, labelsize=8)

plt.show()

In [None]:
# Box plots of Thrombocyte_Count and Body_Temperature, split by Diagnosis and by Severity.
fig, axes = plt.subplots(figsize=(10, 8), nrows=2, ncols=2)
panels = [
    ('Thrombocyte_Count', 'Diagnosis'), ('Thrombocyte_Count', 'Severity'),
    ('Body_Temperature', 'Diagnosis'), ('Body_Temperature', 'Severity'),
]

for ax, (feature, outcome) in zip(axes.flat, panels):
    sns.boxplot(x=feature, y=outcome, data=df_new, ax=ax)
    # Keep the tick labels derived from the data and only restyle them. Hard-coded label
    # lists can disagree with the plotted order: for a categorical Severity column
    # 'complicated' sorts before 'uncomplicated', so a fixed
    # ['uncomplicated', 'complicated'] list swaps the two labels.
    ax.tick_params(axis='y', labelrotation=90, labelsize=8)

plt.show()

Considering some of the numerical columns that showed high standard deviations and significantly high maximums, we observe that these columns have outliers and are right-skewed.

# Model Selection

We will train various models to see which one will be appropriate to predict Diagnosis.

In [None]:
# Feature columns used as model inputs.
feature_cols = [
    'Weight', 'Sex', 'Length_of_Stay', 'Appendix_on_US', 'Migratory_Pain', 'Lower_Right_Abd_Pain',
    'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea', 'Loss_of_Appetite', 'Body_Temperature',
    'WBC_Count', 'RBC_Count', 'Hemoglobin', 'RDW', 'Thrombocyte_Count', 'CRP', 'Stool', 'Peritonitis', 'US_Number',
]
df_model = df_new[feature_cols]

# Encode the Diagnosis labels as integers.
labelencoder = LabelEncoder()
target = labelencoder.fit_transform(df_new['Diagnosis'])

# Hold out 40% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(df_model, target, test_size=0.4, random_state=42)

## Random Forest Classifier

We will start with a Random Forest Classifier.

In [None]:
# Vectorise the row dictionaries: string values are one-hot encoded, numbers are kept as-is.
dv = DictVectorizer(sparse=False)
X_train_dv = dv.fit_transform(X_train.to_dict(orient='records'))
X_test_dv = dv.transform(X_test.to_dict(orient='records'))

# 'auto' is deliberately not searched for max_features: recent scikit-learn releases reject
# it for forests (those candidates would silently score NaN), and in older releases it was
# identical to 'sqrt' for classifiers.
params = {'n_estimators': [75, 100, 125, 150], 'max_depth': [5, 10, 15, 20], 'max_features': ['sqrt', 'log2']}

# 5-fold cross-validated grid search, selecting on accuracy.
rfc_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rfc_model, param_grid=params, cv=5, scoring='accuracy')
grid_search.fit(X_train_dv, y_train)
rfc_best_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# Score the tuned random forest on the held-out test set.
y_pred = rfc_best_model.predict(X_test_dv)
rfc_accuracy = accuracy_score(y_test, y_pred)
rfc_f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {rfc_accuracy}")
print(f"F1 Score: {rfc_f1}")

In [None]:
# Pair each encoded feature with its importance and show the 12 most important ones.
rfc_importance = pd.DataFrame({
    'features': dv.feature_names_,
    'importance': rfc_best_model.feature_importances_,
})
rfc_ranked = rfc_importance.sort_values(by='importance', ascending=False)
rfc_ranked.head(12)

Accuracy-wise, Random Forest does a great job. We see that measurements such as WBC_Count, Weight and Body_Temperature, as well as the ultrasound finding Appendix_on_US (whether the appendix was visible on ultrasound) and how long a patient had to stay in hospital, are the most important factors for predicting the outcome.

## Logistic Regression

In [None]:
# Vectorise the row dictionaries: string values are one-hot encoded, numbers are kept as-is.
dv = DictVectorizer(sparse=False)
X_train_dv = dv.fit_transform(X_train.to_dict(orient='records'))
X_test_dv = dv.transform(X_test.to_dict(orient='records'))

# Each penalty is paired with a solver that supports it. With one flat grid and the default
# lbfgs solver, every 'l1' / 'elasticnet' candidate fails to fit and scores NaN.
# C is only searched where a penalty is applied, l1_ratio only for elastic net.
c_values = [0.01, 0.1, 0.5, 1]
params = [
    {'penalty': [None], 'solver': ['lbfgs']},
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values},
    {'penalty': ['l1'], 'solver': ['liblinear'], 'C': c_values},
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'C': c_values, 'l1_ratio': [0.1, 0.5, 0.9]},
]

# multi_class is omitted: it is deprecated in recent scikit-learn and has no effect on a
# binary target. max_iter is raised because the features are not scaled.
log_model = LogisticRegression(random_state=42, max_iter=1000)

# 5-fold cross-validated grid search, selecting on accuracy.
grid_search = GridSearchCV(estimator=log_model, param_grid=params, cv=5, scoring='accuracy')
grid_search.fit(X_train_dv, y_train)
log_best_model = grid_search.best_estimator_
print(grid_search.best_params_)
print(grid_search.best_estimator_)

In [None]:
# Take the second predict_proba column and call it positive at 0.5 or above.
positive_proba = log_best_model.predict_proba(X_test_dv)[:, 1]
y_pred = positive_proba >= 0.5
log_accuracy = accuracy_score(y_test, y_pred)
log_f1 = f1_score(y_test, y_pred)
print(f"Accuracy: {log_accuracy}")
print(f"F1 Score: {log_f1}")

In [None]:
# Pair each encoded feature with its fitted coefficient and show the 12 largest weights.
log_weights = pd.DataFrame({
    'features': dv.feature_names_,
    'weights': log_best_model.coef_[0],
})
log_ranked = log_weights.sort_values(by='weights', ascending=False)
log_ranked.head(12)

Logistic regression does a less impressive job and we see it gave different degree of importance to certain variables than Random Forest did.

## PyTorch

In [None]:
def _as_float_tensor(values):
    """Copy array-like `values` into a torch tensor and cast it with .float()."""
    return torch.tensor(values).float()

# Encoded features and flattened labels as float tensors.
X_train_torch = _as_float_tensor(X_train_dv)
X_test_torch = _as_float_tensor(X_test_dv)
y_train_torch = _as_float_tensor(y_train).flatten()
y_test_torch = _as_float_tensor(y_test).flatten()

# Fix the seed before the model is created.
torch.manual_seed(42)

# Define the model
class SimpleClassifier(nn.Module):
    """Small feed-forward classifier: input -> 64 -> 16 -> output, with a sigmoid output.

    Parameters
    ----------
    input_size : int
        Number of input features per sample.
    output_size : int
        Number of output units; each is passed through a sigmoid.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs for a batch `x` whose last dimension is input_size."""
        # ReLU after each hidden layer: without it the three stacked Linear layers
        # collapse into a single linear map, and the hidden layers add no capacity.
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return self.sigmoid(x)

# Network dimensions: one input per encoded feature, a single sigmoid output unit.
input_size = X_train_dv.shape[1]
output_size = 1

# Model, binary cross-entropy loss and Adam optimiser.
model = SimpleClassifier(input_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Full-batch training; the loss is printed every 250 epochs.
num_epochs = 5000
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train_torch).flatten()
    loss = criterion(outputs, y_train_torch)
    loss.backward()
    optimizer.step()

    report_now = (epoch+1) % 250 == 0
    if report_now:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

In [None]:
with torch.no_grad():
    # The network emits one sigmoid probability per row. torch.max(..., dim=1) over that
    # single column always returns index 0, i.e. it predicts class 0 for every patient,
    # so the probability is thresholded at 0.5 instead.
    y_pred = model(X_test_torch).flatten()
    predicted = (y_pred >= 0.5).float()
    accuracy = (predicted == y_test_torch).float().mean()
    print(f'Test Accuracy: {accuracy.item():.4f}')

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Re-fit the DictVectorizer on the training rows and encode both splits.
X_train_dv = dv.fit_transform(X_train.to_dict(orient='records'))
X_test_dv = dv.transform(X_test.to_dict(orient='records'))

# Dense 64 -> 16 -> 1 network with ReLU hidden layers and a sigmoid output.
n_features = X_train_dv.shape[1]
model = Sequential([
    Dense(64, input_dim=n_features, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Layer-by-layer overview of the network.
model.summary()


In [None]:
# Train on the encoded training split for 50 epochs with batch size 32.
# validation_split=0.4 sets aside part of these training rows for validation.
model.fit(X_train_dv, y_train, epochs=50, batch_size=32, validation_split=0.4, verbose=1)


In [None]:
# Evaluate the Keras model on the encoded test split.
model.evaluate(X_test_dv, y_test)

Tensorflow did a fine job overall.

Overall, Random Forest Classifier did an excellent job in predicting the diagnosis.

# Testing

The cell block should be run only when running `predict.py` or when the model is deployed via docker or online.

In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.model_selection import train_test_split

def clean_data(df, max_null_pct=50):
    '''
    Cleans the data before it is used for training or testing our model.

    Steps, in order:
      1. drop columns in which more than `max_null_pct` percent of the rows are missing;
      2. drop rows missing Management, Severity or Diagnosis, and rows with fewer than
         5 non-null values;
      3. fill numerical columns with the column mean (the listed columns are cast to
         int) and non-numerical columns with the most frequent value;
      4. remove the 'constipation, diarrhea' and 'simultaneous appendectomy' rows;
      5. select the model feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Features and targets concatenated column-wise. Must contain the feature columns
        listed in the body plus Management, Severity and Diagnosis.
    max_null_pct : float, default 50
        Highest percentage of missing rows a column may have and still be kept.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        The feature frame and the matching Diagnosis column.

    Raises
    ------
    KeyError
        If a target column or a model feature column is absent after step 1.
    '''
    feature_cols = [
        'Weight', 'Sex', 'Length_of_Stay', 'Appendix_on_US', 'Migratory_Pain', 'Lower_Right_Abd_Pain',
        'Contralateral_Rebound_Tenderness', 'Coughing_Pain', 'Nausea', 'Loss_of_Appetite', 'Body_Temperature',
        'WBC_Count', 'RBC_Count', 'Hemoglobin', 'RDW', 'Thrombocyte_Count', 'CRP', 'Stool', 'Peritonitis', 'US_Number',
    ]
    target_cols = ['Management', 'Severity', 'Diagnosis']
    int_cols = ['Length_of_Stay', 'Thrombocyte_Count', 'CRP', 'US_Number', 'Age']

    # Missing share per column, relative to the number of ROWS. Dividing the null count by
    # the number of columns (as before) is not a percentage of missing values and can drop
    # a well-populated column whenever the frame has few rows relative to columns.
    null_pct = df.isna().mean() * 100
    keep_cols = null_pct.index[~(null_pct > max_null_pct)]
    df_new = df.loc[:, keep_cols].copy()

    num_cols = df_new.select_dtypes(include='number').columns
    cat_cols = df_new.select_dtypes(exclude='number').columns

    # Rows without all three labels, or with almost no data, are unusable.
    has_targets = df_new[target_cols].notna().all(axis=1)
    df_new = df_new.loc[has_targets].dropna(thresh=5).copy()

    # Impute numerical columns with the mean.
    for col in num_cols:
        df_new[col] = df_new[col].fillna(df_new[col].mean())
        if col in int_cols:
            df_new[col] = df_new[col].astype(int)

    # Impute non-numerical columns with the mode (skipped if the column has no values).
    for col in cat_cols:
        df_new[col] = df_new[col].astype('category')
        modes = df_new[col].mode()
        if not modes.empty:
            df_new[col] = df_new[col].fillna(modes.iloc[0])

    # Remove outlier values. Both masks come from df_new itself; masking df_new with a
    # Series taken from the raw `df` only worked through index alignment.
    keep_rows = (
        (df_new['Stool'] != 'constipation, diarrhea')
        & (df_new['Management'] != 'simultaneous appendectomy')
    )
    df_new = df_new.loc[keep_rows, :]

    target = df_new.loc[:, 'Diagnosis']
    return df_new[feature_cols], target

# Fetch the dataset (UCI repository id 938) and rebuild the combined frame.
regensburg_pediatric_appendicitis = fetch_ucirepo(id=938)
dataset = regensburg_pediatric_appendicitis.data
X, y = dataset.features, dataset.targets
df = pd.concat([X, y], axis=1)

# Clean the frame and separate the model features from the Diagnosis target.
X, target = clean_data(df)

# 60/40 split with seed 42; only the test portion is kept.
_, X_test, _, y_test = train_test_split(X, target, train_size=0.6, random_state=42)

In [2]:
import requests 
import random 
import pickle

# NOTE(review): pickle.load can execute arbitrary code contained in the file; only load a
# 'le.bin' that this project produced itself.
with open('le.bin', 'rb') as f:
    labelencoder = pickle.load(f)

url = "http://10.0.0.7:9696/predict"

# Pick a random row that is guaranteed to exist. The previous hard-coded upper bound (311)
# raises IndexError as soon as the test split has fewer rows, and never samples the rest
# when it has more.
choice = random.randrange(len(X_test))
sample = X_test.iloc[choice, :].to_dict()

# A timeout keeps the cell from hanging forever when the prediction service is unreachable.
response = requests.post(url, json=sample, timeout=10).json()

print(sample, '\n\n', response, '\n\n' "Actual result is ", y_test.iloc[choice])

{'Weight': 48.0, 'Sex': 'male', 'Length_of_Stay': 3, 'Appendix_on_US': 'no', 'Migratory_Pain': 'no', 'Lower_Right_Abd_Pain': 'yes', 'Contralateral_Rebound_Tenderness': 'yes', 'Coughing_Pain': 'yes', 'Nausea': 'no', 'Loss_of_Appetite': 'no', 'Body_Temperature': 38.8, 'WBC_Count': 11.4, 'RBC_Count': 5.23, 'Hemoglobin': 13.8, 'RDW': 13.5, 'Thrombocyte_Count': 276, 'CRP': 1, 'Stool': 'normal', 'Peritonitis': 'no', 'US_Number': 413} 

 {'probability': 1.0, 'result': 'Person does not have appenditic'} 

Actual result is  no appendicitis
