In [None]:
#imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier

#visualise performance
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

#models
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, accuracy_score

In [None]:
# Load the dataset and shuffle its rows
df = pd.read_csv('dataP.csv')
# Rows without an urgency value are given urgency 0
df['urgency'] = df['urgency'].fillna(0)

# Shuffle with a fixed seed: the seeded train/test split and K-fold splits
# below select rows by position, so an unseeded shuffle would make every
# reported score change from run to run.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

df['urgency'].value_counts()

In [None]:
# Cast both target columns to strings so they can be stored in the same array
for target_column in ('referal_type', 'urgency'):
    df[target_column] = df[target_column].astype(str)

# Features: the letter text. Targets: the two label columns.
X = df['letter_text']
y = df[['referal_type', 'urgency']]

# Hold out 25% of the rows as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [None]:
def universal_ML_model(ML, **kwargs):
  """Build an unfitted TF-IDF + multi-output classification pipeline.

  Args:
    ML: estimator class (not an instance); it is called as ML(**kwargs).
    **kwargs: keyword arguments forwarded to the estimator constructor.

  Returns:
    A Pipeline with a 'tfidf' step (default TfidfVectorizer) followed by a
    'clf' step (MultiOutputClassifier wrapping the new estimator). The
    pipeline is returned unfitted.
  """
  classifier = MultiOutputClassifier(ML(**kwargs))
  vectorizer = TfidfVectorizer()
  return Pipeline(steps=[('tfidf', vectorizer), ('clf', classifier)])

In [None]:
def evaluate_multi_output_model(model, X_test, y_test, label_names=['referal_type', 'urgency']):
  """Print a classification report and plot a confusion matrix per output.

  Args:
    model: an already fitted multi-output model; only model.predict is used.
    X_test: inputs passed to model.predict.
    y_test: true labels, indexable by each name in label_names.
    label_names: names of the outputs, in the column order of the
      predictions returned by model.predict.

  Returns:
    None. Reports are printed and one figure is shown.
  """
  label_names = list(label_names)

  y_pred_df = pd.DataFrame(model.predict(X_test), columns = label_names)

  for label in label_names:
    print(f"\n=== Classifcation Report: {label.capitalize()} ===")
    print(classification_report(y_test[label], y_pred_df[label], zero_division = 0))

  # One panel per output; squeeze=False keeps `axes` two-dimensional even
  # when there is a single output.
  n_labels = len(label_names)
  fig, axes = plt.subplots(1, n_labels, figsize = (8 * n_labels, 6), squeeze = False)
  fig.suptitle('Model Performance Visualisation', fontsize = 16)

  for i, label in enumerate(label_names):
    ax = axes[0, i]
    # Use every class seen in either the truth or the predictions, so a
    # prediction of a class absent from y_test is not dropped from the matrix.
    classes = np.unique(np.concatenate([np.asarray(y_test[label]),
                                        np.asarray(y_pred_df[label])]))
    cm = confusion_matrix(y_test[label], y_pred_df[label], labels = classes)
    sns.heatmap(cm, annot = True, fmt = 'd', cmap = 'Blues' if i == 0 else 'Reds', ax = ax,
                xticklabels = classes, yticklabels = classes)
    # Title each panel with its own output name (was always "Urgency")
    ax.set_title(f"Confusion Matrix ({label.replace('_', ' ').title()})")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")

  plt.tight_layout(rect=[0, 0, 1, 0.96])
  plt.show()

# Full Dataset

## SVM

In [None]:
# build the multi output model
SVM = universal_ML_model(SVC, kernel = 'linear', probability = True, random_state = 1)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
SVM.fit(X_train, y_train)
evaluate_multi_output_model(SVM, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:

# 5-fold cross-validation of the SVM pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    SVM.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(SVM.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal SVM Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")


## k-NN

In [None]:
# Build the k-NN pipeline
kNN = universal_ML_model(KNeighborsClassifier, n_neighbors = 5)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
kNN.fit(X_train, y_train)
evaluate_multi_output_model(kNN, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the k-NN pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

# Same seed as the other models' cross-validation cells (42) so that all
# models are scored on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    kNN.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(kNN.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal kNN Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

## Random Forest

In [None]:
# Build the random-forest pipeline
RF = universal_ML_model(RandomForestClassifier, n_estimators=250, random_state=1)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
RF.fit(X_train, y_train)
evaluate_multi_output_model(RF, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the random-forest pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\n=== Fold {fold} ===")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    RF.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(RF.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal RF Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

## MLP

In [None]:
# Build the MLP pipeline (three hidden layers of 100, 50 and 25 units)
MLP = universal_ML_model(
    MLPClassifier,
    hidden_layer_sizes=(100,50,25),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=1
)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
MLP.fit(X_train, y_train)
evaluate_multi_output_model(MLP, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the MLP pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    MLP.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(MLP.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal MLP Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

# Vascular-Only Dataset

In [None]:
# Keep only the rows whose referral type is not 'non vascular'.
# .copy() makes the result an independent frame, so the column assignments
# that follow operate on real data rather than on a slice of the full frame
# (avoids pandas' SettingWithCopyWarning).
df = df[df['referal_type'] != 'non vascular'].copy()
df['referal_type'].value_counts()

In [None]:
# Cast both target columns to strings so they can be stored in the same array
for target_column in ('referal_type', 'urgency'):
    df[target_column] = df[target_column].astype(str)

# Features: the letter text. Targets: the two label columns.
X = df['letter_text']
y = df[['referal_type', 'urgency']]

# Hold out 25% of the rows as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)
# Show which referral types are present in the hold-out set
print(np.unique(y_test['referal_type']))

## SVM

In [None]:
# build the multi output model
SVM = universal_ML_model(SVC, kernel = 'linear', probability = True, random_state = 1)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
SVM.fit(X_train, y_train)
evaluate_multi_output_model(SVM, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the SVM pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    SVM.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(SVM.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal SVM Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

## k-NN

In [None]:
# Build the k-NN pipeline
kNN = universal_ML_model(KNeighborsClassifier, n_neighbors = 5)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
kNN.fit(X_train, y_train)
evaluate_multi_output_model(kNN, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the k-NN pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\nFold {fold}")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the pipeline on this fold's training part and predict the rest
    kNN.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(kNN.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary
print("\nFinal kNN Results")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

## Random Forest

In [None]:
# Build the random-forest pipeline
RF = universal_ML_model(RandomForestClassifier, n_estimators=250, random_state=1)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
RF.fit(X_train, y_train)
evaluate_multi_output_model(RF, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the random-forest pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\n=== Fold {fold} ===")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split must not be overwritten, later cells still use them.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the random-forest pipeline on this fold's training part
    RF.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(RF.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary (header names the model that was actually cross-validated here)
print("\n=== Final RF Results ===")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")

## MLP

In [None]:
# Build the MLP pipeline (three hidden layers of 100, 50 and 25 units)
MLP = universal_ML_model(
    MLPClassifier,
    hidden_layer_sizes=(100,50,25),
    activation='relu',
    solver='adam',
    max_iter=500,
    random_state=1
)
# universal_ML_model returns an unfitted pipeline and the evaluation helper
# only calls predict(), so the model has to be fitted here first.
MLP.fit(X_train, y_train)
evaluate_multi_output_model(MLP, X_test, y_test)

### 5-Fold Cross-Validation

In [None]:
# 5-fold cross-validation of the MLP pipeline: macro-F1 and accuracy per output
f1_scores_type = []
f1_scores_urgency = []
acc_scores_type = []
acc_scores_urgency = []

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    print(f"\n=== Fold {fold} ===")

    # Fold-local names: the hold-out X_train/X_test/y_train/y_test created by
    # train_test_split are left untouched by this loop.
    X_fold_train, X_fold_test = X.iloc[train_idx], X.iloc[test_idx]
    y_fold_train, y_fold_test = y.iloc[train_idx], y.iloc[test_idx]

    # Refit the MLP pipeline on this fold's training part
    MLP.fit(X_fold_train, y_fold_train)
    y_pred_df = pd.DataFrame(MLP.predict(X_fold_test), columns=y.columns)

    # Compute F1 and Accuracy per output (zero_division=0 matches the
    # classification reports printed by evaluate_multi_output_model)
    f1_type = f1_score(y_fold_test['referal_type'], y_pred_df['referal_type'],
                       average='macro', zero_division=0)
    f1_urg = f1_score(y_fold_test['urgency'], y_pred_df['urgency'],
                      average='macro', zero_division=0)
    acc_type = accuracy_score(y_fold_test['referal_type'], y_pred_df['referal_type'])
    acc_urg = accuracy_score(y_fold_test['urgency'], y_pred_df['urgency'])

    f1_scores_type.append(f1_type)
    f1_scores_urgency.append(f1_urg)
    acc_scores_type.append(acc_type)
    acc_scores_urgency.append(acc_urg)

    print(f"Referral Type → F1: {f1_type:.4f}, Acc: {acc_type:.4f}")
    print(f"Urgency       → F1: {f1_urg:.4f}, Acc: {acc_urg:.4f}")

# Summary (header names the model that was actually cross-validated here)
print("\n=== Final MLP Results ===")
print("Referral Type F1:", f1_scores_type)
print("Referral Type Acc:", acc_scores_type)
print(f"Mean F1: {np.mean(f1_scores_type):.4f}, Mean Acc: {np.mean(acc_scores_type):.4f}")

print("\nUrgency F1:", f1_scores_urgency)
print("Urgency Acc:", acc_scores_urgency)
print(f"Mean F1: {np.mean(f1_scores_urgency):.4f}, Mean Acc: {np.mean(acc_scores_urgency):.4f}")