In [2]:
import pandas as pd
import re
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib
import os

# Load the dataset CSV (decoded as Latin-1) and preview the first rows.
print("📂 Loading data...")
df = pd.read_csv(
    filepath_or_buffer="data/traineddf_with_level1.csv",
    encoding='latin1',
)
df.head(n=5)


📂 Loading data...


Unnamed: 0,Group,Sex,Age,Patients.number.per.hour,Arrival.mode,Injury,Chief_complain,Mental,Pain,NRS_pain,...,Saturation,KTAS_RN,Diagnosis.in.ED,Disposition,KTAS_expert,Error_group,Length.of.stay_min,KTAS.duration_min,mistriage,mistriage.....
0,2,2,71,3.0,3,2,right ocular pain,1,1,2.0,...,100,2,Corneal abrasion,1,4,2,86.0,5.0,,
1,2,1,68,8.0,2,2,"arm pain, Lt",1,1,2.0,...,98,4,"Fracture of surgical neck of humerus, closed",2,5,4,862.0,1.0,,
2,2,1,54,6.0,4,1,fever,1,1,3.0,...,98,3,"Fever, unspecified",2,4,1,9246.0,2.0,,
3,2,2,49,11.0,3,1,With chest discomfort,1,1,3.0,...,98,2,"Angina pectoris, unspecified",1,3,2,400.0,3.0,,
4,2,1,38,6.0,3,1,"Eczema, Eyelid",1,1,3.0,...,97,4,Ocular pain,1,5,4,185.0,4.0,,


In [3]:
# Build one free-text feature from the chief complaint and the ED diagnosis.
# Missing values become empty strings so the concatenation never yields NaN.
# NOTE(review): "Diagnosis.in.ED" is presumably recorded after triage — confirm
# it is available at prediction time; otherwise this leaks post-triage data.
chief_complaint = df["Chief_complain"].fillna("")
ed_diagnosis = df["Diagnosis.in.ED"].fillna("")
df["symptoms_text"] = (chief_complaint + " " + ed_diagnosis).str.strip()

# Keep the model inputs plus the label, then discard rows whose text is empty,
# whose label is missing, or that contain any other missing value.
train_df = (
    df[["symptoms_text", "Age", "Sex", "KTAS_expert"]]
    .copy()
    .loc[lambda frame: (frame["symptoms_text"] != "") & frame["KTAS_expert"].notna()]
    .dropna()
)

print(f"✅ Training samples: {len(train_df)}")

# Function to clean text
def clean_text(text):
    """Normalise a free-text value for vectorisation.

    The input is lower-cased, every character that is not an ASCII letter,
    a digit, or whitespace is replaced by a space, runs of whitespace are
    collapsed to a single space, and leading/trailing whitespace is removed.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or "" when ``text`` is not a ``str``.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # Punctuation and other symbols become spaces rather than being
        # deleted, so "arm,pain" does not fuse into "armpain".
        symbols_removed = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
        return re.sub(r'\s+', ' ', symbols_removed).strip()
    return ""

# Store the normalised text in a new column; "symptoms_text" is left untouched
# so the raw and cleaned versions can be compared side by side.
train_df["symptoms_clean"] = train_df["symptoms_text"].map(clean_text)

# Preview the first rows of the result.
train_df.head(n=5)


✅ Training samples: 359


Unnamed: 0,symptoms_text,Age,Sex,KTAS_expert,symptoms_clean
0,right ocular pain Corneal abrasion,71,2,4,right ocular pain corneal abrasion
1,"arm pain, Lt Fracture of surgical neck of hume...",68,1,5,arm pain lt fracture of surgical neck of humer...
2,"fever Fever, unspecified",54,1,4,fever fever unspecified
3,"With chest discomfort Angina pectoris, unspeci...",49,2,3,with chest discomfort angina pectoris unspecified
4,"Eczema, Eyelid Ocular pain",38,1,5,eczema eyelid ocular pain


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import scipy.sparse as sp

# Features & labels
X_text = train_df["symptoms_clean"]
X_meta = train_df[["Age", "Sex"]]
y = train_df["KTAS_expert"].astype(int)

# Split into train and test sets (80% train, 20% test).
# Text, metadata and labels go through one call so their rows stay aligned;
# stratifying on y keeps the class proportions similar in both splits.
X_train_text, X_test_text, X_train_meta, X_test_meta, y_train, y_test = train_test_split(
    X_text, X_meta, y, test_size=0.2, random_state=42, stratify=y
)

print("📈 Vectorizing text with TF-IDF...")
# TF-IDF vectorizer: unigrams and bigrams, English stop words removed, terms
# kept only if they occur in at least 2 documents and at most 95% of them.
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.95
)

# Fit on training text only, then transform both splits, so the vocabulary and
# IDF weights are learned without looking at the test set.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Combine TF-IDF features with metadata.
# The metadata is converted to an explicit float array (fails loudly on
# non-numeric values instead of silently building an object matrix), and CSR
# is requested because hstack would otherwise return a COO matrix.
# NOTE(review): "Age" is appended unscaled next to TF-IDF weights — consider
# scaling it, which would also require the same transform at inference time.
X_train_full = sp.hstack(
    [X_train_tfidf, X_train_meta.to_numpy(dtype=float)], format="csr"
)
X_test_full = sp.hstack(
    [X_test_tfidf, X_test_meta.to_numpy(dtype=float)], format="csr"
)

# Sanity checks: one feature row per label, identical columns in both splits.
assert X_train_full.shape[0] == len(y_train), "train features/labels misaligned"
assert X_test_full.shape[0] == len(y_test), "test features/labels misaligned"
assert X_train_full.shape[1] == X_test_full.shape[1], "train/test feature width differs"

print("✅ Feature extraction complete.")
print(f"Training samples: {X_train_full.shape[0]}, Test samples: {X_test_full.shape[0]}")


📈 Vectorizing text with TF-IDF...
✅ Feature extraction complete.
Training samples: 287, Test samples: 72


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

print("🧠 Training Logistic Regression model...")

# Initialize model.
# `multi_class` is deliberately not passed: with the lbfgs solver and more than
# two classes scikit-learn already fits a multinomial model, and the explicit
# argument is deprecated in recent releases (it only triggers a FutureWarning).
model = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

# Train
model.fit(X_train_full, y_train)

# Predict on test set
y_pred = model.predict(X_test_full)

# Evaluate.
# zero_division=0 reports 0.00 for classes that are never predicted, without
# the UndefinedMetricWarning that the default setting emits for them.
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Save model + TF-IDF + metadata columns in one artifact so that inference can
# rebuild the same feature layout (TF-IDF columns followed by meta_cols).
MODEL_PATH = "backend/app/models/triage_nlp_model.joblib"
# Create the target directory first; joblib.dump does not create parent folders.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump({
    'tfidf': tfidf,
    'model': model,
    'meta_cols': ["Age", "Sex"]
}, MODEL_PATH)

print(f"\n✅ Model saved to: {MODEL_PATH}")
print("🎉 Training complete!")


🧠 Training Logistic Regression model...





📊 Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.82      0.82      0.82        17
           3       0.50      0.32      0.39        19
           4       0.61      0.79      0.69        29
           5       0.00      0.00      0.00         2

    accuracy                           0.67        72
   macro avg       0.59      0.59      0.58        72
weighted avg       0.64      0.67      0.64        72


✅ Model saved to: backend/app/models/triage_nlp_model.joblib
🎉 Training complete!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
