In [3]:
# Step 1: Import Libraries
import pandas as pd
import spacy
import joblib
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Create the output folder for saved models up front (no-op when it already exists).
os.makedirs("../models", exist_ok=True)

# Read the four preprocessed CSV exports, in the same order as before, and bind
# each resulting DataFrame to its own module-level name.
df_admission, df_days_breakdown, df_hospital_los, df_medications = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/Preprocessed_Data-at-Admission.csv",
        "../data/Preprocessed_Days-Breakdown.csv",
        "../data/Preprocessed_Hospital-LoS.csv",
        "../data/Preprocessed_Medications.csv",
    )
)

# Step 2: Train Intent Classification Model
# Toy training set: exactly ONE example query per intent label.
intent_data = {
    "intent": [
        "admission_details",
        "medication_details",
        "hospital_stay",
        "diagnostic_results",
        "discharge_summary",
    ],
    "example_query": [
        "What was my reason for admission?",
        "Which medications am I on?",
        "How long was my hospital stay?",
        "What did my X-ray show?",
        "Was I discharged successfully?",
    ],
}

df_intent = pd.DataFrame(intent_data)

# Preprocess intent queries (lower-case the text in place)
df_intent["example_query"] = df_intent["example_query"].str.lower()

# Features are the raw query strings; targets are the intent labels.
X = df_intent["example_query"]
y = df_intent["intent"]

# The split is kept only so that X_train / X_test / y_train / y_test stay
# defined for any later code that refers to them. With one example per intent,
# the held-out row's label is absent from X_train, so fitting on X_train alone
# would produce a model that can never predict that intent.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
intent_pipeline = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("classifier", MultinomialNB()),
])

# Fit on ALL examples so every intent label is known to the classifier.
# NOTE: X_test is therefore part of the training data and must not be used as
# an independent evaluation set; add more examples per intent before measuring
# accuracy on a held-out split.
intent_pipeline.fit(X, y)

# Save intent classifier
intent_model_path = "../models/intent_classifier.joblib"
joblib.dump(intent_pipeline, intent_model_path)
print(f"✅ Intent classifier trained and saved at {intent_model_path}")

# Step 3: Train Named Entity Recognition (NER) Model using spaCy
nlp = spacy.blank("en")  # Create a blank English NLP model

# Obtain the NER component: reuse it when the pipeline already has one,
# otherwise add it. Binding `ner` in both branches guarantees the label loop
# below never hits an undefined name.
if "ner" in nlp.pipe_names:
    ner = nlp.get_pipe("ner")
else:
    ner = nlp.add_pipe("ner")

# Define custom medical entities
medical_entities = [
    "ADMISSION_REASON", "MEDICATION", "HOSPITAL_STAY", "DIAGNOSIS_RESULT", "DISCHARGE_STATUS"
]

# Register every custom label with the NER component
for entity in medical_entities:
    ner.add_label(entity)

# Initialize the model.
# NOTE: no training examples are supplied and nlp.update() is never called in
# this block, so the saved pipeline only knows the label set; it has not
# learned to recognise any entities yet.
nlp.initialize()

# Save the NER model (the whole pipeline is pickled via joblib).
# NOTE(review): spaCy's documented persistence API is nlp.to_disk() /
# spacy.load(); joblib is kept here so existing joblib.load() consumers of this
# path keep working -- confirm before switching formats.
ner_model_path = "../models/ner_model.joblib"
joblib.dump(nlp, ner_model_path)
print(f"✅ NER model trained and saved at {ner_model_path}")


✅ Intent classifier trained and saved at ../models/intent_classifier.joblib
✅ NER model trained and saved at ../models/ner_model.joblib
