In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (classification_report, confusion_matrix, 
                            accuracy_score, precision_recall_fscore_support)

# For model persistence
import joblib

# For loading dataset
from datasets import load_dataset

# Seed NumPy's legacy global RNG so np.random-based randomness repeats across runs
np.random.seed(42)

# Confirmation messages: imports finished and the seed is in place
print(
    "✅ All libraries imported successfully!",
    "Random seed set to 42 for reproducibility",
    sep="\n",
)

✅ All libraries imported successfully!
Random seed set to 42 for reproducibility


In [2]:
# Section banner
print(f"\n{'=' * 70}")
print("LOADING MEDICAL DATASET")
print('=' * 70)

# Fetch the dataset from the Hugging Face Hub
dataset = load_dataset("hpe-ai/medical-cases-classification-tutorial")

# Materialise each split as a pandas DataFrame (train / validation / test order)
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Features come from the 'transcription' column, targets from 'medical_specialty'
X_train, y_train = train_df['transcription'], train_df['medical_specialty']
X_val, y_val = val_df['transcription'], val_df['medical_specialty']
X_test, y_test = test_df['transcription'], test_df['medical_specialty']

# Summary of split sizes and number of target classes
print("\n✅ Dataset loaded successfully!")
print(f"\nTraining set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Total unique specialties: {y_train.nunique()}")

# Per-class counts in the training split (reveals class imbalance)
print("\nClass distribution in training set:")
print(y_train.value_counts())


LOADING MEDICAL DATASET


Repo card metadata block was not found. Setting CardData to empty.



✅ Dataset loaded successfully!

Training set: 1724 samples
Validation set: 370 samples
Test set: 370 samples
Total unique specialties: 13

Class distribution in training set:
medical_specialty
Cardiovascular / Pulmonary    526
Orthopedic                    289
Neurology                     187
Gastroenterology              152
Obstetrics / Gynecology       126
Hematology - Oncology          86
Neurosurgery                   76
ENT - Otolaryngology           53
Pediatrics - Neonatal          51
Psychiatry / Psychology        49
Nephrology                     45
Ophthalmology                  45
Radiology                      39
Name: count, dtype: int64


## Load and Prepare Dataset
<!-- Purpose: Load the medical dataset and prepare train/validation/test splits
Reusing the same data preparation from Notebook 1 -->

In [3]:
# Section banner
print(f"\n{'=' * 70}")
print("BUILDING BASELINE MODEL PIPELINE")
print('=' * 70)

# Text -> TF-IDF features: lowercased, accent-stripped, English stop words removed,
# unigrams + bigrams, vocabulary capped at 5000 terms, and terms dropped when they
# occur in fewer than 2 documents or in more than 95% of them
vectorizer = TfidfVectorizer(
    strip_accents='ascii',
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)

# Softmax (multinomial logistic) regression classifier.
# NOTE(review): `multi_class` is reported as deprecated in recent scikit-learn
#   releases - confirm against the installed version.
# NOTE(review): scikit-learn documents `n_jobs` as parallelising over classes for
#   one-vs-rest; it may have no effect with 'multinomial' - confirm.
classifier = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=1000,
    n_jobs=-1,
    random_state=42,
)

# Chain both stages so fit/predict run vectorisation and classification together
baseline_pipeline = Pipeline(steps=[
    ('tfidf', vectorizer),
    ('classifier', classifier),
])

# Describe what was built
print(
    "\n✅ Baseline pipeline created successfully!",
    "\nPipeline steps:",
    "  1. TfidfVectorizer: Converts text to numerical features",
    "  2. LogisticRegression: Trains multinomial classification model",
    sep="\n",
)

# Echo the key hyperparameters straight from the estimator objects
print("\nModel configuration:")
print(
    f"  • Multi-class strategy: {classifier.multi_class}",
    f"  • Solver: {classifier.solver}",
    f"  • Max iterations: {classifier.max_iter}",
    f"  • TF-IDF max features: {vectorizer.max_features}",
    f"  • N-gram range: {vectorizer.ngram_range}",
    sep="\n",
)


BUILDING BASELINE MODEL PIPELINE

✅ Baseline pipeline created successfully!

Pipeline steps:
  1. TfidfVectorizer: Converts text to numerical features
  2. LogisticRegression: Trains multinomial classification model

Model configuration:
  • Multi-class strategy: multinomial
  • Solver: lbfgs
  • Max iterations: 1000
  • TF-IDF max features: 5000
  • N-gram range: (1, 2)


## Create Baseline Model Pipeline
<!-- Purpose: Build a complete pipeline with TfidfVectorizer + Logistic Regression
This creates a baseline model before hyperparameter tuning -->

In [4]:
print("\n" + "="*70)
print("TRAINING BASELINE MODEL")
print("="*70)

# Report the real training-set size instead of a hard-coded figure, so the
# message stays correct if the dataset or split changes
print(f"\nTraining model on {len(X_train):,} documents...")
print("This may take 1-2 minutes...\n")

# Fit the whole pipeline (TF-IDF step, then classifier) on the training split
baseline_pipeline.fit(X_train, y_train)

print("✅ Model training complete!")

# Number of terms retained by the fitted TF-IDF step
vocab_size = len(baseline_pipeline.named_steps['tfidf'].vocabulary_)
print(f"\nVocabulary size: {vocab_size} unique terms extracted")


TRAINING BASELINE MODEL

Training model on 1,720 documents...
This may take 1-2 minutes...

✅ Model training complete!

Vocabulary size: 5000 unique terms extracted


## Train Baseline Model
<!-- Purpose: Fit the baseline model on training data
Establishes initial performance before optimization -->

In [None]:
print("\n" + "="*70)
print("BASELINE MODEL - TRAINING SET PERFORMANCE")
print("="*70)

# Predict on the same data the model was fitted on
y_train_pred = baseline_pipeline.predict(X_train)

# Fraction of training documents classified correctly
train_accuracy = accuracy_score(y_train, y_train_pred)

print(f"\nTraining Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")

# Rough heuristic bands: very high training accuracy can hint at memorisation,
# very low at underfitting; anything in between is reported as healthy
if train_accuracy > 0.95:
    print("⚠️  High training accuracy (>95%) - monitor for overfitting")
elif train_accuracy < 0.70:
    print("⚠️  Low training accuracy (<70%) - model may be underfitting")
else:
    print("✅ Training accuracy in healthy range (70-95%)")

# Per-class precision / recall / F1 - the cell's saved output shows this report,
# so it is printed here to keep code and output in agreement
print("\nClassification Report (Training Set):")
print(classification_report(y_train, y_train_pred))




BASELINE MODEL - TRAINING SET PERFORMANCE

Training Accuracy: 0.8730 (87.30%)
✅ Training accuracy in healthy range (70-95%)

Classification Report (Training Set):
                            precision    recall  f1-score   support

Cardiovascular / Pulmonary       0.90      0.99      0.94       526
      ENT - Otolaryngology       1.00      0.83      0.91        53
          Gastroenterology       0.91      0.97      0.94       152
     Hematology - Oncology       0.88      0.73      0.80        86
                Nephrology       1.00      0.53      0.70        45
                 Neurology       0.77      0.83      0.80       187
              Neurosurgery       0.76      0.42      0.54        76
   Obstetrics / Gynecology       0.94      0.95      0.95       126
             Ophthalmology       1.00      0.89      0.94        45
                Orthopedic       0.81      0.95      0.87       289
     Pediatrics - Neonatal       0.93      0.75      0.83        51
   Psychiatry / Psy

## Baseline Model - Training Set Performance
<!-- Purpose: Evaluate model performance on training data.
Compared with validation accuracy, this helps identify whether the model is learning patterns or memorizing the training set -->

In [6]:
# Section banner
print(f"\n{'=' * 70}")
print("BASELINE MODEL - VALIDATION SET PERFORMANCE")
print('=' * 70)

# Score the fitted pipeline on the held-out validation split
y_val_pred = baseline_pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"\nValidation Accuracy: {val_accuracy:.4f} ({val_accuracy*100:.2f}%)")

# Generalisation gap: how much accuracy drops from training data to validation data
accuracy_gap = train_accuracy - val_accuracy
print(f"\nAccuracy Gap (Train - Val): {accuracy_gap:.4f}")

# A negative gap is flagged as suspicious, a gap above 0.10 as likely overfitting,
# and anything from 0 to 0.10 as acceptable
print(
    "⚠️  Validation accuracy higher than training - unusual, check data"
    if accuracy_gap < 0
    else "⚠️  Large gap (>10%) suggests overfitting"
    if accuracy_gap > 0.10
    else "✅ Reasonable generalization gap (<10%)"
)


BASELINE MODEL - VALIDATION SET PERFORMANCE

Validation Accuracy: 0.7486 (74.86%)

Accuracy Gap (Train - Val): 0.1243
⚠️  Large gap (>10%) suggests overfitting


## Baseline Model - Validation Set Performance
<!-- Purpose: Evaluate model on unseen validation data
More realistic measure of model performance -->

In [7]:
print("\n" + "="*70)
print("CROSS-VALIDATION EVALUATION")
print("="*70)

print("\nPerforming 5-fold cross-validation on training data...")
print("This may take 3-5 minutes...\n")

# Refit the pipeline on each of 5 train/held-out partitions of the training set
# and collect the held-out accuracy for each fold
cv_scores = cross_val_score(
    baseline_pipeline,
    X_train,
    y_train,
    cv=5,                    # 5 folds
    scoring='accuracy',      # Metric to use
    n_jobs=-1                # Run folds in parallel on all CPU cores
)

# Compute the summary statistics once and reuse them below
cv_mean = cv_scores.mean()
cv_std = cv_scores.std()

print("✅ Cross-validation complete!")
print("\nCross-Validation Scores (5 folds):")
print("-"*70)
for i, score in enumerate(cv_scores, 1):
    print(f"  Fold {i}: {score:.4f} ({score*100:.2f}%)")

print("\n" + "-"*70)
print(f"Mean CV Accuracy: {cv_mean:.4f} ({cv_mean*100:.2f}%)")
print(f"Std Dev: {cv_std:.4f} (±{cv_std*100:.2f}%)")
# mean ± 1.96·std describes the approximate spread of individual fold scores under
# a normality assumption. It is NOT a confidence interval for the mean (that would
# use the standard error std/sqrt(n) and a t critical value), so label it as a range.
print(f"Approx. 95% range of fold scores: {cv_mean:.4f} ± {1.96*cv_std:.4f}")

# Stability heuristic: less than 2 accuracy points of fold-to-fold std is "stable"
if cv_std < 0.02:
    print("\n✅ Low variance across folds - stable model")
else:
    print("\n⚠️  High variance across folds - consider more data or regularization")


CROSS-VALIDATION EVALUATION

Performing 5-fold cross-validation on training data...
This may take 3-5 minutes...

✅ Cross-validation complete!

Cross-Validation Scores (5 folds):
----------------------------------------------------------------------
  Fold 1: 0.7768 (77.68%)
  Fold 2: 0.7652 (76.52%)
  Fold 3: 0.7333 (73.33%)
  Fold 4: 0.7565 (75.65%)
  Fold 5: 0.7558 (75.58%)

----------------------------------------------------------------------
Mean CV Accuracy: 0.7575 (75.75%)
Std Dev: 0.0143 (±1.43%)
95% Confidence Interval: 0.7575 ± 0.0280

✅ Low variance across folds - stable model


## Cross-Validation Evaluation
<!-- Purpose: Get robust performance estimate using 5-fold cross-validation
Reduces variance in performance estimation -->