In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import (classification_report, confusion_matrix, 
                            accuracy_score, precision_recall_fscore_support)

# For model persistence
import joblib

# For loading dataset
from datasets import load_dataset

# Seed NumPy's global random generator so repeated runs of the notebook
# draw the same pseudo-random numbers.
np.random.seed(seed=42)

# Emit both status lines with a single call (one line each).
print(
    "✅ All libraries imported successfully!",
    "Random seed set to 42 for reproducibility",
    sep="\n",
)

✅ All libraries imported successfully!
Random seed set to 42 for reproducibility


In [2]:
# Section banner for the data-loading stage
rule = "=" * 70
print("\n" + rule)
print("LOADING MEDICAL DATASET")
print(rule)

# Load the dataset from Hugging Face
dataset = load_dataset("hpe-ai/medical-cases-classification-tutorial")

# Materialise the three predefined splits as pandas DataFrames
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# The 'transcription' column is the model input (X);
# the 'medical_specialty' column is the target label (y).
X_train, y_train = train_df['transcription'], train_df['medical_specialty']
X_val, y_val = val_df['transcription'], val_df['medical_specialty']
X_test, y_test = test_df['transcription'], test_df['medical_specialty']

# Summary of split sizes and number of target classes
print("\n✅ Dataset loaded successfully!")
print(f"\nTraining set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Total unique specialties: {y_train.nunique()}")

# Per-class sample counts in the training split
print("\nClass distribution in training set:")
print(y_train.value_counts())


LOADING MEDICAL DATASET


Repo card metadata block was not found. Setting CardData to empty.



✅ Dataset loaded successfully!

Training set: 1724 samples
Validation set: 370 samples
Test set: 370 samples
Total unique specialties: 13

Class distribution in training set:
medical_specialty
Cardiovascular / Pulmonary    526
Orthopedic                    289
Neurology                     187
Gastroenterology              152
Obstetrics / Gynecology       126
Hematology - Oncology          86
Neurosurgery                   76
ENT - Otolaryngology           53
Pediatrics - Neonatal          51
Psychiatry / Psychology        49
Nephrology                     45
Ophthalmology                  45
Radiology                      39
Name: count, dtype: int64


## Load and Prepare Dataset
<!-- Purpose: Load the medical dataset and prepare the train/validation/test splits,
reusing the same data preparation as Notebook 1. -->

In [3]:
print("\n" + "="*70)
print("BUILDING BASELINE MODEL PIPELINE")
print("="*70)

# Create TfidfVectorizer (same configuration as Notebook 1)
vectorizer = TfidfVectorizer(
    lowercase=True,          # Normalise case before tokenising
    stop_words='english',    # Drop common English stop words
    max_features=5000,       # Keep only the 5000 most frequent terms
    ngram_range=(1, 2),      # Use unigrams and bigrams
    min_df=2,                # Ignore terms appearing in fewer than 2 documents
    max_df=0.95,             # Ignore terms appearing in more than 95% of documents
    strip_accents='ascii'    # Fold accented characters to ASCII
)

# Create Softmax Regression classifier (Multinomial Logistic Regression).
#
# `multi_class` is intentionally NOT passed: the parameter is deprecated in
# recent scikit-learn releases and scheduled for removal (the notebook's
# global warnings filter would hide that deprecation warning). With the
# 'lbfgs' solver and more than two classes, LogisticRegression already fits a
# multinomial (softmax) model, so the trained model is unchanged.
#
# `n_jobs` is also omitted: per the scikit-learn documentation it only
# parallelises one-vs-rest fits, so it does not speed up this multinomial fit.
classifier = LogisticRegression(
    solver='lbfgs',                # Optimizer algorithm
    max_iter=1000,                 # Maximum iterations for convergence
    random_state=42,               # For reproducibility
)

# Combine into a single pipeline
baseline_pipeline = Pipeline([
    ('tfidf', vectorizer),         # Step 1: Convert text to TF-IDF features
    ('classifier', classifier)     # Step 2: Train logistic regression model
])

print("\n✅ Baseline pipeline created successfully!")
print("\nPipeline steps:")
print("  1. TfidfVectorizer: Converts text to numerical features")
print("  2. LogisticRegression: Trains multinomial classification model")

print("\nModel configuration:")
# Printed as a literal rather than read from `classifier.multi_class`, because
# that attribute goes away together with the deprecated parameter.
print("  • Multi-class strategy: multinomial")
print(f"  • Solver: {classifier.solver}")
print(f"  • Max iterations: {classifier.max_iter}")
print(f"  • TF-IDF max features: {vectorizer.max_features}")
print(f"  • N-gram range: {vectorizer.ngram_range}")


BUILDING BASELINE MODEL PIPELINE

✅ Baseline pipeline created successfully!

Pipeline steps:
  1. TfidfVectorizer: Converts text to numerical features
  2. LogisticRegression: Trains multinomial classification model

Model configuration:
  • Multi-class strategy: multinomial
  • Solver: lbfgs
  • Max iterations: 1000
  • TF-IDF max features: 5000
  • N-gram range: (1, 2)


## Create Baseline Model Pipeline
<!-- Purpose: Build a complete pipeline with TfidfVectorizer + Logistic Regression.
This creates a baseline model before hyperparameter tuning. -->

In [4]:
print("\n" + "="*70)
print("TRAINING BASELINE MODEL")
print("="*70)

# Report the actual training-set size rather than a hard-coded figure, so the
# message stays correct if the dataset or split changes.
print(f"\nTraining model on {len(X_train):,} documents...")
print("This may take 1-2 minutes...\n")

# Fit the whole pipeline: the TF-IDF step learns its vocabulary from X_train,
# then the classifier is trained on the resulting features.
baseline_pipeline.fit(X_train, y_train)

print("✅ Model training complete!")

# Number of terms in the fitted TF-IDF vocabulary
vocab_size = len(baseline_pipeline.named_steps['tfidf'].vocabulary_)
print(f"\nVocabulary size: {vocab_size} unique terms extracted")


TRAINING BASELINE MODEL

Training model on 1,720 documents...
This may take 1-2 minutes...

✅ Model training complete!

Vocabulary size: 5000 unique terms extracted


## Train Baseline Model
<!-- Purpose: Fit the baseline model on the training data.
This establishes initial performance before optimization. -->