In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
import re

# Machine Learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline

# For model persistence
import joblib

# For loading dataset from Hugging Face
from datasets import load_dataset

# Confirmation banner: execution only reaches this line if every import above
# succeeded (a failed import would have raised before getting here).
print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [5]:
# Download (or read from the local cache) the pre-split medical case corpus and
# expose each split as a pandas DataFrame for the cells that follow.
print("Loading Medical Cases Classification Dataset from Hugging Face...")
print("This may take 1-2 minutes on first run (downloading ~50MB)...\n")

# Load dataset (pre-split into train/validation/test)
dataset = load_dataset("hpe-ai/medical-cases-classification-tutorial")

# Convert each split with the datasets library's own Dataset.to_pandas()
# converter rather than handing the Dataset object to the pd.DataFrame
# constructor.
train_df = dataset['train'].to_pandas()
val_df = dataset['validation'].to_pandas()
test_df = dataset['test'].to_pandas()

print("✅ Dataset loaded successfully!")
print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")
print(f"Total samples: {len(train_df) + len(val_df) + len(test_df)}")

Loading Medical Cases Classification Dataset from Hugging Face...
This may take 1-2 minutes on first run (downloading ~50MB)...



Repo card metadata block was not found. Setting CardData to empty.


✅ Dataset loaded successfully!
Training samples: 1724
Validation samples: 370
Test samples: 370
Total samples: 2464


## Load the Medical Dataset
<!-- Purpose: Download and load the medical transcription dataset from Hugging Face.
The dataset contains 2,464 medical case transcriptions (1,724 train / 370 validation / 370 test) across 13 specialties. -->

In [6]:
# Structural overview of the training split: shape, column names, a sample of
# rows, per-column dtypes / non-null counts, missing values, and summary stats.
print("\n" + "="*70)
print("DATASET STRUCTURE OVERVIEW")
print("="*70)

print(f"\nDataset shape: {train_df.shape}")
print(f"Number of features: {train_df.shape[1]}")
print(f"Feature names: {list(train_df.columns)}")

print("\n" + "-"*70)
print("First 3 samples:")
print("-"*70)
display(train_df.head(3))

print("\n" + "-"*70)
print("Dataset Information:")
print("-"*70)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_df.info()

print("\n" + "-"*70)
print("Missing Values Check:")
print("-"*70)
print(train_df.isnull().sum())

print("\n" + "-"*70)
print("Basic Statistics:")
print("-"*70)
# Rendered with display() like the head() sample above, so the summary table
# is shown as a rich table rather than as wrapped plain text.
display(train_df.describe(include='all'))


DATASET STRUCTURE OVERVIEW

Dataset shape: (1724, 5)
Number of features: 5
Feature names: ['description', 'transcription', 'sample_name', 'medical_specialty', 'keywords']

----------------------------------------------------------------------
First 3 samples:
----------------------------------------------------------------------


Unnamed: 0,description,transcription,sample_name,medical_specialty,keywords
0,Pacemaker ICD interrogation. Severe nonischem...,"PROCEDURE NOTE: , Pacemaker ICD interrogation....",Pacemaker Interrogation,Cardiovascular / Pulmonary,"cardiovascular / pulmonary, cardiomyopathy, ve..."
1,"Erythema of the right knee and leg, possible s...","PREOPERATIVE DIAGNOSES: , Erythema of the righ...",Aspiration - Knee Joint,Orthopedic,"orthopedic, knee and leg, anterolateral portal..."
2,Left cardiac catheterization with selective ri...,"PREOPERATIVE DIAGNOSIS: , Post infarct angina....",Cardiac Cath & Selective Coronary Angiography,Cardiovascular / Pulmonary,"cardiovascular / pulmonary, selective, angiogr..."



----------------------------------------------------------------------
Dataset Information:
----------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1724 entries, 0 to 1723
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   description        1724 non-null   object
 1   transcription      1724 non-null   object
 2   sample_name        1724 non-null   object
 3   medical_specialty  1724 non-null   object
 4   keywords           1109 non-null   object
dtypes: object(5)
memory usage: 67.5+ KB
None

----------------------------------------------------------------------
Missing Values Check:
----------------------------------------------------------------------
description            0
transcription          0
sample_name            0
medical_specialty      0
keywords             615
dtype: int64

---------------------------------------------------

## Initial Data Exploration
<!-- Purpose: Understand the structure and content of the dataset -->

In [7]:
# Text length profile of the training split: derive character and word counts
# for both free-text columns, then report summary statistics for each.
text_fields = ('transcription', 'description')
divider = "-"*70

print("\n" + "="*70)
print("TEXT LENGTH ANALYSIS")
print("="*70)

# Character counts are added for both fields before any word counts, so the
# derived columns are appended in the order: transcription_length,
# description_length, transcription_words, description_words.
for field in text_fields:
    train_df[f'{field}_length'] = train_df[field].str.len()
for field in text_fields:
    train_df[f'{field}_words'] = train_df[field].str.split().str.len()

print("\nCharacter Length Statistics:")
print(divider)
for position, field in enumerate(text_fields):
    char_counts = train_df[f'{field}_length']
    # Every heading except the first is preceded by a blank line.
    lead = "" if position == 0 else "\n"
    print(f"{lead}{field.capitalize()}:")
    print(f"  Mean: {char_counts.mean():.1f} characters")
    print(f"  Median: {char_counts.median():.1f} characters")
    print(f"  Min: {char_counts.min():.0f} characters")
    print(f"  Max: {char_counts.max():.0f} characters")

print("\nWord Count Statistics:")
print(divider)
for position, field in enumerate(text_fields):
    word_counts = train_df[f'{field}_words']
    # Every heading except the first is preceded by a blank line.
    lead = "" if position == 0 else "\n"
    print(f"{lead}{field.capitalize()}:")
    print(f"  Mean: {word_counts.mean():.1f} words")
    print(f"  Median: {word_counts.median():.1f} words")


TEXT LENGTH ANALYSIS

Character Length Statistics:
----------------------------------------------------------------------
Transcription:
  Mean: 3326.2 characters
  Median: 2850.5 characters
  Min: 13 characters
  Max: 15216 characters

Description:
  Mean: 140.4 characters
  Median: 123.5 characters
  Min: 14 characters
  Max: 491 characters

Word Count Statistics:
----------------------------------------------------------------------
Transcription:
  Mean: 504.0 words
  Median: 427.0 words

Description:
  Mean: 19.7 words
  Median: 16.0 words


## Text Length Analysis - Calculate Statistics
<!-- Purpose: Analyze the length of transcriptions and descriptions.
This helps us understand document sizes and choose appropriate features. -->