 **DATASET ANALYSIS AND PREPROCESSING**

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from there in a later cell.
# NOTE(review): google.colab is presumably only importable on a Colab runtime — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the NLTK resources: the 'punkt'/'punkt_tab' tokenizer data and the
# 'stopwords' corpus that the text-cleaning step relies on.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# Load the raw CSV from the mounted Drive folder into the working DataFrame.
df = pd.read_csv('/content/drive/MyDrive/mtsamples.csv')

In [None]:
# Quick structural overview of the raw dataset.
print("Initial Dataset Shape:", df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appended a stray "None" line.
df.info()
print("Unique Medical Specialties:", df['medical_specialty'].nunique())
print("Specialty Distribution:\n", df['medical_specialty'].value_counts())

Handling Missing Values

In [None]:
print("\nMissing Values:\n", df.isnull().sum())
# Rows without a transcription cannot be used at all, so they are dropped.
# .copy() makes the result an independent frame so the assignments below never
# act on a view of the original data.
df = df.dropna(subset=['transcription']).copy()
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# df[col]: the chained in-place form is deprecated in pandas 2.x and silently
# does nothing once Copy-on-Write is enabled.
df['keywords'] = df['keywords'].fillna('')
df['description'] = df['description'].fillna('')

In [None]:
# Report how many fully duplicated rows exist, then remove them.
print("\nDuplicates:", df.duplicated().sum())
df = df.drop_duplicates()

Text Cleaning

In [None]:
def clean_text(text, stop_words=None):
    """Normalize free text for bag-of-words style features.

    The text is lower-cased, every character that is not a-z or whitespace is
    removed, the result is tokenized with NLTK's ``word_tokenize`` and English
    stop words are dropped.

    Args:
        text: The raw string to clean. Non-string values (e.g. NaN from a
            missing cell) are treated as empty text.
        stop_words: Optional collection of words to remove. When omitted, the
            NLTK English stop-word list is used; it is built once and cached on
            the function so that ``Series.apply(clean_text)`` does not rebuild
            it for every row.

    Returns:
        The remaining tokens joined by single spaces (possibly an empty string).
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = getattr(clean_text, '_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            clean_text._stop_words = stop_words

    text = re.sub(r'[^a-z\s]', '', text.lower())
    tokens = word_tokenize(text)
    return ' '.join(word for word in tokens if word not in stop_words)

In [None]:
# Derive a cleaned companion column ("cleaned_<name>") for each free-text field.
for source_col in ('transcription', 'keywords', 'description'):
    df[f'cleaned_{source_col}'] = df[source_col].apply(clean_text)

In [None]:
# Show one cleaned transcription as a sanity check.
print("\nSample Cleaned Transcription:\n", df['cleaned_transcription'].iloc[0])
# Model input text = cleaned transcription followed by cleaned keywords
# (the cleaned description column is not included here).
df['features'] = df['cleaned_transcription'] + ' ' + df['cleaned_keywords']

Splitting data into train and validation sets

In [None]:
# Inputs are the combined cleaned text; targets are the specialty labels.
X = df['features']
y = df['medical_specialty']
# 80/20 split, stratified on the label so both parts keep the class proportions;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nTrain Shape:", X_train.shape, y_train.shape)
print("Validation Shape:", X_val.shape, y_val.shape)
print("Train Specialty Distribution:\n", y_train.value_counts(normalize=True))

In [None]:
# Persist the processed frame (original plus cleaned/feature columns) to the
# current working directory, without writing the index as a column.
df.to_csv('processed_mtsamples.csv', index=False)

**EDA ON PREPROCESSED DATA**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Horizontal bar chart of training-label counts, most frequent specialty first.
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(y=y_train, order=y_train.value_counts().index, ax=ax)
ax.set_title('Training Data Class Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Medical Specialty')
plt.show()

In [None]:
# Whitespace-token count of every training document.
train_lengths = X_train.str.split().str.len()

# Histogram (with KDE overlay) of those document lengths.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_lengths, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Transcription Lengths in Train Data')
ax.set_xlabel('Number of Words')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Concatenate every training document into one string and render it as a word cloud.
all_text = ' '.join(X_train)
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
# Axes carry no meaning for an image, so they are hidden.
plt.axis('off')
plt.title('Word Cloud of training transcriptions')
plt.show()

In [None]:
# Count term occurrences over the training texts (sklearn's English stop words removed).
vectorizer = CountVectorizer(stop_words='english')
word_counts = vectorizer.fit_transform(X_train)

# Sum counts per term, then keep the 20 most frequent terms.
term_totals = word_counts.sum(axis=0)
vocabulary = vectorizer.get_feature_names_out()
top_words = (
    pd.DataFrame(term_totals, columns=vocabulary)
    .T
    .sort_values(0, ascending=False)
    .head(20)
)
print("Top 20 words in train data:\n", top_words)

**TRAINING BASELINE MODEL**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# TF-IDF over unigrams and bigrams, capped at 5000 features.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
# Fit the vocabulary on the training split only, then reuse it for validation.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [None]:
# Baseline: logistic regression on TF-IDF features with balanced class weights.
# The explicit multi_class='multinomial' argument was dropped: it is deprecated
# in recent scikit-learn releases, and the lbfgs solver already fits a
# multinomial model for multi-class targets by default.
clf = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')
clf.fit(X_train_tfidf, y_train)

In [None]:
# Baseline predictions (specialty labels) for the validation split.
y_pred = clf.predict(X_val_tfidf)

In [None]:
# Validation accuracy and macro-averaged F1 for the baseline, plus the per-class report.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='macro')
print(f"Baseline Accuracy: {accuracy:.4f}")
print(f"Baseline Macro F1: {f1:.4f}")
print(classification_report(y_val, y_pred))

In [None]:
# Confusion matrix of the baseline on the validation split, drawn as a heatmap
# without per-cell annotations.
cm = confusion_matrix(y_val, y_pred)
plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=False, cmap='Blues')
plt.title('Baseline Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Boolean mask of validation rows the baseline got wrong.
misclassified = y_val != y_pred
errors = pd.DataFrame({
    'Text': X_val[misclassified],
    'True': y_val[misclassified],
    'Pred': y_pred[misclassified],
})
print("Sample Errors:\n", errors.head(5))

**FINE-TUNING BIOBERT MODEL**

In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter

In [None]:
# Map specialty names to integer ids; the encoder is fitted on the training
# labels and then applied to the validation labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

In [None]:
# Wrap text + encoded label into DataFrames and convert them to datasets.Dataset objects.
train_df = pd.DataFrame({'text': X_train, 'label': y_train_enc})
val_df = pd.DataFrame({'text': X_val, 'label': y_val_enc})
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [None]:
# Checkpoint identifier used for both the tokenizer and the classifier.
model_name = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 512 tokens.
    """
    tokenizer_kwargs = dict(padding='max_length', truncation=True, max_length=512)
    return tokenizer(examples['text'], **tokenizer_kwargs)

In [None]:
# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# One output unit per class known to the label encoder.
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

In [None]:
# 'balanced' class weights computed from the encoded training labels.
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_enc), y=y_train_enc)
# Convert to a float tensor and place it on the GPU when one is available, else on the CPU.
class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    The weights can be passed as the ``class_weights`` keyword argument or
    attached afterwards (``trainer.class_weights = tensor``). When no weights
    are set, plain unweighted cross-entropy is used instead of failing with an
    AttributeError.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Mapping of model inputs; must contain 'labels'.
            return_outputs: When True, also return the model outputs.
            num_items_in_batch: Accepted for compatibility with the Trainer
                call signature; not used here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.get('labels')
        # Run the forward pass without labels so the model does not compute its
        # own (unweighted) loss that would be discarded anyway. A filtered copy
        # is used so the caller's `inputs` mapping is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != 'labels'}
        outputs = model(**model_inputs)
        logits = outputs.get('logits')

        weight = getattr(self, 'class_weights', None)
        if weight is not None:
            # Keep the weights on the same device as the logits.
            weight = weight.to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch so that the checkpoint with the best macro F1 can be restored at the end.
training_args = TrainingArguments(
    output_dir='./biobert_results',
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'f1' is the key returned by compute_metrics.
    metric_for_best_model='f1',
    greater_is_better=True,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available()
)

In [None]:
def compute_metrics(pred):
    """Return accuracy and macro F1 for a prediction object.

    ``pred.predictions`` is reduced with argmax over the last axis to get the
    predicted class ids, which are compared against ``pred.label_ids``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    true_ids = pred.label_ids
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'f1': f1_score(true_ids, predicted_ids, average='macro'),
    }

In [None]:
# Build the trainer with the tokenized splits and the metric callback.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# The class-weight tensor is attached as an attribute; compute_loss reads it from self.
trainer.class_weights = class_weights

In [None]:
# Run fine-tuning with the configuration above.
trainer.train()

In [None]:
# Evaluate on the validation dataset, then obtain raw predictions for it.
results = trainer.evaluate()
print("BioBERT Fine-Tuned Results:", results)
predictions = trainer.predict(val_dataset)
# argmax over the last axis gives class ids; map them back to specialty names.
y_pred_enc = predictions.predictions.argmax(-1)
y_pred = le.inverse_transform(y_pred_enc)

In [None]:
# Accuracy and macro F1 on encoded ids; the per-class report uses the label names.
acc_biobert = accuracy_score(y_val_enc, y_pred_enc)
f1_biobert = f1_score(y_val_enc, y_pred_enc, average='macro')
print(f"BioBERT Accuracy: {acc_biobert:.4f}, Macro F1: {f1_biobert:.4f}")
print(classification_report(y_val, y_pred))

In [None]:
# Confusion matrix of the fine-tuned model on the validation split.
cm_biobert = confusion_matrix(y_val_enc, y_pred_enc)
plt.figure(figsize=(12, 10))
sns.heatmap(cm_biobert, annot=False, cmap='Blues')
plt.title('BioBERT Confusion Matrix')
# Axis labels added so this figure matches the other confusion-matrix plots in
# the notebook: confusion_matrix puts true labels on rows, predictions on columns.
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

In [None]:
# Mask of misclassified validation rows; show the first five of them.
wrong = y_val != y_pred
errors = pd.DataFrame({
    'Text': X_val[wrong].values[:5],
    'True': y_val[wrong].values[:5],
    'Pred': y_pred[wrong][:5],
})
print("Sample Errors:\n", errors)

**INCORPORATION OF LANGUAGE MODEL**

EXTERNAL INCORPORATION OF LM

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import requests
import pandas as pd
import time
from tqdm import tqdm

import os

print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape, "y_val shape:", y_val.shape)

# Read the Hugging Face token from the environment instead of hard-coding it in
# the notebook. The empty-string default keeps the previous behaviour when the
# variable is not set.
api_token = os.environ.get("HF_TOKEN", "")
if not api_token:
    print("Warning: HF_TOKEN is not set; API requests will be sent without a valid token.")
api_url = "https://api-inference.huggingface.co/models/facebook/bart-large-mnli"
headers = {"Authorization": f"Bearer {api_token}"}

# Only the first 10 classes (in label-encoder order) are offered as candidates.
candidate_labels = list(le.classes_)[:10]
print(f"Number of candidate labels: {len(candidate_labels)}")

def query_external_lm(text, candidate_labels, retries=3, delay=5, timeout=60):
    """Score one text against the candidate labels via the zero-shot API.

    Args:
        text: The input text to classify.
        candidate_labels: Labels to score; the result follows this order.
        retries: Maximum number of request attempts.
        delay: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds.

    Returns:
        A list with one score per candidate label. On any failure (request
        error, non-200 status, unexpected payload) a list of zeros of the same
        length is returned, so callers always get a fixed-size vector.
    """
    fallback = [0.0] * len(candidate_labels)
    payload = {
        "inputs": text,
        "parameters": {"candidate_labels": candidate_labels},
        "options": {"wait_for_model": True}
    }
    for attempt in range(retries):
        try:
            response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, dict) and 'scores' in result:
                    # The API may order labels differently; re-align the scores
                    # with the order of candidate_labels.
                    score_dict = dict(zip(result.get('labels', candidate_labels), result['scores']))
                    return [score_dict.get(label, 0.0) for label in candidate_labels]
                print(f"Unexpected response format: {result}")
                return list(fallback)

            print(f"API Error: {response.status_code} - {response.text}")
            # Client errors such as 400/401/403/404 will not succeed on a retry,
            # so give up immediately; only timeouts (408), rate limits (429)
            # and server-side errors are retried.
            if 400 <= response.status_code < 500 and response.status_code not in (408, 429):
                return list(fallback)
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON on
            # requests versions where that error is not a RequestException.
            print(f"Request failed: {e}")

        if attempt < retries - 1:
            time.sleep(delay)
    return list(fallback)

def get_external_scores(texts, batch_size=5, max_samples=None):
    """Query the external model for each text and stack the score vectors.

    Args:
        texts: Sliceable collection of strings (e.g. a pandas Series or list).
        batch_size: Number of texts queried between two pauses.
        max_samples: Optional cap on the number of texts; None means all.
            A value of 0 now means "process nothing" rather than "no limit".

    Returns:
        A NumPy array built from one score list per processed text, scored
        against the module-level ``candidate_labels``.
    """
    texts_to_process = texts[:max_samples] if max_samples is not None else texts
    n_texts = len(texts_to_process)
    print(f"Processing {n_texts} texts...")

    scores_list = []
    for i in tqdm(range(0, n_texts, batch_size), desc="Querying API"):
        batch_texts = texts_to_process[i:i+batch_size]
        scores_list.extend(query_external_lm(text, candidate_labels) for text in batch_texts)
        # Pause between batches to stay gentle on the API; there is nothing to
        # wait for after the final batch.
        if i + batch_size < n_texts:
            time.sleep(2)
    return np.array(scores_list)

# Number of validation texts sent to the external API.
max_samples = 100
# Progress messages use print(), like the rest of the notebook: no `logger`
# object is defined anywhere, so logger.info(...) raised a NameError here.
print("Extracting external scores for X_val subset...")
external_scores = get_external_scores(X_val, batch_size=5, max_samples=max_samples)
print(f"External scores shape: {external_scores.shape}")

if external_scores.size > 0:
    # Labels aligned positionally with the scored subset of X_val.
    n_scored = len(external_scores)
    y_scored_enc = y_val_enc[:n_scored]

    # Hold out part of the scored subset for evaluation. Fitting and scoring on
    # the very same rows (as before) reports training performance, not how well
    # the external scores generalise. The split is not stratified because the
    # subset is too small to guarantee two samples per class.
    scores_fit, scores_eval, y_fit_enc, y_eval_enc = train_test_split(
        external_scores, y_scored_enc, test_size=0.3, random_state=42)

    # multi_class is omitted: it is deprecated in recent scikit-learn and lbfgs
    # is multinomial by default.
    clf_external = LogisticRegression(max_iter=1000, class_weight='balanced', solver='lbfgs')
    clf_external.fit(scores_fit, y_fit_enc)
    y_pred_external_clf = clf_external.predict(scores_eval)
    acc_external_clf = accuracy_score(y_eval_enc, y_pred_external_clf)
    f1_external_clf = f1_score(y_eval_enc, y_pred_external_clf, average='macro', zero_division=0)
    print(f"External LM Scores as Features - Accuracy: {acc_external_clf:.4f}, Macro F1: {f1_external_clf:.4f}")
    print("External LM Scores Classification Report:\n",
          classification_report(le.inverse_transform(y_eval_enc), le.inverse_transform(y_pred_external_clf), zero_division=0))
else:
    print("No external scores obtained, proceeding with internal model only.")

INTERNAL INCORPORATION OF LM

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import requests
import pandas as pd

In [None]:
# Uncleaned text (original casing and punctuation) for the embedding model:
# raw transcription followed by raw keywords.
df['features_raw'] = df['transcription'].fillna('') + ' ' + df['keywords'].fillna('')
X_raw = df['features_raw']
# Same test_size / random_state / stratify settings as the earlier split.
# NOTE: this rebinds y_train, y_val, le, y_train_enc and y_val_enc defined in earlier cells.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X_raw, df['medical_specialty'],
                                                          test_size=0.2, random_state=42, stratify=df['medical_specialty'])
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

print("X_train_raw shape:", X_train_raw.shape, "y_train shape:", y_train.shape)
print("X_val_raw shape:", X_val_raw.shape, "y_val shape:", y_val.shape)

In [None]:
# Shapes of the cleaned-text split (X_train/X_val from the earlier cell) next to the current labels.
print("X_train shape:", X_train.shape, "y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape, "y_val shape:", y_val.shape)

In [None]:
from sklearn.manifold import TSNE
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [None]:
# Load the same checkpoint via AutoModel (rather than the sequence-classification
# class used earlier) for use as a feature extractor.
# NOTE: this rebinds the module-level `tokenizer` and `model` names.
model_name = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def extract_embeddings(texts, batch_size=16, pooling='mean'):
    """Encode texts into fixed-size vectors with the module-level model.

    Args:
        texts: Iterable of strings (pandas Series, NumPy array, list, ...).
        batch_size: Number of texts encoded per forward pass.
        pooling: 'mean' averages the last hidden states over non-padding
            tokens; any other value takes the hidden state of the first token.

    Returns:
        A NumPy array with one row per input text. For empty input an array of
        shape (0, hidden_size) is returned instead of raising.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    # Materialise once so any iterable works, not only objects with .tolist().
    texts = list(texts)
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        if pooling == 'mean':
            # Zero out padding positions, then divide by the number of real tokens.
            attention_mask = inputs['attention_mask'].unsqueeze(-1)
            masked_embeddings = outputs.last_hidden_state * attention_mask
            # clamp guards against a division by zero for an all-padding row.
            token_counts = attention_mask.sum(dim=1).clamp(min=1)
            batch_embeddings = masked_embeddings.sum(dim=1) / token_counts
        else:
            batch_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings.append(batch_embeddings.cpu().numpy())

    if not embeddings:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)
    return np.vstack(embeddings)

# Mean-pooled embeddings of the raw-text train and validation splits.
X_train_emb = extract_embeddings(X_train_raw, batch_size=16, pooling='mean')
X_val_emb = extract_embeddings(X_val_raw, batch_size=16, pooling='mean')

# Oversample the training embeddings with SMOTE; only the training split is resampled.
smote = SMOTE(random_state=42, k_neighbors=3)
X_train_emb_resampled, y_train_enc_resampled = smote.fit_resample(X_train_emb, y_train_enc)

# Gradient-boosted trees on the resampled embeddings.
clf_internal = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1,
                            objective='multi:softmax', eval_metric='mlogloss', random_state=42)
clf_internal.fit(X_train_emb_resampled, y_train_enc_resampled)

# Evaluate on the untouched validation embeddings.
y_pred_internal = clf_internal.predict(X_val_emb)
acc_internal = accuracy_score(y_val_enc, y_pred_internal)
f1_internal = f1_score(y_val_enc, y_pred_internal, average='macro')
print(f"Internal LM (BioBERT Embeddings + XGBoost) - Accuracy: {acc_internal:.4f}, Macro F1: {f1_internal:.4f}")
print("Internal LM Classification Report:\n",
      classification_report(y_val, le.inverse_transform(y_pred_internal), zero_division=0))

# Confusion matrix of the embeddings + XGBoost model on the validation split.
# confusion_matrix, plt and sns are not imported in this cell; they come from earlier cells.
cm_internal = confusion_matrix(y_val_enc, y_pred_internal)
plt.figure(figsize=(12, 10))
sns.heatmap(cm_internal, annot=False, cmap='Blues')
plt.title('Internal LM Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Decode the predictions once, then show the first five misclassified validation rows.
pred_labels_internal = le.inverse_transform(y_pred_internal)
wrong_internal = y_val != pred_labels_internal
errors_internal = pd.DataFrame({
    'Text': X_val_raw[wrong_internal].values[:5],
    'True': y_val[wrong_internal].values[:5],
    'Pred': pred_labels_internal[wrong_internal][:5]
})
print("Sample Errors (Internal LM):\n", errors_internal)

# Project the first 500 training embeddings to 2-D with t-SNE and colour them by specialty.
tsne = TSNE(n_components=2, random_state=42)
X_train_emb_2d = tsne.fit_transform(X_train_emb[:500])
plt.figure(figsize=(10, 8))
sns.scatterplot(x=X_train_emb_2d[:, 0], y=X_train_emb_2d[:, 1], hue=le.inverse_transform(y_train_enc[:500]),
                palette='tab20', legend='full')
plt.title('t-SNE Visualization of BioBERT Embeddings (Train Data)')
# Anchor the legend outside the axes, to the right of the plot.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

**EDA ON TRAIN TEST RESULTS**

In [None]:
# Summary table of model scores.
# NOTE(review): these numbers are hard-coded literals, not read from the metric
# variables computed in earlier cells — confirm they match the latest run.
data = {
    'Model': ['Baseline', 'Fine-Tuned BioBERT', 'Internal (XGBoost)'],
    'Train_Accuracy': [0.45,0.69,  0.52],
    'Val_Accuracy': [ 0.41,0.65, 0.52],
    'Train_Macro_F1': [0.43,0.65,  0.56],
}
results_df = pd.DataFrame(data)

# Whitespace-token length of each training document and relative class frequencies.
train_lengths = X_train.apply(lambda x: len(x.split()))
train_class_dist = y_train.value_counts(normalize=True)

print("Results DataFrame:\n", results_df)
print("\nTrain Data Lengths Summary:\n", train_lengths.describe())
print("\nTrain Class Distribution (Top 5):\n", train_class_dist.head())

In [None]:
# Bar chart of validation accuracy per model, with the value written above each bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Model', y='Val_Accuracy', data=results_df, palette='viridis', ax=ax)
ax.set_title('Validation Accuracy Across Models')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
ax.set_ylim(0, 1)
for position, value in enumerate(results_df['Val_Accuracy']):
    ax.text(position, value + 0.02, f'{value:.2f}', ha='center')
plt.show()

In [None]:
# Train vs. validation accuracy, one line per metric with the models on the x-axis.
# The frame is deliberately NOT transposed: in wide form seaborn draws one line
# per column over the row index, so the two accuracy columns become the two
# lines and the row positions 0..2 are the models. (Transposing drew one line
# per model over only two x positions, which did not match the legend or ticks.)
accuracy_wide = results_df[['Train_Accuracy', 'Val_Accuracy']].rename(
    columns={'Train_Accuracy': 'Train', 'Val_Accuracy': 'Validation'})

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=accuracy_wide, markers=True, ax=ax)
ax.set_title('Train vs. Validation Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
# Label each x position with the model name taken from the table itself, so the
# labels cannot drift out of sync with the row order.
ax.set_xticks(range(len(results_df)))
ax.set_xticklabels(results_df['Model'])
ax.set_ylim(0, 1)
plt.show()