# Task 3: Pre-trained Transformers

## Imports

In [1]:
from datasets import load_dataset
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import utils.task3_baseline_utils as base_utils
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [43]:
%reload_ext autoreload

## Load Dataset

In [2]:
# Load the "train" split of the argilla/medical-domain dataset.
dataset = load_dataset("argilla/medical-domain", split="train")

# Summarise schema and size; sep="\n" prints each value on the line below its heading.
print("Features available:", dataset.column_names, sep="\n")
print("\nFormat of 'prediction' column:", dataset.features['prediction'], sep="\n")
print("\nDataset length: ", len(dataset))

Features available:
['text', 'inputs', 'prediction', 'prediction_agent', 'annotation', 'annotation_agent', 'multi_label', 'explanation', 'id', 'metadata', 'status', 'event_timestamp', 'metrics']

Format of 'prediction' column:
List({'label': Value('string'), 'score': Value('float64')})

Dataset length:  4966


In [3]:
# Convert the loaded dataset to a pandas DataFrame for label inspection and splitting.
dataset_df = dataset.to_pandas()

View labels and imbalance

In [4]:
# Each 'prediction' entry is a list of {'label', 'score'} dicts; the label of the
# first entry is used as the document's class.
raw_labels = [prediction[0]['label'] for prediction in dataset_df.loc[:, "prediction"]]
# Strip surrounding whitespace rather than blindly dropping the first character,
# so a label stored without the leading space is not truncated.
labels = [raw_label.strip() for raw_label in raw_labels]

# Tally documents per class to expose the class imbalance.
label_counts = {}
for label in labels:
    label_counts[label] = label_counts.get(label, 0) + 1
unique_labels = sorted(label_counts)

print(" Unique labels and counts: ".center(50, '-'))
for label in unique_labels:
    print(label, label_counts[label])

----------- Unique labels and counts: ------------
Allergy / Immunology 7
Autopsy 8
Bariatrics 18
Cardiovascular / Pulmonary 371
Chiropractic 14
Consult - History and Phy. 516
Cosmetic / Plastic Surgery 27
Dentistry 27
Dermatology 29
Diets and Nutritions 10
Discharge Summary 108
ENT - Otolaryngology 96
Emergency Room Reports 75
Endocrinology 19
Gastroenterology 224
General Medicine 259
Hematology - Oncology 90
Hospice - Palliative Care 6
IME-QME-Work Comp etc. 16
Lab Medicine - Pathology 8
Letters 23
Nephrology 81
Neurology 223
Neurosurgery 94
Obstetrics / Gynecology 155
Office Notes 50
Ophthalmology 83
Orthopedic 355
Pain Management 61
Pediatrics - Neonatal 70
Physical Medicine - Rehab 21
Podiatry 47
Psychiatry / Psychology 53
Radiology 273
Rheumatology 10
SOAP / Chart / Progress Notes 166
Sleep Medicine 20
Speech - Language 9
Surgery 1088
Urology 156


In [5]:
# Get train/test split
# NOTE(review): this is a sequential (unshuffled, unstratified) hold-out split; if the
# dataset rows are ordered by class a shuffled/stratified split would be safer — confirm.
all_texts = dataset_df.loc[:, "text"]
# Strip whitespace so these labels match the cleaned class names in `unique_labels`,
# which are passed to the metrics helper alongside them.
all_labels = [prediction[0]['label'].strip() for prediction in dataset_df.loc[:, "prediction"]]

split = 0.7
# Derive the cut-off from `split` so changing the ratio changes the actual split,
# not just the printed message.
split_idx = int(len(all_texts) * split)

train_texts = all_texts.iloc[:split_idx]
train_labels = all_labels[:split_idx]

test_texts = all_texts.iloc[split_idx:]
test_labels = all_labels[split_idx:]

# Sanity check: the two partitions cover every document exactly once.
assert len(train_texts) + len(test_texts) == len(all_texts)
assert len(train_labels) == len(train_texts) and len(test_labels) == len(test_texts)

print(f"Train/test split: {split}, {round(1-split,1)}")
print("Train set length: ", len(train_texts))
print("Test set length: ", len(test_texts))

Train/test split: 0.7, 0.3
Train set length:  3476
Test set length:  1490


## Subtask 1: Baseline

Build and tune a strong classical baseline appropriate to the task (e.g., TF-IDF + Logistic Regression / Linear SVM, or XGBoost, for classification/NER). Record its metrics as the anchor row of a single results table.

In [6]:
# spaCy English pipeline with the NER and parser components disabled.
nlp = spacy.load("en_core_web_md", disable=["ner", "parser"])

def spacy_tokenizer(text):
    """Tokenize ``text`` with spaCy and return a list of lower-cased lemmas.

    Stop words, punctuation tokens and whitespace tokens are dropped.
    """
    # TODO modify this
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

# Fixed seed so the solvers that use randomness (LinearSVC, SGDClassifier) give
# reproducible results across runs.
RANDOM_STATE = 42


def make_tfidf_vectorizer():
    """Return a fresh, unfitted TF-IDF vectorizer using the spaCy tokenizer.

    Each pipeline gets its own instance: sharing one vectorizer object between
    pipelines means fitting any pipeline refits the vectorizer used by the
    others.
    """
    return TfidfVectorizer(
        tokenizer=spacy_tokenizer,
        token_pattern=None,  # unused when `tokenizer` is set; None avoids sklearn's warning
        ngram_range=(1, 2),
        min_df=5,
    )


# Kept so that any code referring to the module-level `vectorizer` still works.
vectorizer = make_tfidf_vectorizer()

# =====================================
# TF-IDF + Logistic Regression Pipeline
# =====================================
log_reg_pipeline = Pipeline([
    ("tfidf", make_tfidf_vectorizer()),
    ("clf", LogisticRegression(
        max_iter=1000,
        n_jobs=-1,
    )),  # Multinomial by default
])

# ============================
# TF-IDF + Linear SVM Pipeline
# ============================
linear_svm_pipeline = Pipeline([
    ("tfidf", make_tfidf_vectorizer()),
    ("svm", LinearSVC(
        C=1.0,  # TODO tune this
        class_weight="balanced",
        random_state=RANDOM_STATE,
    )),
])

# ==============================================
# TF-IDF + SGDClassifier Pipeline (SVM with SGD)
# ==============================================
sgd_clf_pipeline = Pipeline([
    ("tfidf", make_tfidf_vectorizer()),
    ("clf", SGDClassifier(
        loss="hinge",
        alpha=1e-4,
        class_weight="balanced",
        max_iter=1000,
        random_state=RANDOM_STATE,
    )),
])



### Logistic regression

In [None]:
# Fit the logistic-regression baseline on the training split, predict the held-out
# split, and hand the predictions to the metrics helper together with the output path.
lr_results_path = 'Task3/results/logistic_regression.csv'
lr_model = log_reg_pipeline.fit(train_texts, train_labels)
preds = lr_model.predict(test_texts)
base_utils.store_model_metrics_manual(
    test_labels,
    preds,
    unique_labels,
    lr_results_path,
)

### Linear SVM

In [None]:
# Fit the linear-SVM baseline on the training split, predict the held-out split,
# and hand the predictions to the metrics helper together with the output path.
svm_results_path = 'Task3/results/linear_svm.csv'
svm_model = linear_svm_pipeline.fit(train_texts, train_labels)
preds = svm_model.predict(test_texts)
base_utils.store_model_metrics_manual(
    test_labels,
    preds,
    unique_labels,
    svm_results_path,
)



Metrics saved to  Task3/results/linear_svm.csv


### SGD Classifier

In [None]:
# Fit the SGD (hinge-loss) baseline on the training split, predict the held-out
# split, and hand the predictions to the metrics helper together with the output path.
sgd_results_path = 'Task3/results/sgd.csv'
sgd_clf = sgd_clf_pipeline.fit(train_texts, train_labels)
preds = sgd_clf.predict(test_texts)
base_utils.store_model_metrics_manual(
    test_labels,
    preds,
    unique_labels,
    sgd_results_path,
)



Metrics saved to  Task3/results/linear_svm.csv
