In [2]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # Required for BERT
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from tensorflow.keras import layers

# Report the TensorFlow build and whether any GPU device is visible to it
print(f"TensorFlow version: {tf.__version__}")
visible_gpus = tf.config.list_physical_devices('GPU')
if visible_gpus:  # a non-empty device list means at least one GPU was found
    print("GPU is available")
else:
    print("GPU is NOT available (Training will be slow)")

# --- Helper Function to Read Data ---
def get_lines(filename):
    """Read a text file and return its lines as a list.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per line, each keeping its trailing newline
        (the result of ``readlines()``).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return f.readlines()

def _abstract_lines_to_samples(abstract_lines):
    """Convert the accumulated "LABEL<TAB>text" lines of one abstract into sample dicts."""
    abstract_line_split = abstract_lines.splitlines()
    # Stored as "number of lines minus one", i.e. the index of the last line.
    last_line_index = len(abstract_line_split) - 1

    samples = []
    for abstract_line_number, abstract_line in enumerate(abstract_line_split):
        target_text_split = abstract_line.split("\t")  # Split label and text
        samples.append({
            "target": target_text_split[0],        # Label (e.g., BACKGROUND)
            "text": target_text_split[1].lower(),  # Text content, lower-cased
            "line_number": abstract_line_number,   # Position in abstract
            "total_lines": last_line_index,
        })
    return samples


def preprocess_text_with_line_numbers(filename):
    """
    Returns a list of dictionaries of abstract line data.

    Reads ``filename`` via ``get_lines`` and walks through it line by line.
    Lines starting with "###" are abstract ID lines, whitespace-only lines
    end an abstract, and every other line is expected to look like
    "LABEL<TAB>text".

    Each returned dictionary has the keys:
        "target":      the label before the first tab,
        "text":        the lower-cased text after the first tab,
        "line_number": zero-based position of the line inside its abstract,
        "total_lines": number of lines in the abstract minus one.

    An abstract is emitted exactly once, whenever a boundary is reached: a
    blank line, the next "###" ID line, or the end of the file. Repeated
    blank lines therefore do not duplicate an abstract, and a final abstract
    that is not followed by a blank line is still returned.

    Raises:
        IndexError: If a content line contains no tab separator.
    """
    abstract_lines = ""
    abstract_samples = []

    for line in get_lines(filename):
        if line.startswith("###") or line.isspace():
            # Boundary reached: emit whatever has been collected, then clear
            # the buffer so the same abstract can never be emitted twice.
            abstract_samples.extend(_abstract_lines_to_samples(abstract_lines))
            abstract_lines = ""
        else:
            abstract_lines += line

    # The file may end without a trailing blank line; flush the remainder.
    abstract_samples.extend(_abstract_lines_to_samples(abstract_lines))
    return abstract_samples

# --- Evaluation Metric Function ---
def calculate_results(y_true, y_pred):
    """Summarise predictions as accuracy, precision, recall and F1.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A dict with keys "accuracy", "precision", "recall" and "f1".
        Accuracy is ``accuracy_score`` multiplied by 100; the other three are
        the values returned by ``precision_recall_fscore_support`` with
        ``average="weighted"``.
    """
    accuracy_percent = 100 * accuracy_score(y_true, y_pred)
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")
    # The fourth element (support) is not reported.
    precision, recall, f1 = weighted_scores[:3]
    return dict(accuracy=accuracy_percent, precision=precision, recall=recall, f1=f1)

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# UPDATE THIS PATH to where your dataset is located locally
data_dir = "pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/"
train_path = data_dir + "train.txt"
val_path = data_dir + "dev.txt"
test_path = data_dir + "test.txt"

# Process data: one dict per abstract line (target, text, line_number, total_lines)
train_samples = preprocess_text_with_line_numbers(train_path)
val_samples = preprocess_text_with_line_numbers(val_path)
test_samples = preprocess_text_with_line_numbers(test_path)

# Convert to DataFrame
train_df = pd.DataFrame(train_samples)
val_df = pd.DataFrame(val_samples)
test_df = pd.DataFrame(test_samples)

# Extract sentences and labels
train_sentences = train_df["text"].tolist()
val_sentences = val_df["text"].tolist()
test_sentences = test_df["text"].tolist()

# One-hot encode labels (for Deep Learning)
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4. Use the new name and fall back to the old one only when the
# installed version rejects it.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)
# The encoder is fitted on the training labels only, then reused for val/test.
train_labels_one_hot = one_hot_encoder.fit_transform(train_df["target"].to_numpy().reshape(-1, 1))
val_labels_one_hot = one_hot_encoder.transform(val_df["target"].to_numpy().reshape(-1, 1))
test_labels_one_hot = one_hot_encoder.transform(test_df["target"].to_numpy().reshape(-1, 1))

# Label Encode labels (for Baseline)
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_df["target"].to_numpy())
val_labels_encoded = label_encoder.transform(val_df["target"].to_numpy())
test_labels_encoded = label_encoder.transform(test_df["target"].to_numpy())

# Get class names
class_names = label_encoder.classes_
print(f"Classes: {class_names}")

In [None]:
# Baseline: TF-IDF features fed into a Multinomial Naive Bayes classifier.
# (Pipeline is imported with the other imports at the top of the notebook.)
model_0 = Pipeline([
  ("tf-idf", TfidfVectorizer()),
  ("clf", MultinomialNB())
])

# Fit the pipeline to the training data (integer-encoded labels)
model_0.fit(X=train_sentences, y=train_labels_encoded)

# Evaluate baseline on the validation split
baseline_preds = model_0.predict(val_sentences)
baseline_results = calculate_results(y_true=val_labels_encoded, y_pred=baseline_preds)
print(f"Baseline Results: {baseline_results}")

In [None]:
# 1. Setup Text Vectorization
max_tokens = 68000  # Upper bound on vocabulary size given to the vectorizer
train_sentence_lengths = [len(sentence.split()) for sentence in train_sentences]
avg_sent_len = int(np.mean(train_sentence_lengths))  # Mean words per sentence (not used below)

# Every sentence is padded/truncated to 55 tokens.
# NOTE(review): 55 is hard-coded and unrelated to avg_sent_len; confirm how it was chosen.
text_vectorizer = layers.TextVectorization(max_tokens=max_tokens,
                                           output_sequence_length=55)
text_vectorizer.adapt(train_sentences)

# 2. Create Embedding Layer (one row per token in the adapted vocabulary)
token_vocab_size = len(text_vectorizer.get_vocabulary())
token_embed = layers.Embedding(input_dim=token_vocab_size,
                               output_dim=128,
                               mask_zero=True,
                               name="token_embedding")

# 3. Build Model: raw string -> token ids -> embeddings -> Conv1D -> pooled -> softmax
inputs = layers.Input(shape=(1,), dtype=tf.string)
text_vectors = text_vectorizer(inputs)
token_embeddings = token_embed(text_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(token_embeddings)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(len(class_names), activation="softmax")(x)
model_1 = tf.keras.Model(inputs, outputs, name="model_1_conv1d_token")

# 4. Compile and Fit
model_1.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model_1_train_ds = (tf.data.Dataset
                    .from_tensor_slices((train_sentences, train_labels_one_hot))
                    .batch(32)
                    .prefetch(tf.data.AUTOTUNE))
model_1_val_ds = (tf.data.Dataset
                  .from_tensor_slices((val_sentences, val_labels_one_hot))
                  .batch(32)
                  .prefetch(tf.data.AUTOTUNE))

model_1_train_steps = int(0.1 * len(train_sentences) // 32)  # 10% of data for speed in demo
model_1_val_steps = int(0.1 * len(val_sentences) // 32)

history_1 = model_1.fit(model_1_train_ds,
                        steps_per_epoch=model_1_train_steps,
                        epochs=3,
                        validation_data=model_1_val_ds,
                        validation_steps=model_1_val_steps)

In [None]:
# Download Universal Sentence Encoder (kept frozen via trainable=False)
use_handle = "https://tfhub.dev/google/universal-sentence-encoder/4"
tf_hub_embedding_layer = hub.KerasLayer(use_handle,
                                        trainable=False,
                                        name="universal_sentence_encoder")

# Build Model: raw string -> pretrained sentence embedding -> Dense -> softmax
inputs = layers.Input(shape=[], dtype=tf.string)
pretrained_embedding = tf_hub_embedding_layer(inputs)
x = layers.Dense(128, activation="relu")(pretrained_embedding)
outputs = layers.Dense(len(class_names), activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_USE")

# Compile and Fit
model_2.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model_2_train_ds = (tf.data.Dataset
                    .from_tensor_slices((train_sentences, train_labels_one_hot))
                    .batch(32)
                    .prefetch(tf.data.AUTOTUNE))
model_2_val_ds = (tf.data.Dataset
                  .from_tensor_slices((val_sentences, val_labels_one_hot))
                  .batch(32)
                  .prefetch(tf.data.AUTOTUNE))

# Only a tenth of the batches are used per epoch / validation pass.
model_2_train_steps = int(0.1 * len(train_sentences) // 32)
model_2_val_steps = int(0.1 * len(val_sentences) // 32)

history_2 = model_2.fit(model_2_train_ds,
                        steps_per_epoch=model_2_train_steps,
                        epochs=3,
                        validation_data=model_2_val_ds,
                        validation_steps=model_2_val_steps)

In [None]:
# Create Character Vectorizer
def split_chars(text):
    """Return `text` with a single space inserted between consecutive characters."""
    # str.join iterates the string character by character on its own.
    return " ".join(text)

# Space-separate every character so the vectorizer treats characters as tokens
train_chars = list(map(split_chars, train_sentences))
val_chars = list(map(split_chars, val_sentences))

# NOTE(review): max_tokens=28 and output_sequence_length=290 are hard-coded; confirm how they were chosen.
char_vectorizer = layers.TextVectorization(max_tokens=28,
                                           output_sequence_length=290,
                                           standardize="lower_and_strip_punctuation")
char_vectorizer.adapt(train_chars)

char_vocab_size = len(char_vectorizer.get_vocabulary())
char_embed = layers.Embedding(input_dim=char_vocab_size,
                              output_dim=25,
                              mask_zero=True,
                              name="char_embed")

# Build Model: raw string -> char ids -> embeddings -> Conv1D -> max-pooled -> softmax
inputs = layers.Input(shape=(1,), dtype=tf.string)
char_vectors = char_vectorizer(inputs)
char_embeddings = char_embed(char_vectors)
x = layers.Conv1D(64, kernel_size=5, padding="same", activation="relu")(char_embeddings)
x = layers.GlobalMaxPooling1D()(x)
outputs = layers.Dense(len(class_names), activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_3_conv1d_char")

# Compile and Fit
model_3.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model_3_train_ds = (tf.data.Dataset
                    .from_tensor_slices((train_chars, train_labels_one_hot))
                    .batch(32)
                    .prefetch(tf.data.AUTOTUNE))
model_3_val_ds = (tf.data.Dataset
                  .from_tensor_slices((val_chars, val_labels_one_hot))
                  .batch(32)
                  .prefetch(tf.data.AUTOTUNE))

# Only a tenth of the batches are used per epoch / validation pass.
model_3_train_steps = int(0.1 * len(train_sentences) // 32)
model_3_val_steps = int(0.1 * len(val_sentences) // 32)

history_3 = model_3.fit(model_3_train_ds,
                        steps_per_epoch=model_3_train_steps,
                        epochs=3,
                        validation_data=model_3_val_ds,
                        validation_steps=model_3_val_steps)

In [None]:
# Using a smaller BERT model for speed/memory efficiency (Small BERT)
tfhub_handle_encoder = "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1"
tfhub_handle_preprocess = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"

# NOTE(review): no `trainable` argument is passed to either layer; confirm whether
# the encoder is meant to stay frozen or be fine-tuned.
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
bert_model = hub.KerasLayer(tfhub_handle_encoder)

# Build Model: raw string -> BERT preprocessing -> encoder -> Dropout -> Dense -> softmax
text_input = layers.Input(shape=(), dtype=tf.string, name='text')
encoder_inputs = bert_preprocess_model(text_input)
outputs = bert_model(encoder_inputs)
net = outputs['pooled_output']  # Use the encoder's pooled output entry as the sentence feature
net = layers.Dropout(0.1)(net)
net = layers.Dense(64, activation='relu')(net)
final_output = layers.Dense(len(class_names), activation='softmax')(net)

model_4 = tf.keras.Model(text_input, final_output)

# Compile and Fit
model_4.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model_4_train_ds = (tf.data.Dataset
                    .from_tensor_slices((train_sentences, train_labels_one_hot))
                    .batch(32)
                    .prefetch(tf.data.AUTOTUNE))
model_4_val_ds = (tf.data.Dataset
                  .from_tensor_slices((val_sentences, val_labels_one_hot))
                  .batch(32)
                  .prefetch(tf.data.AUTOTUNE))

# Only a tenth of the batches are used per epoch / validation pass.
model_4_train_steps = int(0.1 * len(train_sentences) // 32)
model_4_val_steps = int(0.1 * len(val_sentences) // 32)

history_4 = model_4.fit(model_4_train_ds,
                        steps_per_epoch=model_4_train_steps,
                        epochs=3,
                        validation_data=model_4_val_ds,
                        validation_steps=model_4_val_steps)

In [None]:
# Gather results: predict class probabilities on the validation split, take the
# arg-max class per row and score it against the integer-encoded labels.
model_1_preds = model_1.predict(val_sentences)
model_1_pred_classes = tf.argmax(model_1_preds, axis=1)
model_1_results = calculate_results(y_true=val_labels_encoded, y_pred=model_1_pred_classes)

model_2_preds = model_2.predict(val_sentences)
model_2_pred_classes = tf.argmax(model_2_preds, axis=1)
model_2_results = calculate_results(y_true=val_labels_encoded, y_pred=model_2_pred_classes)

# The character model is fed the space-separated character strings.
model_3_preds = model_3.predict(val_chars)
model_3_pred_classes = tf.argmax(model_3_preds, axis=1)
model_3_results = calculate_results(y_true=val_labels_encoded, y_pred=model_3_pred_classes)

model_4_preds = model_4.predict(val_sentences)
model_4_pred_classes = tf.argmax(model_4_preds, axis=1)
model_4_results = calculate_results(y_true=val_labels_encoded, y_pred=model_4_pred_classes)

# Combine into DataFrame: one row per model, one column per metric
results_by_model = {
    "Baseline": baseline_results,
    "Conv1D Token": model_1_results,
    "Pretrained (USE)": model_2_results,
    "Conv1D Char": model_3_results,
    "BERT (LLM)": model_4_results,
}
all_results = pd.DataFrame(results_by_model).transpose()

# Scale accuracy to 0-1 so it shares an axis with precision/recall/f1
all_results["accuracy"] = all_results["accuracy"] / 100

# Plot
results_ax = all_results.plot(kind="bar", figsize=(10, 7))
results_ax.legend(bbox_to_anchor=(1.0, 1.0))
plt.title("Model Performance Comparison")
plt.show()

print(all_results)