In [5]:
# Create a virtual environment in ./.venv using the python3 found on PATH.
!python3 -m venv .venv

In [6]:
# NOTE(review): each `!` line runs in its own short-lived subshell, so this
# activation presumably does not carry over to later cells or to the kernel's
# interpreter — confirm, and consider `%pip install ...` to target the kernel.
!source .venv/bin/activate

In [7]:
#!pip install -r requirements.txt

In [8]:
#pip install --user --no-cache-dir -r requirements.txt

In [9]:
#pip install -q ipywidgets==8.1.2 jupyterlab_widgets

In [None]:
import json
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression  # kept import to preserve structure (not used)
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.utils import resample
from sklearn.metrics import f1_score, classification_report
from sklearn.calibration import CalibratedClassifierCV
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from datasets import Dataset
from scipy.special import softmax
import torch
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus quietly, then collect the English entries
# into a set that clean_text uses for membership checks.
nltk.download('stopwords', quiet=True)
stop_words = set()
stop_words.update(stopwords.words('english'))

def clean_text(text):
    """Strip URLs, e-mail-like tokens, non-letter characters and English stop words.

    Letter case is preserved (the BioBERT checkpoint used later is cased);
    stop words are matched case-insensitively against ``stop_words``.

    Args:
        text: Raw input string. Any non-string value (``None``, or the NaN
            float pandas produces for a missing field) is treated as empty.

    Returns:
        str: The surviving tokens joined by single spaces; ``''`` when nothing
        survives cleaning or when the input was not a string.
    """
    # Missing fields arrive as None/NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Keep case for BioBERT (cased) while still cleaning text.
    # If you ever switch back to uncased models, you can re-enable lowercasing.
    # text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # drop URLs
    text = re.sub(r'\S+@\S+', '', text)  # drop e-mail-like tokens
    # NOTE(review): non-letters are deleted rather than replaced by a space,
    # so "marker-assisted" becomes "markerassisted" — confirm this is intended.
    text = re.sub(r'[^A-Za-z\s]', '', text)  # keep case, strip non-letters
    tokens = text.split()
    # compare to stopwords in lowercase so we can preserve token case
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

# Load the labelled corpus (a JSON collection of records providing the Title,
# Abstract and Category fields used below).
with open("QTL_text.json", "r", encoding="utf-8") as f:
    data = json.load(f)

df = pd.DataFrame(data)
# Fill missing fields before concatenating: str + NaN yields NaN, which would
# discard the field that *is* present for that record.
df['text'] = (df['Title'].fillna('') + ' ' + df['Abstract'].fillna('')).apply(clean_text)
df['label'] = df['Category'].astype(int)

# Balance the two classes by downsampling whichever one is larger. The larger
# class is detected from the data instead of assuming it is label 0, because
# resample(replace=False) cannot draw more rows than the frame contains.
df_neg = df[df.label == 0]
df_pos = df[df.label == 1]
if len(df_neg) >= len(df_pos):
    df_majority, df_minority = df_neg, df_pos
else:
    df_majority, df_minority = df_pos, df_neg
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)
# Shuffle so the two classes are interleaved before splitting.
df_balanced = pd.concat([df_majority_downsampled, df_minority]).sample(frac=1, random_state=42)

# Stratified 80/20 train/validation split on the balanced data.
X = df_balanced['text'].tolist()
y = df_balanced['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# TF-IDF + SVM (replacing Logistic Regression), then calibrate for probabilities.
# LinearSVC is seeded like the other stochastic steps in this notebook so the
# grid search and the calibration fit are repeatable from run to run.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('clf', LinearSVC(class_weight='balanced', random_state=42))
])

# Hyper-parameter grid: 2 * 2 * 2 * 2 * 3 = 48 candidates, each scored with
# 5-fold stratified CV on binary F1.
param_grid = {
    'vectorizer__ngram_range': [(1, 2), (1, 3)],
    'vectorizer__max_df': [0.8, 0.9],
    'vectorizer__min_df': [1, 2],
    'vectorizer__max_features': [10000, 15000],
    'clf__C': [0.1, 1, 10]  # SVM regularization strength
}

grid = GridSearchCV(pipeline, param_grid, scoring='f1', cv=StratifiedKFold(n_splits=5), verbose=1, n_jobs=-1)
grid.fit(X_train, y_train)

# LinearSVC exposes no predict_proba; wrap the best pipeline in a sigmoid
# calibrator (5-fold) to obtain probabilities for the ensemble.
calibrated_model = CalibratedClassifierCV(grid.best_estimator_, method='sigmoid', cv=5)
calibrated_model.fit(X_train, y_train)

# keep original variable name to preserve structure; this now holds SVM probabilities
val_probs_lr = calibrated_model.predict_proba(X_val)[:, 1]

# BioBERT fine-tuning (swap from bert-base-uncased): load the cased BioBERT
# tokenizer and wrap the train/validation splits as two-column frames.
bio_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(bio_name)
train_df = pd.DataFrame(dict(text=X_train, label=y_train))
val_df = pd.DataFrame(dict(text=X_val, label=y_val))

def tokenize_fn(example):
    """Tokenize the ``"text"`` entry of ``example`` with the module-level tokenizer.

    Sequences are truncated and padded to a fixed length of 256 tokens.
    Returns whatever the tokenizer call returns.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=256)
    return tokenizer(example["text"], **encode_options)

# Convert both frames to Hugging Face datasets and tokenize them in batches.
train_ds = Dataset.from_pandas(train_df).map(tokenize_fn, batched=True)
val_ds = Dataset.from_pandas(val_df).map(tokenize_fn, batched=True)

# Expose only the model inputs and the label, as torch tensors.
for split_ds in (train_ds, val_ds):
    split_ds.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# Binary sequence-classification head on top of the BioBERT checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(bio_name, num_labels=2)

# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./bert_output",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best "f1" when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Optimisation settings.
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32
)

def compute_metrics(pred):
    """Compute accuracy and F1 from an object exposing ``predictions`` and ``label_ids``.

    The predicted class for each row is the argmax of ``pred.predictions``
    along axis 1. Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    gold = pred.label_ids
    predicted = np.argmax(pred.predictions, axis=1)
    accuracy = (gold == predicted).mean()
    f1 = f1_score(gold, predicted)
    return {"accuracy": accuracy, "f1": f1}

# Stop training after 2 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

trainer.train()

# Positive-class probabilities on the validation split (softmax over the
# two output columns, keep column 1).
val_outputs_bert = trainer.predict(val_ds)
val_probs_bert = softmax(val_outputs_bert.predictions, axis=1)[:, 1]

# Ensemble (keep same structure/variable names; val_probs_lr now comes from calibrated SVM).
# Each model's positive-class probability is weighted by its share of the
# combined validation F1 at a 0.5 threshold.
f1_lr = f1_score(y_val, (val_probs_lr >= 0.5).astype(int))      # SVM F1 (name preserved)
f1_bert = f1_score(y_val, (val_probs_bert >= 0.5).astype(int))
f1_total = f1_lr + f1_bert
if f1_total > 0:
    w_lr = f1_lr / f1_total
    w_bert = f1_bert / f1_total
else:
    # Both models scored F1 == 0: 0/0 would yield NaN weights and therefore
    # NaN ensemble probabilities, so fall back to a plain average.
    w_lr = w_bert = 0.5

ensemble_val_probs = (w_lr * val_probs_lr) + (w_bert * val_probs_bert)
ensemble_val_preds = (ensemble_val_probs >= 0.5).astype(int)
# Note: the headline number below is the *weighted*-average F1, whereas the
# per-model scores used for the weights are binary F1.
print("Validation F1 Score (Ensemble):", f1_score(y_val, ensemble_val_preds, average="weighted"))
print(classification_report(y_val, ensemble_val_preds))

In [None]:
# Test predictions: score the unlabeled TSV with both models, blend them with
# the validation-derived weights, and write the submission file.
test_df = pd.read_csv("QTL_test_unlabeled.tsv", sep="\t")
# Fill missing fields before concatenating: str + NaN yields NaN, which would
# discard the field that *is* present for that record.
test_df['text'] = (test_df['Title'].fillna('') + ' ' + test_df['Abstract'].fillna('')).apply(clean_text)
X_test = test_df['text'].tolist()
test_probs_lr = calibrated_model.predict_proba(X_test)[:, 1]

# No label column here, so only the model inputs are exposed as tensors.
test_raw_df = pd.DataFrame({"text": X_test})
test_ds = Dataset.from_pandas(test_raw_df).map(tokenize_fn, batched=True)
test_ds.set_format("torch", columns=["input_ids", "attention_mask"])
test_outputs_bert = trainer.predict(test_ds)
test_probs_bert = softmax(test_outputs_bert.predictions, axis=1)[:, 1]

ensemble_test_probs = (w_lr * test_probs_lr) + (w_bert * test_probs_bert)
ensemble_test_preds = (ensemble_test_probs >= 0.5).astype(int)

submission = pd.DataFrame({
    "PMID": test_df["PMID"],
    "Label": ensemble_test_preds
})
submission.to_csv("Project 2_Submission_final.csv", index=False)
print("Project 2_Submission_final.csv")