In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# --- Data: read the pair dataset and keep only rows that have both text and label ---
df = pd.read_csv("alz_disease_pairs_cleaned.csv").dropna(
    subset=["input_text", "relation_label"]
)

# Map each relation string to an integer class ID, stored next to the text
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["relation_label"])

# --- Split: 20% held out, stratified on the class IDs, fixed seed ---
X, y = df["input_text"], df["label_id"]
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Model: TF-IDF over 1- and 2-grams feeding a class-weighted logistic regression ---
baseline_pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2))),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=1000)),
])

# Fit on the training split, then predict the held-out split
y_pred = baseline_pipeline.fit(train_X, train_y).predict(test_X)

# Per-class precision/recall/F1, with rows named by the original label strings
report = classification_report(
    test_y, y_pred, target_names=label_encoder.classes_
)

print("TF-IDF + Logistic Regression Baseline Results:\n")
print(report)


TF-IDF + Logistic Regression Baseline Results:

                 precision    recall  f1-score   support

      ambiguous       0.57      0.89      0.70        18
associated_with       1.00      0.54      0.70        13
     equivalent       0.40      0.25      0.31         8
      unrelated       1.00      0.50      0.67         2

       accuracy                           0.63        41
      macro avg       0.74      0.54      0.59        41
   weighted avg       0.69      0.63      0.62        41



# Average Embedding Baseline (GloVe + Logistic Regression)

In [None]:
import os
import zipfile
import numpy as np
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Width of each vector in glove.6B.100d.txt. Defined before loading so rows can
# be validated against it; also used to size the all-zero fallback vector.
EMBED_DIM = 100

# Download GloVe embeddings
if not os.path.exists("glove.6B.100d.txt"):
    # Reuse an intact archive left by a previous run instead of fetching it again.
    if not (os.path.exists("glove.zip") and zipfile.is_zipfile("glove.zip")):
        print("🔽 Downloading GloVe...")
        url = "http://nlp.stanford.edu/data/glove.6B.zip"
        # Stream the archive to disk in chunks rather than holding the whole
        # response body in memory, and fail loudly on an HTTP error so an error
        # page is never saved as if it were the zip file.
        with requests.get(url, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open("glove.zip.part", "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Rename only after the download finished, so an interrupted run does
        # not leave a truncated glove.zip behind.
        os.replace("glove.zip.part", "glove.zip")
    with zipfile.ZipFile("glove.zip", "r") as zip_ref:
        zip_ref.extract("glove.6B.100d.txt")

# Load GloVe into a dictionary
print("📦 Loading GloVe vectors...")
embeddings = {}
skipped_lines = 0
with open("glove.6B.100d.txt", "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().split()
        # A well-formed row is one token followed by EMBED_DIM numbers. Rows of
        # any other width (e.g. from a partially extracted file) are skipped so
        # that every stored vector has the same length.
        if len(parts) != EMBED_DIM + 1:
            skipped_lines += 1
            continue
        word = parts[0]
        vector = np.array(parts[1:], dtype=np.float32)
        embeddings[word] = vector

if skipped_lines:
    print(f"⚠️ Skipped {skipped_lines} malformed line(s) in glove.6B.100d.txt")

# Average embeddings for each sentence
def average_embedding(text, vectors=None, dim=None, tokenize=None):
    """Return the mean word vector of ``text``.

    The text is lower-cased and tokenized; tokens missing from the embedding
    table are ignored and the remaining vectors are averaged element-wise.

    Parameters
    ----------
    text : str
        Input sentence or phrase.
    vectors : mapping of str -> array-like, optional
        Embedding table to look tokens up in. Defaults to the module-level
        ``embeddings`` dictionary.
    dim : int, optional
        Length of the zero vector returned when no token is found. Defaults to
        the module-level ``EMBED_DIM``.
    tokenize : callable, optional
        Function mapping a string to a list of tokens. Defaults to NLTK's
        ``word_tokenize``.

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors found, or a float32 zero vector of
        length ``dim`` when none of the tokens is in the table.
    """
    # Defaults are resolved at call time so the notebook globals can be
    # (re)defined after this function without re-running this cell.
    table = embeddings if vectors is None else vectors
    size = EMBED_DIM if dim is None else dim
    split_tokens = word_tokenize if tokenize is None else tokenize

    tokens = split_tokens(text.lower())
    found = [table[token] for token in tokens if token in table]
    if found:
        return np.mean(found, axis=0)
    # float32 matches the dtype the GloVe vectors are stored with; a float64
    # fallback would upcast the whole feature matrix once rows are stacked.
    return np.zeros(size, dtype=np.float32)

# --- Data: reload the pair dataset, drop rows missing text or label, encode labels ---
df = pd.read_csv("alz_disease_pairs_cleaned.csv")
df = df.dropna(subset=["input_text", "relation_label"])
label_encoder = LabelEncoder()
df["label_id"] = label_encoder.fit_transform(df["relation_label"])

# --- Features: one averaged vector per input text, stacked into a 2-D matrix ---
print("🧠 Averaging sentence embeddings...")
X = np.vstack(list(map(average_embedding, tqdm(df["input_text"]))))
y = df["label_id"].values

# --- Split: 20% held out, stratified on the class IDs, fixed seed ---
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# --- Model: class-weighted logistic regression on the averaged vectors ---
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
clf.fit(train_X, train_y)

# --- Evaluation: per-class report on the held-out split ---
y_pred = clf.predict(test_X)
print("📊 Average Embedding Baseline Performance:\n")
print(
    classification_report(
        test_y, y_pred, target_names=label_encoder.classes_
    )
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Annie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


🔽 Downloading GloVe...
📦 Loading GloVe vectors...
🧠 Averaging sentence embeddings...


100%|██████████| 201/201 [00:00<00:00, 4343.27it/s]

📊 Average Embedding Baseline Performance:

                 precision    recall  f1-score   support

      ambiguous       0.60      0.67      0.63        18
associated_with       0.75      0.46      0.57        13
     equivalent       0.36      0.50      0.42         8
      unrelated       0.50      0.50      0.50         2

       accuracy                           0.56        41
      macro avg       0.55      0.53      0.53        41
   weighted avg       0.60      0.56      0.57        41




