<a href="https://colab.research.google.com/github/ALMACihan/IS584_Term_Project/blob/main/Project_Cihan_Alma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 **IS 584 Term Project - Cihan Alma**

In [None]:
!pip install -q "transformers==4.52.3" "datasets>=2.0.0" "scikit-learn>=1.3.0" "wandb>=0.16.0"


import wandb
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:
!pip install -q scikit-learn pandas


In [None]:
import pandas as pd
import csv

# Load the review CSV. The pure-Python parser is combined with
# on_bad_lines="skip" so malformed rows are dropped instead of aborting the read.
csv_options = {
    "on_bad_lines": "skip",      # successor of the older error_bad_lines flag
    "quoting": csv.QUOTE_ALL,    # quoting mode handed to the parser
    "encoding": "utf-8",
    "engine": "python",
}
df = pd.read_csv("/content/asap_review_dataset.csv", **csv_options)

# Preview the first rows.
df.head()


In [None]:
# Inspect the class balance: number of rows per aspect label.
# (Nothing is filtered here; the counts are only printed.)
label_counts = df["aspect_label"].value_counts()
print(label_counts)

# Keep a single row for every (paper, aspect, text) combination.
dedup_keys = ["paper_id", "aspect_label", "review_text"]
df = df.drop_duplicates(subset=dedup_keys)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Rows without a text or a label cannot be vectorised or stratified, so they
# are excluded up front. `df` itself is left untouched.
baseline_df = df.dropna(subset=["review_text", "aspect_label"])

# Split: hold out 20 % of the rows, stratified on the aspect label.
X_train, X_test, y_train, y_test = train_test_split(
    baseline_df["review_text"],
    baseline_df["aspect_label"],
    test_size=0.2,
    random_state=42,
    stratify=baseline_df["aspect_label"],
)

# Vectorize: uni- and bi-gram TF-IDF features, vocabulary capped at 5000 terms.
# The vectorizer is fitted on the training texts only and then applied to the test texts.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train: logistic regression with class_weight="balanced".
clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train_vec, y_train)

# Evaluate: per-class precision / recall / F1 on the held-out rows.
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))


In [None]:
# Draw a class-balanced subset: at most 300 randomly chosen rows per aspect label
import pandas as pd

# Re-read the raw CSV (this replaces the de-duplicated frame built earlier).
df = pd.read_csv(
    "/content/asap_review_dataset.csv",
    on_bad_lines='skip',        # Replaces error_bad_lines
    quoting=csv.QUOTE_ALL,      # Handle quoted fields
    encoding="utf-8",
    engine="python"
)

# Sample every label group on its own and concatenate the pieces in label
# order. groupby(...).apply(...) is avoided on purpose: pandas >= 2.2
# deprecates handing the grouping column to the applied function, and once it
# is excluded the "aspect_label" column would be missing from the result.
# Assumes the frame contains at least one labelled row.
df_sampled = pd.concat(
    [
        group.sample(n=min(len(group), 300), random_state=42)
        for _, group in df.groupby("aspect_label")
    ],
    ignore_index=True,
)


In [None]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
import wandb

# Authenticate with Weights & Biases; the training runs in this notebook are
# configured with report_to="wandb".
wandb.login()


In [None]:
# Load the CSV
df = pd.read_csv(
    "/content/asap_review_dataset.csv",
    on_bad_lines='skip',        # Replaces error_bad_lines
    quoting=csv.QUOTE_ALL,      # Handle quoted fields
    encoding="utf-8",
    engine="python"
)
# Discard rows with a missing text or label, then rows whose text is empty or
# whitespace only. The explicit copy keeps the column assignment below from
# writing into a slice of the original frame.
df = df.dropna(subset=["review_text", "aspect_label"])
df = df[df["review_text"].str.strip().astype(bool)].copy()

# Encode labels (the encoder is fitted on the cleaned frame, before down-sampling)
le = LabelEncoder()
df["label"] = le.fit_transform(df["aspect_label"])

# Keep at most 200 randomly chosen rows per aspect label. Groups are sampled
# one by one and concatenated in label order; groupby(...).apply(...) is
# avoided because pandas >= 2.2 deprecates passing the grouping column to the
# applied function. Assumes at least one row survived the cleaning above.
df = pd.concat(
    [
        group.sample(n=min(len(group), 200), random_state=42)
        for _, group in df.groupby("aspect_label")
    ],
    ignore_index=True,
)

# Create HuggingFace Dataset and hold out 20 % of it (the split is not stratified)
dataset = Dataset.from_pandas(df[["review_text", "label"]])
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# Name <-> id lookups in the encoder's class order, passed to the model config later
label2id = {label: i for i, label in enumerate(le.classes_)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(label2id)


In [None]:
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize(example):
    """Tokenize the ``review_text`` field of a batch.

    Every sequence is truncated to at most 512 tokens and padded up to
    exactly 512 tokens.

    Args:
        example: mapping with a ``"review_text"`` entry; called by
            ``dataset.map`` with ``batched=True``.

    Returns:
        The tokenizer output for the given texts.
    """
    texts = example["review_text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=512,
        padding="max_length",
    )


# Apply the tokenizer to both splits, batch by batch.
tokenized_dataset = dataset.map(tokenize, batched=True)


In [None]:
import transformers
print(transformers.__version__)

# Close any W&B run that is still open in this kernel (e.g. when the cell is
# re-executed). The Trainer's W&B integration only starts a new run when none
# is active, so without this the experiment would be logged into the old run.
if wandb.run is not None:
    wandb.finish()

# Seed before loading the model: the classification head on top of the
# pretrained encoder is freshly initialised, and the Trainer only applies its
# own seed after the model already exists.
transformers.set_seed(42)

# Pretrained DistilBERT encoder with a new classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# First run: learning rate 2e-5, batch size 8, 10 epochs.
# Evaluation, logging and checkpointing all happen once per epoch; the
# checkpoint with the highest eval accuracy is reloaded when training ends.
training_args = TrainingArguments(
    output_dir="./results",
    run_name="distilbert_lr2e-5_bs8",   # readable name for the W&B run
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)



In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy and macro-averaged F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``, both taken from
        scikit-learn's classification report (undefined ratios count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    summary = classification_report(gold, predicted, output_dict=True, zero_division=0)
    accuracy = summary["accuracy"]
    macro_f1 = summary["macro avg"]["f1-score"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Wire model, arguments, data splits and the metric function into a Trainer.
# The tokenizer is handed over as `processing_class`; the `tokenizer=` keyword
# is deprecated in the transformers release pinned by this notebook.
# NOTE(review): the "test" split is used both for picking the best checkpoint
# and for the reported scores, so those scores are likely optimistic.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation runs at the end of every epoch (eval_strategy="epoch").
trainer.train()


In [None]:
# Score the evaluation split once more after training. Because the run was
# configured with load_best_model_at_end=True, the Trainer is expected to hold
# the checkpoint with the best eval accuracy at this point.
results = trainer.evaluate()
print("Final Eval Results:", results)


## Second training run

Changes relative to the first run: `learning_rate=5e-5` and a per-device batch size of 16 (training and evaluation).



In [None]:
import transformers
print(transformers.__version__)

# Close the W&B run of the previous experiment. The Trainer's W&B integration
# only starts a new run when none is active, so without this the metrics of
# this experiment would be mixed into the previous run.
if wandb.run is not None:
    wandb.finish()

# Seed before loading the model: the classification head is freshly
# initialised, and the Trainer only applies its own seed after the model
# already exists. The same seed across experiments keeps them comparable.
transformers.set_seed(42)

# Fresh pretrained model so this run does not continue from earlier fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Second run: learning rate 5e-5, batch size 16.
# A dedicated output directory keeps these checkpoints from mixing with, or
# overwriting, the checkpoints the first run wrote to "./results".
training_args = TrainingArguments(
    output_dir="./results_lr5e-5_bs16",
    run_name="distilbert_lr5e-5_bs16",   # readable name for the W&B run
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)



In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy and macro-averaged F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``, both taken from
        scikit-learn's classification report (undefined ratios count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    summary = classification_report(gold, predicted, output_dict=True, zero_division=0)
    accuracy = summary["accuracy"]
    macro_f1 = summary["macro avg"]["f1-score"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Wire model, arguments, data splits and the metric function into a Trainer.
# The tokenizer is handed over as `processing_class`; the `tokenizer=` keyword
# is deprecated in the transformers release pinned by this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation runs at the end of every epoch (eval_strategy="epoch").
trainer.train()


In [None]:
# Score the evaluation split once more after the second training run. Because
# the run was configured with load_best_model_at_end=True, the Trainer is
# expected to hold the checkpoint with the best eval accuracy at this point.
results = trainer.evaluate()
print("Final Eval Results:", results)


In [None]:
import transformers
print(transformers.__version__)

# Close the W&B run of the previous experiment. The Trainer's W&B integration
# only starts a new run when none is active, so without this the metrics of
# this experiment would be mixed into the previous run.
if wandb.run is not None:
    wandb.finish()

# Seed before loading the model: the classification head is freshly
# initialised, and the Trainer only applies its own seed after the model
# already exists. The same seed across experiments keeps them comparable.
transformers.set_seed(42)

# Fresh pretrained model so this run does not continue from earlier fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Baseline configuration: learning rate 2e-5, batch size 8, weight decay 0.01,
# with checkpoints stored in a directory of their own.
training_args = TrainingArguments(
    output_dir="./results_baseline",
    run_name="distilbert_baseline",   # readable name for the W&B run
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)




In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy and macro-averaged F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``, both taken from
        scikit-learn's classification report (undefined ratios count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    summary = classification_report(gold, predicted, output_dict=True, zero_division=0)
    accuracy = summary["accuracy"]
    macro_f1 = summary["macro avg"]["f1-score"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Wire model, arguments, data splits and the metric function into a Trainer.
# The tokenizer is handed over as `processing_class`; the `tokenizer=` keyword
# is deprecated in the transformers release pinned by this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation runs at the end of every epoch (eval_strategy="epoch").
trainer.train()


In [None]:
import transformers
print(transformers.__version__)

# Close the W&B run of the previous experiment. The Trainer's W&B integration
# only starts a new run when none is active, so without this the metrics of
# this experiment would be mixed into the previous run.
if wandb.run is not None:
    wandb.finish()

# Seed before loading the model: the classification head is freshly
# initialised, and the Trainer only applies its own seed after the model
# already exists. The same seed across experiments keeps them comparable.
transformers.set_seed(42)

# Fresh pretrained model so this run does not continue from earlier fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Batch-size experiment: learning rate 2e-5 and weight decay 0.01 as in the
# baseline configuration, but a per-device batch size of 16.
training_args = TrainingArguments(
    output_dir="./results_batch16",
    run_name="distilbert_batch16",   # readable name for the W&B run
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)




In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy and macro-averaged F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``, both taken from
        scikit-learn's classification report (undefined ratios count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    summary = classification_report(gold, predicted, output_dict=True, zero_division=0)
    accuracy = summary["accuracy"]
    macro_f1 = summary["macro avg"]["f1-score"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Wire model, arguments, data splits and the metric function into a Trainer.
# The tokenizer is handed over as `processing_class`; the `tokenizer=` keyword
# is deprecated in the transformers release pinned by this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation runs at the end of every epoch (eval_strategy="epoch").
trainer.train()


In [None]:
import transformers
print(transformers.__version__)

# Close the W&B run of the previous experiment. The Trainer's W&B integration
# only starts a new run when none is active, so without this the metrics of
# this experiment would be mixed into the previous run.
if wandb.run is not None:
    wandb.finish()

# Seed before loading the model: the classification head is freshly
# initialised, and the Trainer only applies its own seed after the model
# already exists. The same seed across experiments keeps them comparable.
transformers.set_seed(42)

# Fresh pretrained model so this run does not continue from earlier fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Regularisation experiment: baseline learning rate and batch size, but with
# a higher weight decay and a learning-rate warm-up.
training_args = TrainingArguments(
    output_dir="./results_warmup_wd",
    run_name="distilbert_warmup_wd",   # readable name for the W&B run
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.05,           # new
    warmup_steps=500,            # new
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="wandb"
)




In [None]:
def compute_metrics(eval_pred):
    """Summarise an evaluation pass as accuracy and macro-averaged F1.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` is reduced
            with an argmax over axis 1, so it is expected to hold one row of
            per-class scores per example.

    Returns:
        dict with the keys ``"accuracy"`` and ``"macro_f1"``, both taken from
        scikit-learn's classification report (undefined ratios count as 0).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    summary = classification_report(gold, predicted, output_dict=True, zero_division=0)
    accuracy = summary["accuracy"]
    macro_f1 = summary["macro avg"]["f1-score"]
    return {"accuracy": accuracy, "macro_f1": macro_f1}

# Wire model, arguments, data splits and the metric function into a Trainer.
# The tokenizer is handed over as `processing_class`; the `tokenizer=` keyword
# is deprecated in the transformers release pinned by this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune; evaluation runs at the end of every epoch (eval_strategy="epoch").
trainer.train()
