In [1]:
# Prints a fixed greeting. NOTE(review): presumably a quick check that the
# kernel executes cells -- confirm, and drop the cell if it is not needed.
print("Hello")

Hello


In [16]:
!pip install tqdm 

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable


In [5]:
import torch
from torch.utils.data import DataLoader
from transformers import (
        AutoTokenizer,
        AutoModelForSequenceClassification,
        T5ForSequenceClassification
    )

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Pick the compute device: CUDA when torch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [7]:
# Define Sentiment Predictor Class
class SentimentPredictor:
    """Score texts with a sequence-classification model and its tokenizer.

    Nothing in the implementation is sentiment-specific: label names are read
    from ``model.config.id2label``, so any classifier exposing ``.logits``
    and that config attribute can be wrapped.

    Args:
        model: Model whose call returns an object with a ``.logits`` tensor.
            It is put in eval mode and moved to ``device``.
        tokenizer: Callable that turns a list of strings into model inputs
            and whose result supports ``.to(device)`` and ``**`` unpacking.
        device: Device the model and the tokenized inputs are moved to.
        max_length: Token limit used for truncation (default 512, the value
            that was previously hard-coded).
    """

    def __init__(self, model, tokenizer, device, max_length: int = 512):
        self.model = model.eval().to(device)
        self.tokenizer = tokenizer
        self.device = device
        self.max_length = max_length

    def predict(self, text: str):
        """Score a single text.

        Returns:
            A one-element batch: ``[[{"label": ..., "score": ...}, ...]]``.
            Callers index ``[0]`` to get the per-label scores for ``text``.
        """
        return self.predict_batch([text])

    def predict_batch(self, texts: list):
        """Score a batch of texts.

        Args:
            texts: Strings to classify.

        Returns:
            One list per input text, each holding a ``{"label", "score"}``
            dict per class in label-id order; scores are softmax
            probabilities rounded to 4 decimals. An empty list/tuple yields
            ``[]`` without invoking the tokenizer or the model.
        """
        # Nothing to score: avoid handing an empty batch to tokenizer/model.
        if isinstance(texts, (list, tuple)) and len(texts) == 0:
            return []

        inputs = self.tokenizer(
            texts,
            return_tensors="pt",
            truncation=True,
            # Pad only up to the longest item in this batch. Padding every
            # input to max_length made a single short sentence cost a full
            # max_length-token forward pass.
            padding="longest",
            max_length=self.max_length,
        ).to(self.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=-1)

        id2label = self.model.config.id2label

        # One bulk conversion to Python floats instead of an .item() call
        # (and a device sync) per probability.
        return [
            [
                {"label": id2label[idx], "score": round(score, 4)}
                for idx, score in enumerate(row)
            ]
            for row in probs.tolist()
        ]

In [9]:
 # Load model & tokenizer from local checkpoint
# The model weights and the tokenizer files are read from the same checkpoint
# directory, so the path is spelled out once and reused for both loads.
T5_AG_NEWS_CHECKPOINT = "/scratch/gilbreth/abelde/NLP_Score_Based_Attacks/saved_models/T5-classifier/T5-classifier-agnews"
model_T5_Ag_News = AutoModelForSequenceClassification.from_pretrained(T5_AG_NEWS_CHECKPOINT)
tokenizer_T5_Ag_News = AutoTokenizer.from_pretrained(T5_AG_NEWS_CHECKPOINT)

In [10]:
# Bundle the loaded model and tokenizer with the chosen device into one predictor.
T5_predictor = SentimentPredictor(
    model=model_T5_Ag_News,
    tokenizer=tokenizer_T5_Ag_News,
    device=device,
)

In [13]:
# Spot-check the predictor on one news snippet. The bare call on the last
# line makes the notebook display the per-label scores.
sample_article = "Bush Vows Rapid Aid to Hurricane Victims PUNTA GORDA, Fla. - Residents left homeless by Hurricane Charley's 145 mph winds dug through their ravaged homes on Sunday, sweeping up shattered glass and rescuing what they could as President Bush promised rapid delivery of disaster aid..."
T5_predictor.predict(sample_article)

[[{'label': 'World', 'score': 0.4128},
  {'label': 'Sports', 'score': 0.2319},
  {'label': 'Business', 'score': 0.1805},
  {'label': 'Sci/Tech', 'score': 0.1748}]]

In [17]:
import pandas as pd
import random
from tqdm import tqdm

# Evaluation settings, named once so the sampling, the batching and the final
# report cannot drift apart.
EVAL_SEED = 30
EVAL_SAMPLE_SIZE = 1000
EVAL_BATCH_SIZE = 16

# Load full test dataset
test_path = "/scratch/gilbreth/abelde/NLP_Score_Based_Attacks/data/ag_news/ag_news_test.csv"
df = pd.read_csv(test_path)

# Seed Python's global RNG as before. Note that the subsample below is made
# reproducible by `random_state`, not by this call.
random.seed(EVAL_SEED)
# Cap the request at the number of available rows so a short file does not
# make DataFrame.sample raise.
df_sampled = df.sample(
    n=min(EVAL_SAMPLE_SIZE, len(df)),
    random_state=EVAL_SEED,
).reset_index(drop=True)
if df_sampled.empty:
    raise ValueError(f"No rows available for evaluation in {test_path}")

# True labels
true_labels = df_sampled["label"].tolist()

# Label mapping (id ↔ text)
label_map = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}
# Convert true int labels to text BEFORE running inference, so an unexpected
# label id fails immediately (KeyError) instead of after every prediction
# has already been computed.
true_label_names = [label_map[l] for l in true_labels]

# Predict labels in batches rather than one forward pass per sample.
texts = df_sampled["text"].tolist()
predicted_labels = []
for start in tqdm(range(0, len(texts), EVAL_BATCH_SIZE), desc="Predicting"):
    batch_outputs = T5_predictor.predict_batch(texts[start:start + EVAL_BATCH_SIZE])
    # Get top-1 label for every sample in the batch
    predicted_labels.extend(
        max(scores, key=lambda x: x["score"])["label"] for scores in batch_outputs
    )

# Compute accuracy
correct = sum(pred == true for pred, true in zip(predicted_labels, true_label_names))
accuracy = correct / len(true_label_names)

# Report the sample count and seed actually used instead of hard-coded values.
print(f"✅ Accuracy on {len(true_label_names)} samples (seed={EVAL_SEED}): {accuracy:.4f}")


ImportError: C extension: None not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.

In [18]:
import pandas

ImportError: C extension: None not built. If you want to import pandas from the source directory, you may need to run 'python setup.py build_ext' to build the C extensions first.

In [19]:
# Attempt to follow the hint in the pandas ImportError by building C
# extensions. The captured output shows it fails because there is no
# setup.py in the notebook's working directory.
# NOTE(review): the hint applies to importing pandas from a source checkout;
# the error here presumably comes from a broken or mismatched pandas install,
# so reinstalling pandas in the kernel environment is likely the actual fix
# -- confirm, then remove this cell.
!python setup.py build_ext

python: can't open file '/scratch/gilbreth/abelde/NLP_Score_Based_Attacks/scripts/setup.py': [Errno 2] No such file or directory


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
