In [1]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

import torch
import pandas as pd

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
df = pd.read_csv("../Data/preprocessed/stance_preprocessed.csv")

In [11]:
df["combined"] = (
        df["headline_prep"].fillna("").astype(str) +
        " [SEP] " +
        df["body_prep"].fillna("").astype(str)
)

X = df["combined"].astype(str).tolist()

In [12]:
le = LabelEncoder()
y = le.fit_transform(df["Stance"])

In [13]:
num_classes = len(le.classes_)

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [15]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")



In [16]:
max_len = 128

In [17]:
print(type(X_train[0]))
print(X_train[:5])

<class 'str'>
['accountant claim comcast got fired job reporting customer service issue [SEP] innovative might first word come mind describing u telephone cable company comcast giving company credit credit due let u face harrowing minute customer service call went viral july looked like comcast nothing strive reached peak horrible yet report week accurate comcast continued shine innovator competitive field corporate terribleness got customer employer fire complaining shoddy customer service full story sadistic glory consumerist sparknotes version customer named conal started subscribing comcast service early issue erroneous charge bill never arrived discount applied conal considered cancelling service decided stick comcast rep promised issue would resolved threw free perk another friendly gesture company also allegedly sent billed conal equipment would never ordered get crazy conal understandably frustrated decided bypass customer service department take complaint straight comcast cont

In [18]:
train_enc = tokenizer(
    X_train,
    padding="max_length",
    truncation=True,
    max_length=max_len,
    return_tensors="pt"
)

In [19]:
val_enc = tokenizer(
    X_val,
    padding="max_length",
    truncation=True,
    max_length=max_len,
    return_tensors="pt"
)

In [20]:
train_input_ids = train_enc["input_ids"]
train_attention = train_enc["attention_mask"]
train_labels_tensor = torch.tensor(y_train)

In [21]:
val_input_ids = val_enc["input_ids"]
val_attention = val_enc["attention_mask"]
val_labels_tensor = torch.tensor(y_val)

In [22]:
device = "cpu"

In [23]:
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_classes
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [25]:
criterion = torch.nn.CrossEntropyLoss()

In [26]:
batch_size = 8
num_epochs = 2

In [27]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i in range(0, len(train_input_ids), batch_size):
        batch_input = train_input_ids[i:i+batch_size].to(device)
        batch_mask  = train_attention[i:i+batch_size].to(device)
        batch_labels = train_labels_tensor[i:i+batch_size].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=batch_input, attention_mask=batch_mask)
        logits = outputs.logits

        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {total_loss:.4f}")

Epoch 1/2 - Loss: 1167.4377
Epoch 2/2 - Loss: 504.6964


In [28]:
from torch.utils.data import TensorDataset, DataLoader

In [29]:
val_dataset = TensorDataset(val_input_ids, val_attention, val_labels_tensor)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

Create validation batch otherwise, it requires over 50GB of memory to validate the model

In [30]:
model.eval()
all_preds = []
all_labels = []

In [31]:
all_logits = []

In [32]:
with torch.no_grad():
    for ids, mask, labels in val_loader:
        ids = ids.to(device)
        mask = mask.to(device)

        logits = model(input_ids=ids, attention_mask=mask).logits
        preds = logits.argmax(dim=1).cpu().numpy()

        all_preds.extend(preds)
        all_labels.extend(labels.numpy())

In [33]:
acc  = accuracy_score(all_labels, all_preds)
prec = precision_score(all_labels, all_preds, average="weighted")
rec  = recall_score(all_labels, all_preds, average="weighted")
f1   = f1_score(all_labels, all_preds, average="weighted")

In [34]:
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1 Score:  {f1:.4f}")

Accuracy:  0.9671
Precision: 0.9704
Recall:    0.9671
F1 Score:  0.9682


In [35]:
model.save_pretrained("distilbert_stanceD")
tokenizer.save_pretrained("distilbert_tokenizer_stanceD")

('distilbert_tokenizer_stanceD\\tokenizer_config.json',
 'distilbert_tokenizer_stanceD\\special_tokens_map.json',
 'distilbert_tokenizer_stanceD\\vocab.txt',
 'distilbert_tokenizer_stanceD\\added_tokens.json',
 'distilbert_tokenizer_stanceD\\tokenizer.json')