In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load dataset
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook on another machine.
csv_path = '/Users/amishi/Downloads/anti_doping_testimonies.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Title,Testimony
0,Suspicious Behavior at Gym,I noticed an athlete injecting themselves with...
1,Supplier Name in Conversation,I overheard a conversation where two athletes ...
2,Package Exchange at Competition,"During a recent competition, I saw a coach han..."
3,Unusual Medical Treatment,One athlete I train with regularly has been vi...
4,Bulk Purchase of Supplements,"I work at a store that sells supplements, and ..."


In [19]:
# Display the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(10000, 2)

In [3]:
# Encode labels: learn the set of distinct titles, then store each row's
# integer class id in a new 'Label' column.
label_encoder = LabelEncoder()
label_encoder.fit(data['Title'])
data['Label'] = label_encoder.transform(data['Title'])

In [4]:
# Split into training and testing sets (20% held out, fixed seed for a
# repeatable split).
testimony_col = data['Testimony']
label_col = data['Label']
split_parts = train_test_split(
    testimony_col,
    label_col,
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts

In [5]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

  torch.utils._pytree._register_pytree_node(


In [6]:
# Tokenization: load the tokenizer for the pretrained checkpoint.
pretrained_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)

In [7]:
# Dataset Preparation
class TestimonyDataset(Dataset):
    """Map-style dataset that pairs each testimony text with its class label.

    Each item is tokenized on access and returned as
    ``(encoding_dict, label_tensor)``.

    Args:
        texts: Sequence of raw strings. May be a pandas Series (indexed
            positionally via ``.iloc``) or any plain sequence such as a list.
        labels: Sequence of integer class ids, same length as ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` whose
            result exposes ``.items()`` yielding tensors with a leading
            batch dimension of size 1.
        max_len: Length every encoded sequence is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail fast: a mismatch would otherwise surface as an index error
        # (or silently wrong pairs) in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _item_at(values, idx):
        """Return the element at position ``idx`` of a Series or plain sequence."""
        # After train_test_split a Series keeps its shuffled original index,
        # so positional access must go through .iloc; lists/arrays have no
        # .iloc and are indexed directly.
        positional = getattr(values, "iloc", None)
        return values[idx] if positional is None else positional[idx]

    def __len__(self):
        """Number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx`` and return ``(encoding_dict, label_tensor)``."""
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)
        encoding = self.tokenizer(
            text, truncation=True, padding="max_length", max_length=self.max_len, return_tensors="pt"
        )
        # Drop the size-1 batch dimension so the DataLoader can stack items.
        features = {key: val.squeeze(0) for key, val in encoding.items()}
        # Force int64: torch.tensor() would otherwise inherit e.g. a numpy
        # int32 dtype, which the classification loss rejects as a target.
        return features, torch.tensor(label, dtype=torch.long)

In [8]:
# Create Datasets: one per split, both sharing the same tokenizer.
train_dataset = TestimonyDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
test_dataset = TestimonyDataset(
    texts=test_texts,
    labels=test_labels,
    tokenizer=tokenizer,
)

In [9]:
# DataLoaders: shuffle only the training split; evaluation order stays fixed.
loader_batch_size = 8
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=loader_batch_size)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size)

In [10]:
# The AdamW class bundled with transformers was deprecated in favour of the
# PyTorch optimizer and is no longer exported by recent transformers releases,
# so import the PyTorch implementation instead.
# NOTE(review): torch.optim.AdamW defaults to weight_decay=0.01, whereas the
# transformers class defaulted to 0.0 — pass weight_decay=0.0 when building
# the optimizer if the previous behaviour must be reproduced exactly.
from torch.optim import AdamW

In [11]:
# Model Initialization: size the classification head to the number of
# distinct encoded labels, then attach the optimizer to all parameters.
num_classes = len(data['Label'].unique())
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_classes,
)
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Training Loop setup: prefer the GPU when one is available, otherwise fall
# back to the CPU, and move the model's parameters there.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
# Fine-tune for a fixed number of passes over the training data, reporting
# the mean per-batch loss after each pass.
for epoch in range(3):
    model.train()  # training mode for this epoch
    total_loss = 0
    for batch_inputs, batch_labels in train_loader:
        # Move every tensor in the batch onto the same device as the model.
        batch_inputs = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        # Passing labels makes the model compute the loss itself.
        loss = model(**batch_inputs, labels=batch_labels).loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

Epoch 1, Loss: 0.07066068850527517
Epoch 2, Loss: 0.000572715820162557
Epoch 3, Loss: 0.0001755527284258278


In [14]:
from sklearn.metrics import classification_report

In [15]:
# Evaluation Loop: collect the arg-max class prediction and the true label
# for every test example, without tracking gradients.
model.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for batch_inputs, batch_labels in test_loader:
        batch_inputs = {name: tensor.to(device) for name, tensor in batch_inputs.items()}
        batch_labels = batch_labels.to(device)
        logits = model(**batch_inputs).logits
        predicted_ids = logits.argmax(dim=-1)
        # Bring results back to the CPU before converting to numpy.
        predictions.extend(predicted_ids.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())



In [16]:
# Pin the label ids explicitly. Without `labels`, the report infers them from
# the values present in true_labels/predictions, so a class missing from the
# test split would make the number of target_names disagree with the inferred
# labels. LabelEncoder assigns ids 0..n_classes-1 in the order of `classes_`.
class_ids = list(range(len(label_encoder.classes_)))
print(classification_report(true_labels, predictions, labels=class_ids, target_names=label_encoder.classes_))

                                 precision    recall  f1-score   support

   Bulk Purchase of Supplements       1.00      1.00      1.00       189
     Hidden Documents in Locker       1.00      1.00      1.00       183
Package Exchange at Competition       1.00      1.00      1.00       212
           Private Lab Sessions       1.00      1.00      1.00       196
    Rapid Recovery After Injury       1.00      1.00      1.00       195
     Strange Deliveries at Home       1.00      1.00      1.00       204
  Supplier Name in Conversation       1.00      1.00      1.00       195
     Suspicious Behavior at Gym       1.00      1.00      1.00       219
    Unofficial Training Regimen       1.00      1.00      1.00       189
      Unusual Medical Treatment       1.00      1.00      1.00       218

                       accuracy                           1.00      2000
                      macro avg       1.00      1.00      1.00      2000
                   weighted avg       1.00      1

In [21]:
import os

# Persist only the learned weights (the state_dict), not the model object.
# torch.save does not create missing parent directories, so make sure the
# target folder exists first; otherwise the save fails after training is done.
model_path = "model/fine_tuned_model.pth"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)