In [2]:
import numpy as np
import pandas as pd

training_data = pd.read_csv("/Users/chaitanyasuma/Downloads/train.csv")

In [3]:
!pip install torch
!pip install transformers

Collecting torch
  Obtaining dependency information for torch from https://files.pythonhosted.org/packages/5b/46/3def5bdaae03c21a7662673e6bda1f60a046afce48e0d6319ce4542bca31/torch-2.1.1-cp311-none-macosx_11_0_arm64.whl.metadata
  Downloading torch-2.1.1-cp311-none-macosx_11_0_arm64.whl.metadata (25 kB)
Downloading torch-2.1.1-cp311-none-macosx_11_0_arm64.whl (59.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: torch
Successfully installed torch-2.1.1


In [4]:
from huggingface_hub import notebook_login

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [6]:
training_data_hf = pd.read_csv("/Users/chaitanyasuma/Downloads/train.csv")
training_data_hf.head()

test_data_hf = pd.read_csv("/Users/chaitanyasuma/Downloads/test.csv")

In [7]:
texts = training_data_hf['text']
labels = training_data_hf['target']

test_texts = test_data_hf['text']

print(type(texts))

<class 'pandas.core.series.Series'>


In [8]:
texts_list = texts.tolist()
labels_list = labels.tolist()
test_texts_list = test_texts.tolist()
# print(texts_list)

In [9]:
train_texts, test_texts, train_labels, test_labels = train_test_split(texts_list, labels_list, test_size=0.2, random_state=42)

In [10]:
# Tokenize the texts using DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [11]:
# Convert labels to PyTorch tensors
train_labels = torch.tensor(train_labels)
test_labels = torch.tensor(test_labels)

print(train_labels.size)

<built-in method size of Tensor object at 0x11d52c950>


In [12]:
train_dataset = TensorDataset(torch.tensor(train_encodings.input_ids),
                              torch.tensor(train_encodings.attention_mask),
                              train_labels)


In [13]:
test_dataset = TensorDataset(torch.tensor(test_encodings.input_ids),
                             torch.tensor(test_encodings.attention_mask),
                             test_labels)


In [14]:
# Load pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [16]:
# Set up optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = torch.nn.CrossEntropyLoss()

In [17]:
# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [18]:
# Evaluation
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

In [19]:
# Print classification report
print(classification_report(test_labels.numpy(), predictions))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       874
           1       0.78      0.78      0.78       649

    accuracy                           0.81      1523
   macro avg       0.81      0.81      0.81      1523
weighted avg       0.81      0.81      0.81      1523



In [20]:
# Optimizer and learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

# Training loop with early stopping
best_validation_loss = float('inf')
early_stopping_counter = 0
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    validation_loss = 0.0
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            loss = criterion(logits, labels)
            validation_loss += loss.item()

    validation_loss /= len(test_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Validation Loss: {validation_loss}')

    # Check for early stopping
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= 3:
            print("Early stopping triggered.")
            break

    # Adjust learning rate
    scheduler.step()

Epoch 1/3, Validation Loss: 0.5206262663601658
Epoch 2/3, Validation Loss: 0.656284960844988
Epoch 3/3, Validation Loss: 1.0940156853306766


In [21]:
# Evaluation
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())

In [22]:
# Print classification report
print(classification_report(test_labels.numpy(), predictions))

              precision    recall  f1-score   support

           0       0.87      0.69      0.77       874
           1       0.67      0.86      0.76       649

    accuracy                           0.76      1523
   macro avg       0.77      0.78      0.76      1523
weighted avg       0.79      0.76      0.76      1523

