In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Checkpoint shared by the tokenizer and the model, so both are always loaded
# from the same pretrained weights/vocabulary.
MODEL_NAME = 'distilbert-base-uncased'

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

# Number of outputs of the classification head. It must be larger than the
# biggest label id fed to the model: the class mapping printed by the
# label-encoding cell lists ten categories (ids 0-9), so a head with only
# 8 outputs would reject label ids 8 and 9 when the loss is computed.
num_classes = 10  # Adjust based on your dataset

# Load model with multi-class classification. The classification head is
# newly initialized (see the warning below), so it must be fine-tuned before
# the predictions are meaningful.
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_classes)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the preprocessed samples; the steps below rely on a 'category' column.
data = pd.read_csv('../data/preprocessed.csv')

# Fit the encoder on the category names and attach the resulting integer ids
# as a 'label' column.
label_encoder = LabelEncoder()
data = data.assign(label=label_encoder.fit_transform(data['category']))

# Show which integer id was assigned to each category name.
print('Class Mapping:', {class_id: class_name for class_id, class_name in enumerate(label_encoder.classes_)})


Class Mapping: {0: 'Computers and Technology', 1: 'E-Commerce', 2: 'Education', 3: 'Food', 4: 'Games', 5: 'Health and Fitness', 6: 'News', 7: 'Social Networking and Messaging', 8: 'Sports', 9: 'Travel'}


In [None]:
from torch.utils.data import Dataset

class WebsiteDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Texts are tokenized lazily, one item at a time, when they are indexed.
    Each item is a dict with the keys 'input_ids', 'attention_mask' and
    'labels'.
    """

    def __init__(self, texts: list[str], labels: list[int], tokenizer: DistilBertTokenizer, max_length: int = 64):
        """Store the samples and the tokenization settings.

        Args:
            texts: Raw input strings, one per sample.
            labels: Integer class ids aligned with ``texts``; each one is
                converted to a ``torch.long`` tensor in ``__getitem__``.
            tokenizer: Tokenizer called on a single text per item.
            max_length: Length passed to the tokenizer, which is asked to pad
                to and truncate at this value.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of training, or silently leave extra labels unused.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} texts and {len(labels)} labels"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        """Tokenize the sample at ``idx`` and return it with its label.

        Returns:
            A dict holding 'input_ids' and 'attention_mask' (the tokenizer
            outputs with dimension 0 squeezed away) and 'labels' (a scalar
            ``torch.long`` tensor).
        """
        # Tokenize text
        inputs = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )

        # Convert label to tensor (single integer, not a list)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)

        # squeeze(0) drops the leading size-1 axis of the single-text encoding
        # so that each item is an unbatched sample.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': label_tensor
        }

# Build the training dataset from the text column and the encoded label column.
dataset = WebsiteDataset(
    texts=data['text'].tolist(),
    labels=data['label'].tolist(),
    tokenizer=tokenizer,
)

In [None]:
from transformers import Trainer, TrainingArguments
from torch.utils.data import DataLoader
from transformers import AdamW

# Training Arguments
# No evaluation-strategy keyword is passed: evaluation is disabled by default,
# and the keyword was renamed from `evaluation_strategy` to `eval_strategy`
# in newer transformers releases, so omitting it keeps the same behavior on
# both old and new versions.
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01
)

# No optimizer is built by hand here: the Trainer creates its own optimizer
# from `learning_rate` and `weight_decay` above. The standalone AdamW instance
# that used to be created in this cell was never handed to the Trainer, so it
# had no effect on training.

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Train Model
trainer.train()



Step,Training Loss


TrainOutput(global_step=10, training_loss=1.3661443710327148, metrics={'train_runtime': 0.6317, 'train_samples_per_second': 47.494, 'train_steps_per_second': 15.831, 'total_flos': 496805898240.0, 'train_loss': 1.3661443710327148, 'epoch': 10.0})