In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch, pandas as pd
from server.lib.constants import ModelConfig

# Tokenizer for the checkpoint named in ModelConfig
tokenizer = AutoTokenizer.from_pretrained(ModelConfig.name)

# Sequence-classification model with one output per class; the classifier
# head is freshly initialised (see the warning in the cell output), so the
# model has to be fine-tuned before it is useful.
model = AutoModelForSequenceClassification.from_pretrained(
    ModelConfig.name,
    num_labels=ModelConfig.num_classes,
)
model = model.to(ModelConfig.device)

# Preprocessed samples: a `keywords` string and an integer `label` per row.
# NOTE(review): the displayed frame also has an `Unnamed: 0` column, which
# looks like an index written out by an earlier `to_csv` - confirm, and
# consider `index_col=0` if it is not needed.
df = pd.read_csv('../data/preprocessed.csv')
df

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,keywords,label
0,official site good hotel accommodation big sav...,9
1,expedia hotel book sites like use vacation wor...,9
2,tripadvisor hotel book sites like previously d...,9
3,cheap flights search compare flights momondo f...,9
4,bot create free account create free account si...,9
...,...,...
980,mobile fun share perangkat lunak mobile gratis...,0
981,alternativepedia discover free open source wel...,0
982,japanese female facial expression jaffe datase...,0
983,error error request url find server know,0


In [2]:
from torch.utils.data import Dataset, DataLoader, random_split
from server.models.website_clf import encode_keywords

class WebsiteDataset(Dataset):
    """Map-style dataset over a DataFrame of website keywords and labels.

    Each item is a dict made of whatever ``encode_keywords`` returns for the
    row's ``keywords`` value, plus a ``labels`` entry holding the row's
    ``label`` as a ``torch.long`` scalar tensor.

    Args:
        df: Frame with at least a ``keywords`` and a ``label`` column.
        tokenizer: Tokenizer forwarded to ``encode_keywords`` for every item.
    """

    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer) -> None:
        self.df = df
        self.tokenizer = tokenizer

    def __len__(self) -> int:
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        """Encode the row at position ``idx`` (positional, not index label)."""
        # Use the tokenizer this dataset was constructed with. The previous
        # code read the notebook-global `tokenizer`, which ignored the
        # constructor argument and broke when no such global existed.
        inputs = encode_keywords(self.df['keywords'].iloc[idx], self.tokenizer)
        # Label as a single integer tensor (not a list / one-hot vector)
        label_tensor = torch.tensor(self.df['label'].iloc[idx], dtype=torch.long)
        # `inputs` is unpacked last, so any `labels` key it carried would win.
        return {'labels': label_tensor, **inputs}


# Wrap the full frame; the train/test split happens on this dataset later.
dataset = WebsiteDataset(df=df, tokenizer=tokenizer)

In [3]:
# Split dataset into training and testing sets.
# The split sizes follow ModelConfig.train_ratio; the remainder goes to test
# so the two parts always add up to len(dataset).
train_size = int(len(dataset) * ModelConfig.train_ratio)
test_size = len(dataset) - train_size

# Seed the split explicitly: without a generator, random_split draws from the
# global RNG and a kernel restart produces a different train/test partition,
# which makes the reported losses impossible to reproduce.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

# Create training and testing dataloaders
train_loader = DataLoader(train_dataset, batch_size=ModelConfig.batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=ModelConfig.batch_size, shuffle=False)

print(f'Training samples: {len(train_dataset)}')
print(f'Testing samples: {len(test_dataset)}')

Training samples: 788
Testing samples: 197


In [None]:
from transformers import Trainer

# Fine-tune with the Hugging Face Trainer; all hyperparameters come from
# ModelConfig.training_args, and the held-out split is used for evaluation.
trainer = Trainer(
    model=model,
    args=ModelConfig.training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

# `.half()` casts the weights to fp16 in place and returns the same module,
# so `trainer.model` stays in half precision after this cell has run.
fp16_model = trainer.model.half()
fp16_model.save_pretrained("website_clf_fp16")

Epoch,Training Loss,Validation Loss
1,1.9546,1.876393
2,1.4599,1.447211
3,1.0716,1.121188
4,0.7905,0.889733
5,0.4911,0.726119
6,0.298,0.628338
7,0.2089,0.552241
8,0.1669,0.577879
9,0.1605,0.556925
10,0.165,0.558338
