In [None]:
!pip install dataset evaluate scikit-learn
!pip install transformers[torch]

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset, DatasetDict
from sklearn.metrics import roc_auc_score

import torch.nn as nn
import pandas as pd
import numpy as np
import evaluate
import torch

Loading the Dataset

In [6]:
# Read both splits from CSV files located in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train_cleaned_v2.csv", "test_cleaned_v2.csv")
)

Creating Labels

In [7]:
# Distinct sub-category names from the training split, with surrounding
# whitespace removed from each name.
labels = [name.strip() for name in train['sub_category'].unique().tolist()]
NUM_LABELS = len(labels)

# Two-way lookup between integer class ids and label names.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}
id2label

{0: 'Cyber Bullying  Stalking  Sexting',
 1: 'Fraud CallVishing',
 2: 'Online Gambling  Betting',
 3: 'Online Job Fraud',
 4: 'UPI Related Frauds',
 5: 'Internet Banking Related Fraud',
 6: 'Other',
 7: 'Profile Hacking Identity Theft',
 8: 'DebitCredit Card FraudSim Swap Fraud',
 9: 'EWallet Related Fraud',
 10: 'Data Breach/Theft',
 11: 'Denial of Service (DoS)/Distributed Denial of Service (DDOS) attacks',
 12: 'FakeImpersonating Profile',
 13: 'Cryptocurrency Fraud',
 14: 'Malware Attack',
 15: 'Business Email CompromiseEmail Takeover',
 16: 'Email Hacking',
 17: 'Cheating by Impersonation',
 18: 'Hacking/Defacement',
 19: 'Unauthorised AccessData Breach',
 20: 'SQL Injection',
 21: 'Provocative Speech for unlawful acts',
 22: 'Ransomware Attack',
 23: 'Cyber Terrorism',
 24: 'Tampering with computer source documents',
 25: 'DematDepository Fraud',
 26: 'Online Trafficking',
 27: 'Online Matrimonial Fraud',
 28: 'Website DefacementHacking',
 29: 'Damage to computer computer systems

In [5]:
# Checkpoint identifier used for both the tokenizer and the classifier weights.
model_path = "google-bert/bert-base-uncased"

# Label count and both label maps are forwarded to the classification model.
classifier_kwargs = dict(
    num_labels=NUM_LABELS,
    id2label=id2label,
    label2id=label2id,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, **classifier_kwargs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model Configuration

In [6]:
# Single pass over the base model: a parameter stays trainable only if its name
# contains "pooler"; every other base-model parameter is frozen.
# Parameters outside `model.base_model` are not touched by this loop.
for param_name, weight in model.base_model.named_parameters():
    weight.requires_grad = "pooler" in param_name

In [None]:
# The tokenizer needs plain strings. Missing text becomes an empty string instead
# of the literal word "nan" that astype(str) alone would produce.
train['crimeaditionalinfo'] = train['crimeaditionalinfo'].fillna('').astype(str)
test['crimeaditionalinfo'] = test['crimeaditionalinfo'].fillna('').astype(str)

# Reuse the mapping that was passed to the model so the integer ids used for
# training always agree with id2label. Its keys are whitespace-stripped label
# names, so the raw column values are stripped the same way before the lookup.
label_to_id = label2id


def _encode_labels(sub_categories):
    """Map raw sub-category values to class ids; unknown or missing values become NaN."""
    cleaned = sub_categories.map(
        lambda value: value.strip() if isinstance(value, str) else value
    )
    return cleaned.map(label_to_id)


def _to_labeled_dataset(frame, split_name):
    """Build a Dataset of (text, integer label) from the rows that have a known label.

    Rows whose label could not be mapped (sub-category missing, or not present in
    the training label set) are reported and left out: a NaN label turns the whole
    column into floats and is not a valid class id.
    """
    has_label = frame['label'].notna()
    n_dropped = int((~has_label).sum())
    if n_dropped:
        print(f"{split_name}: dropping {n_dropped} rows with a missing or unseen sub_category")
    usable = frame.loc[has_label, ['crimeaditionalinfo', 'label']].astype({'label': 'int64'})
    # preserve_index=False keeps the filtered pandas index from becoming an extra column.
    return Dataset.from_pandas(usable, preserve_index=False)


train['label'] = _encode_labels(train['sub_category'])
test['label'] = _encode_labels(test['sub_category'])

train_dataset = _to_labeled_dataset(train, "train")
test_dataset = _to_labeled_dataset(test, "test")

dataset_dict = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

def preprocess_function(examples):
    """Tokenize the complaint text of a batch with truncation enabled.

    Args:
        examples: mapping whose "crimeaditionalinfo" entry holds the text(s).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    complaint_texts = examples["crimeaditionalinfo"]
    encoded = tokenizer(complaint_texts, truncation=True)
    return encoded

# Collator built from the same tokenizer that is used for preprocessing.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply preprocess_function to every split, one batch of examples per call.
tokenized_data = dataset_dict.map(
    preprocess_function,
    batched=True,
)


Map:   0%|          | 0/85892 [00:00<?, ? examples/s]

Map:   0%|          | 0/28619 [00:00<?, ? examples/s]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1 for one evaluation.

    Args:
        pred: object exposing ``label_ids`` (true class ids) and ``predictions``
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    scores = pred.predictions
    # Some models return extra outputs next to the scores; the scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]

    labels = pred.label_ids
    preds = scores.argmax(-1)

    # zero_division=0 gives the same values as the default, but without a warning
    # on every evaluation for classes that are never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [None]:
# Hyperparameters: learning rate, per-device batch size, number of epochs.
lr, batch_size, num_epochs = 2e-4, 32, 32

training_args = TrainingArguments(
    output_dir="bert-cyberguard-classifier-V2",
    # optimisation
    learning_rate=lr,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # logging, evaluation and checkpointing all use the "epoch" schedule
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
# Training uses the "train" split; evaluation uses the "test" split.
train_split = tokenized_data["train"]
eval_split = tokenized_data["test"]

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)
trainer.train()