In [1]:
# pip install --upgrade nltk

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk import ngrams
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import Trainer

In [69]:
import torch
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import numpy as np

In [43]:
train = pd.read_csv("/kaggle/input/cyber-crime/train.csv")
test = pd.read_csv("/kaggle/input/cyber-crime/test.csv")

In [44]:
df_train = train.copy()
df_train = df_train.dropna(subset=["crimeaditionalinfo"])

In [45]:
df_test = test.copy()
df_test = df_test.dropna(subset=['crimeaditionalinfo'])

In [46]:
# Ensure you have downloaded NLTK's punkt tokenizer
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [47]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [48]:
def pre_clean_text(text):
    text = text.lower()  # Lowercase conversion
    text = re.sub(r'\b[A-Z]{2,4}\b', ' ', text)  # Remove abbreviations identified earlier
    return text

df_train['cleaned_text'] = df_train['crimeaditionalinfo'].apply(pre_clean_text)
df_test['cleaned_text'] = df_test['crimeaditionalinfo'].apply(pre_clean_text)

In [49]:
abbreviation_dict = {
    "atm": "automated teller machine",
    "id": "identification",
    "dr": "doctor"
}

In [50]:
def clean_text(text):
    # Standardize abbreviations
    for abbr, full_form in abbreviation_dict.items():
        text = re.sub(r'\b' + re.escape(abbr) + r'\b', full_form, text)
    # Remove HTML tags, punctuation, and excessive whitespace
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
df_train['cleaned_text'] = df_train['cleaned_text'].apply(clean_text)
df_test['cleaned_text'] = df_test['cleaned_text'].apply(clean_text)

In [51]:
def normalize_tokenize(text):
    # Convert to lowercase, remove stop words
    words = [word for word in nltk.word_tokenize(text.lower()) if word.isalpha() and word not in stop_words]
    return words

df_train['tokens'] = df_train['cleaned_text'].apply(normalize_tokenize)
df_test['tokens'] = df_test['cleaned_text'].apply(normalize_tokenize)

In [52]:
df_train['lemmatized_tokens'] = df_train['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])
df_test['lemmatized_tokens'] = df_test['tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

In [53]:
custom_dictionary = {
    "fraud": ["scam", "deceit", "trickery"],
    "theft": ["stealing", "robbery", "burglary"]
}

In [54]:
def tag_domain_terms(tokens, custom_dict):
    tagged_terms = []
    for word in tokens:
        for key, synonyms in custom_dict.items():
            if word in synonyms:
                tagged_terms.append(key)  # Replace synonym with standard term
            else:
                tagged_terms.append(word)
    return tagged_terms

df_train['tagged_terms'] = df_train['lemmatized_tokens'].apply(lambda x: tag_domain_terms(x, custom_dictionary))
df_test['tagged_terms'] = df_test['lemmatized_tokens'].apply(lambda x: tag_domain_terms(x, custom_dictionary))

In [55]:
df_test['text'] = df_test['lemmatized_tokens'].apply(lambda x: ' '.join(x))
df_test['labels'] = list(zip(df_test['category'], df_test['sub_category']))

In [56]:
df_train['text'] = df_train['lemmatized_tokens'].apply(lambda x: ' '.join(x))  # Join tokens into a single string
df_train['labels'] = list(zip(df_train['category'], df_train['sub_category']))

# Encode Labels
le_category = LabelEncoder()
le_sub_category = LabelEncoder()
df_train['category'] = le_category.fit_transform(df_train['category'])
df_train['sub_category'] = le_sub_category.fit_transform(df_train['sub_category'])

# Train-Validation Split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_train['text'].tolist(),
    df_train[['category', 'sub_category']].values.tolist(),
    test_size=0.2,
    random_state=42
)

In [18]:
# Load Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# 2. Custom Dataset Class for Multi-Output Labels
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.long)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels
        }

# Prepare datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

# 3. Define Custom BERT Model for Multi-Output Classification
class MultiOutputBERT(nn.Module):
    def __init__(self, num_labels1, num_labels2):
        super(MultiOutputBERT, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.drop = nn.Dropout(p=0.2)  # Lower dropout rate for better information retention
        self.fc_category = nn.Linear(self.bert.config.hidden_size, num_labels1)
        self.fc_sub_category = nn.Linear(self.bert.config.hidden_size, num_labels2)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = self.drop(outputs.pooler_output)
        category_logits = self.fc_category(pooled_output)
        sub_category_logits = self.fc_sub_category(pooled_output)

        # Calculate loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss_category = loss_fct(category_logits, labels[:, 0])
            loss_sub_category = loss_fct(sub_category_logits, labels[:, 1])
            loss = loss_category + loss_sub_category
        return (loss, category_logits, sub_category_logits)

# 4. Initialize Model
num_categories = df_train['category'].nunique()
num_sub_categories = df_train['sub_category'].nunique()
model = MultiOutputBERT(num_labels1=num_categories, num_labels2=num_sub_categories)

# 5. Training Arguments with Optimizations for Space, Speed, and Accuracy
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increased for better training
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.05,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',  # Smoother learning rate decay
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",  # Match save strategy to evaluation strategy
    fp16=True,  # Mixed precision for faster computation
    gradient_accumulation_steps=4,  # Accumulate to increase batch size effectively
    save_total_limit=1,  # Keep only the latest checkpoint to save space
    load_best_model_at_end=True  # Load best model after early stopping
)

# 6. Trainer Setup with Early Stopping
class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs, labels=labels)
        loss = outputs[0]
        return (loss, outputs) if return_outputs else loss

trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # Stop early if no improvement
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss
0,2.1852,2.081829
1,1.9788,2.006339
2,1.7959,2.003662
4,1.5196,2.126788


TrainOutput(global_step=11705, training_loss=1.9244757525587834, metrics={'train_runtime': 6372.6248, 'train_samples_per_second': 58.792, 'train_steps_per_second': 1.837, 'total_flos': 0.0, 'train_loss': 1.9244757525587834, 'epoch': 4.998398633500587})

In [20]:
eval_results = trainer.evaluate()
print(f"Validation Loss: {eval_results['eval_loss']}")

Validation Loss: 2.003661632537842


In [23]:
torch.save(model.state_dict(), "multi_output_bert_state_dict.pth")

# Save the tokenizer using Hugging Face's save_pretrained method
tokenizer.save_pretrained("best_model")

('best_model/tokenizer_config.json',
 'best_model/special_tokens_map.json',
 'best_model/vocab.txt',
 'best_model/added_tokens.json')

In [24]:
predictions = trainer.predict(val_dataset)
pred_logits = predictions.predictions

# Step 2: Separate logits for category and sub_category
category_logits = pred_logits[0]  # First set of logits for category
sub_category_logits = pred_logits[1]  # Second set of logits for sub_category

# Step 3: Get the predicted labels by taking the argmax of logits
category_preds = category_logits.argmax(axis=1)
sub_category_preds = sub_category_logits.argmax(axis=1)

In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(category_preds, [val_label[0] for val_label in val_labels])

0.7710457481449848

In [28]:
accuracy_score(sub_category_preds, [val_label[1] for val_label in val_labels])

0.5697966155981423

In [57]:
train_category_decoded = le_category.inverse_transform(df_train['category'])
train_category_decoded

array(['Online and Social Media Related Crime', 'Online Financial Fraud',
       'Online Gambling  Betting', ..., 'Online Financial Fraud',
       'Online and Social Media Related Crime', 'Online Financial Fraud'],
      dtype=object)

In [58]:
train_category_decoded = list(train_category_decoded)
train_category_decoded = list(set(train_category_decoded))

In [59]:
df_test = df_test[df_test['category'].isin(train_category_decoded)]
df_test['category'] = le_category.transform(df_test['category'])
df_test['sub_category'] = le_sub_category.transform(df_test['sub_category'])

# Train-Validation Split
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     df_train['text'].tolist(),
#     df_train[['category', 'sub_category']].values.tolist(),
#     test_size=0.2,
#     random_state=42
# )

In [60]:
test_texts = df_test['text'].tolist()
test_labels = df_test[['category', 'sub_category']].values.tolist()

In [61]:
test_dataset = CustomDataset(test_texts, test_labels, tokenizer)

In [62]:
predictions = trainer.predict(test_dataset)
pred_logits = predictions.predictions

# Step 2: Separate logits for category and sub_category
category_logits = pred_logits[0]  # First set of logits for category
sub_category_logits = pred_logits[1]  # Second set of logits for sub_category

# Step 3: Get the predicted labels by taking the argmax of logits
category_preds = category_logits.argmax(axis=1)
sub_category_preds = sub_category_logits.argmax(axis=1)

In [63]:
accuracy_score(category_preds, [test_label[0] for test_label in test_labels])

0.7627330386315587

In [65]:
accuracy_score(sub_category_preds, [test_label[1] for test_label in test_labels])

0.5674610801460696