In [4]:
# Importing the required libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Loading the Datasets
train_df = pd.read_csv('/content/drive/MyDrive/cleaned_data.csv')
test_df = pd.read_csv('/content/drive/MyDrive/cleaned_test.csv')

# Choosing the cleaned and preprocessed crime info and labeling them for classification
train_df['cleaned_crime_info'] = train_df['cleaned_crime_info'].astype(str).fillna("")
test_df['cleaned_crime_info'] = test_df['cleaned_crime_info'].astype(str).fillna("")

label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['category'])

train_categories = set(train_df['category'])

# Filtering the unique category from the train and test dataset
filtered_test_df = test_df[test_df['category'].isin(train_categories)]
filtered_test_df['label'] = label_encoder.transform(filtered_test_df['category'])

#Using DistilBERT tokenizer for tokenization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_data(dataframe):
    return tokenizer(
        list(dataframe['cleaned_crime_info']),
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

train_encodings = tokenize_data(train_df)
filtered_test_encodings = tokenize_data(filtered_test_df)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_df['label'].tolist())
filtered_test_dataset = CustomDataset(filtered_test_encodings, filtered_test_df['label'].tolist())

num_labels = len(label_encoder.classes_)

# Loading the DistilBert model from Transformers pipeling
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

# Adding the Training arguments for training the model
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=filtered_test_dataset,
    tokenizer=tokenizer
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_test_df['label'] = label_encoder.transform(filtered_test_df['category'])
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Training the Model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,0.6843,0.67148
2,0.6036,0.667673
3,0.5465,0.716637
4,0.3589,0.80126
5,0.3271,0.962957
6,0.2237,1.108531


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=15768, training_loss=0.4623445891097135, metrics={'train_runtime': 5869.6774, 'train_samples_per_second': 85.956, 'train_steps_per_second': 2.686, 'total_flos': 1.671245028705024e+16, 'train_loss': 0.4623445891097135, 'epoch': 6.0})

In [9]:
# Evaluating the metrics
metrics = trainer.evaluate()
print("Evaluation Metrics:", metrics)

# Saving the trained model
model.save_pretrained('./distilbert-fine-tuned')
tokenizer.save_pretrained('./distilbert-fine-tuned')
import joblib
joblib.dump(label_encoder, './distilbert-fine-tuned/label_encoder.pkl')

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Evaluation Metrics: {'eval_loss': 0.6676731109619141, 'eval_runtime': 104.5272, 'eval_samples_per_second': 268.026, 'eval_steps_per_second': 16.752, 'epoch': 6.0}


['./distilbert-fine-tuned/label_encoder.pkl']

In [8]:
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    log_loss,
)

# Predicting the Model
predictions = trainer.predict(filtered_test_dataset)
predicted_probs = predictions.predictions
predicted_labels = np.argmax(predicted_probs, axis=1)

y_test = filtered_test_df['label'].tolist()

# Printing the Accuracy of the Model
accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

precision = precision_score(y_test, predicted_labels, average='weighted')
recall = recall_score(y_test, predicted_labels, average='weighted')
f1 = f1_score(y_test, predicted_labels, average='weighted')

# Printing the Precision, Recall and F1 Score of the Model
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")

# Printing the ROC curve of the Model
try:
    auc_roc = roc_auc_score(
        np.eye(len(label_encoder.classes_))[y_test],
        predicted_probs,
        multi_class='ovr',
        average='weighted'
    )
    print(f"AUC-ROC (Weighted): {auc_roc:.4f}")
except ValueError:
    print("AUC-ROC could not be calculated (requires probabilities for all classes).")

conf_matrix = confusion_matrix(y_test, predicted_labels)

# Printing the Confusion Matrix of the Model prediction
print("Confusion Matrix:")
print(conf_matrix)

present_labels = sorted(list(set(y_test)))

filtered_predicted_probs = predicted_probs[:, present_labels]

logloss = log_loss(y_test, filtered_predicted_probs, labels=present_labels)
print(f"Log Loss: {logloss:.4f}")

unique_labels_test = np.unique(y_test)

target_names_test = label_encoder.classes_[unique_labels_test]

classification_report_output = classification_report(
    y_test,
    predicted_labels,
    target_names=target_names_test,
    labels=unique_labels_test,
    zero_division=0
)

# Print the Final Classification Report
print("Classification Report:")
print(classification_report_output)

tp = np.diag(conf_matrix)
fp = conf_matrix.sum(axis=0) - tp
fn = conf_matrix.sum(axis=1) - tp
tn = conf_matrix.sum() - (fp + fn + tp)

# Print the Breakdownn of the Confusion Matrix
print("\nConfusion Matrix Breakdown (Per Class):")
for i, label in enumerate(target_names_test):
    print(
        f"Class: {label}\n"
        f"True Positives: {tp[i]}, False Positives: {fp[i]}, "
        f"False Negatives: {fn[i]}, True Negatives: {tn[i]}"
    )

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Accuracy: 0.7690
Precision (Weighted): 0.7360
Recall (Weighted): 0.7690
F1 Score (Weighted): 0.7346
AUC-ROC could not be calculated (requires probabilities for all classes).
Confusion Matrix:
[[  693     0    41     0     0   138     0  2073     6   328     0     1
      1    10]
 [    3    45     2     0     0     2     0    18     0    25     0     3
      3    14]
 [    0     0   114     0     0     0     0    35     0     2     0     0
      0     0]
 [    0     0     0  1261     0     0     0     0     0     0     0     0
      0     0]
 [    5     0     1     0     0     3     0    28     0    10     0     0
      0     0]
 [   50     0     4     0     0   208     0   151     1    93     6     1
      0     0]
 [    3     0     0     0     0     6     0    34     0    13     0     1
      0     0]
 [  249     0    53     0     0    82     0 16964    11   248     0     0
      0     0]
 [    9     0     2     0     0     4     0    83    15     5     0     0
      0     0]
 [  165

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Log Loss: 0.3923
Classification Report:
                                                      precision    recall  f1-score   support

                               Any Other Cyber Crime       0.57      0.21      0.31      3291
Child Pornography CPChild Sexual Abuse Material CSAM       0.57      0.39      0.46       115
                                Cryptocurrency Crime       0.50      0.75      0.60       151
                      Cyber Attack/ Dependent Crimes       1.00      1.00      1.00      1261
                                     Cyber Terrorism       0.00      0.00      0.00        47
      Hacking  Damage to computercomputer system etc       0.32      0.40      0.36       514
                            Online Cyber Trafficking       0.00      0.00      0.00        57
                              Online Financial Fraud       0.82      0.96      0.89     17607
                            Online Gambling  Betting       0.42      0.13      0.19       118
               Onli