In [None]:
pip install transformers datasets accelerate torch


Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [None]:
from datasets import load_dataset

# Parse the local CSV with the generic "csv" builder. No split is requested,
# so the result is a DatasetDict whose only split is "train".
dataset = load_dataset(
    path="csv",
    data_files="crime_dataset_india.csv",
)
print(dataset)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Report Number', 'Date Reported', 'Date of Occurrence', 'Time of Occurrence', 'City', 'Crime Code', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed', 'Date Case Closed'],
        num_rows: 40160
    })
})


In [None]:
# Count the distinct values of the column the model is actually trained on.
# Later cells rename "Crime Code" to "labels", so accept either name; that
# keeps this cell runnable both before and after the rename.
train_columns = dataset["train"].column_names
label_column = "Crime Code" if "Crime Code" in train_columns else "labels"

unique_labels = set(dataset["train"][label_column])
num_labels = len(unique_labels)

print(f"Number of unique labels: {num_labels}")


Number of unique labels: 4


In [None]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize(batch):
    """Tokenize the "Crime Description" texts of one mapped batch.

    Truncation is enabled and padding is applied per call, i.e. to the longest
    text inside ``batch`` (not across the whole dataset). The tokenizer output
    is returned as-is so that ``map`` can add it to the dataset as new columns.
    """
    descriptions = batch["Crime Description"]
    encoded = tokenizer(descriptions, truncation=True, padding=True)
    return encoded


# batched=True hands `tokenize` many rows at once instead of one row per call.
dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/40160 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForSequenceClassification

# Same checkpoint name as the one the tokenizer cell loads.
model_name = "distilbert-base-uncased"

# Size of the classification head. This hard-coded value replaces whatever
# `num_labels` an earlier cell computed, and every label id fed to the model
# must be smaller than it.
num_labels = 500

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Expose the "Crime Code" column under the name "labels".
# The rename is guarded so that re-running this cell (when "labels" already
# exists) is a no-op instead of an error from rename_column.
train_columns = dataset["train"].column_names
if "labels" in train_columns:
    pass  # already renamed by a previous run of this cell
elif "Crime Code" in train_columns:
    dataset = dataset.rename_column("Crime Code", "labels")
else:
    raise ValueError("Neither 'Crime Code' nor 'labels' column found in the dataset. Please check your dataset.")

In [None]:
def _shift_label(example):
    """Return the example's "labels" value lowered by 100."""
    return {"labels": example["labels"] - 100}


# WARNING: not idempotent. The result is written back to "labels", so every
# additional run of this cell subtracts another 100.
dataset = dataset.map(_shift_label)


Map:   0%|          | 0/40160 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

# All hyper-parameters collected in one mapping, grouped by purpose, and then
# expanded into TrainingArguments.
training_config = {
    # --- outputs and logging ---
    "output_dir": "./crime_classification_results",
    "logging_dir": "./logs",
    "logging_steps": 500,
    "report_to": "none",  # no external experiment tracker
    # --- evaluation / checkpoint schedule ---
    # Evaluation and saving share the per-epoch schedule; the best checkpoint
    # (by "accuracy") is reloaded once training finishes.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    # --- optimisation ---
    "learning_rate": 3e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
}

training_args = TrainingArguments(**training_config)


In [None]:
from transformers import Trainer, TrainingArguments
import numpy as np
import evaluate
from datasets import DatasetDict

# Accuracy scorer from the `evaluate` library, loaded once and reused by
# every call to compute_metrics.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the arg-max over the last axis of the logits; those
    predictions are compared with the labels by the accuracy scorer, and its
    result is returned unchanged.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(references=references, predictions=predicted_ids)

# Imported here, next to its only use, so this block is self-contained.
from transformers import DataCollatorWithPadding

# Fixed seed so the 80/20 split, and therefore the reported metrics, are the
# same after every kernel restart.
SPLIT_SEED = 42

# Create a held-out "test" split when the dataset does not already have one.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Normalise to a DatasetDict so the code below can index by split name.
dataset = DatasetDict(dataset)

# Make sure the target column is called "labels". An existing "labels" column
# wins, so a notebook state that still carries "Crime Code" next to "labels"
# does not fail on the rename.
train_columns = dataset["train"].column_names
if "labels" in train_columns:
    pass  # already prepared by an earlier cell
elif "Crime Code" in train_columns:
    dataset = dataset.rename_column("Crime Code", "labels")
else:
    raise ValueError("Neither 'Crime Code' nor 'labels' column found in the dataset. Please check your dataset.")

# Ensure labels are integers (required for classification).
dataset = dataset.map(lambda x: {"labels": int(x["labels"])}, num_proc=4)

# Fail early, with a readable message, when a label id does not fit the
# model's classification head (for example after the label-shifting cell was
# run more than once).
head_size = model.config.num_labels
for split_name, split in dataset.items():
    if len(split) == 0:
        continue
    split_labels = split["labels"]
    lowest, highest = min(split_labels), max(split_labels)
    if lowest < 0 or highest >= head_size:
        raise ValueError(
            f"Labels in split '{split_name}' span [{lowest}, {highest}] but the model "
            f"expects ids in [0, {head_size - 1}]. Check the label preparation cells."
        )

# Show the final structure that is handed to the Trainer.
print(dataset)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    # Pad every batch to its own longest sequence. Without this the Trainer
    # only works if all rows already share one length, which the per-batch
    # padding done at tokenisation time does not guarantee.
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

# Start training.
trainer.train()

Map (num_proc=4):   0%|          | 0/32128 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/8032 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Report Number', 'Date Reported', 'Date of Occurrence', 'Time of Occurrence', 'City', 'labels', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed', 'Date Case Closed', 'input_ids', 'attention_mask'],
        num_rows: 32128
    })
    test: Dataset({
        features: ['Report Number', 'Date Reported', 'Date of Occurrence', 'Time of Occurrence', 'City', 'labels', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed', 'Date Case Closed', 'input_ids', 'attention_mask'],
        num_rows: 8032
    })
})


Epoch,Training Loss,Validation Loss,Accuracy
1,6.2165,6.216506,0.001619
2,6.2148,6.216545,0.001494


In [None]:
# Column names of every split, as a {split name: [column, ...]} mapping.
print({split_name: split.column_names for split_name, split in dataset.items()})


{'train': ['Report Number', 'Date Reported', 'Date of Occurrence', 'Time of Occurrence', 'City', 'Crime Code', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed', 'Date Case Closed', 'input_ids', 'attention_mask', 'labels'], 'test': ['Report Number', 'Date Reported', 'Date of Occurrence', 'Time of Occurrence', 'City', 'Crime Code', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed', 'Date Case Closed', 'input_ids', 'attention_mask', 'labels']}


In [None]:
# Run evaluation with the Trainer built above and print the resulting metrics.
# NOTE(review): the output recorded with this cell shows eval_loss ~6.218 and
# eval_accuracy ~0.0024. For a 500-way classification head that is chance
# level (ln(500) ~ 6.215, 1/500 = 0.002), so the model has not learned
# anything. Confirm that the chosen label column is actually predictable from
# "Crime Description" before relying on these numbers.
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 6.218129634857178, 'eval_accuracy': 0.0023655378486055778, 'eval_runtime': 109.5919, 'eval_samples_per_second': 73.29, 'eval_steps_per_second': 4.581, 'epoch': 5.0}
