 ## BERT Model ## 

In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch
import concurrent.futures

# Function to load the tokenizer with a timeout
def load_tokenizer_with_timeout(model_name, timeout=30):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(BertTokenizer.from_pretrained, model_name)
        try:
            tokenizer = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            print("Timeout while loading tokenizer. Skipping.")
            return None
        except Exception as e:
            print(f"Error while loading tokenizer: {e}")
            return None
    return tokenizer

# Function to load the model with a timeout
def load_model_with_timeout(model_name, num_labels, timeout=30):
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future = executor.submit(BertForSequenceClassification.from_pretrained, model_name, num_labels=num_labels)
        try:
            model = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError:
            print("Timeout while loading model. Skipping.")
            return None
        except Exception as e:
            print(f"Error while loading model: {e}")
            return None
    return model

# Load the dataset
data = pd.read_csv('Police_Department_Incidents_-_Previous_Year__2016_.csv')

# Data Preprocessing
data.dropna(subset=['PdDistrict'], inplace=True)  # Drop rows with missing PdDistrict
data = data[data['Category'].notna()]  # Drop rows with missing Category


data = data.sample(1000)  

# Load tokenizer and model with timeout
tokenizer = load_tokenizer_with_timeout('bert-base-uncased')
if tokenizer is None:
    raise RuntimeError("Tokenizer could not be loaded. Exiting.")

# Prepare labels
label_to_id = {label: i for i, label in enumerate(data['Category'].unique())}
data['label_id'] = data['Category'].map(label_to_id)

# Prepare text data for BERT
def encode_data(texts, labels, max_length=64):  # Reduced max_length for faster processing
    encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    labels = torch.tensor(labels.tolist())
    return encodings, labels

# Encode the dataset
X_train, X_test, y_train, y_test = train_test_split(data['Descript'], data['label_id'], test_size=0.2, random_state=42)
train_encodings, train_labels = encode_data(X_train, y_train)
test_encodings, test_labels = encode_data(X_test, y_test)

# Load the model
model = load_model_with_timeout('bert-base-uncased', num_labels=len(label_to_id))
if model is None:
    raise RuntimeError("Model could not be loaded. Exiting.")

# Define a dataset class
class CrimeDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CrimeDataset(train_encodings, train_labels)
test_dataset = CrimeDataset(test_encodings, test_labels)

# Training
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,  # Reduce the number of epochs
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save model at the end of each epoch
    save_total_limit=2,  # Save only the last two checkpoints
    load_best_model_at_end=True,  # Load the best model when finished training
    fp16=False,  # Disable mixed precision for CPU compatibility
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

try:
    trainer.train()
except KeyboardInterrupt:
    print("Training interrupted. Saving partial model...")
    trainer.save_model("interrupted_model_save")
    tokenizer.save_pretrained("interrupted_tokenizer_save")

# Save the model and tokenizer
model_path = 'bert_crime_model.bin'
tokenizer_path = 'bert_crime_tokenizer'

model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)

# Prediction and Evaluation
preds = trainer.predict(test_dataset)
y_pred = torch.argmax(torch.tensor(preds.predictions), axis=1)

# Print classification report
from sklearn.metrics import classification_report
print("BERT Classification Report:")
print(classification_report(y_test, y_pred))

  from .autonotebook import tqdm as notebook_tqdm





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

{'loss': 3.3888, 'learning_rate': 2.5e-06, 'epoch': 0.05}


 10%|█         | 20/200 [00:32<05:02,  1.68s/it]

{'loss': 3.3985, 'learning_rate': 5e-06, 'epoch': 0.1}


 15%|█▌        | 30/200 [00:45<03:55,  1.38s/it]

{'loss': 3.2575, 'learning_rate': 7.5e-06, 'epoch': 0.15}


 20%|██        | 40/200 [00:59<03:45,  1.41s/it]

{'loss': 3.2603, 'learning_rate': 1e-05, 'epoch': 0.2}


 25%|██▌       | 50/200 [01:13<03:45,  1.50s/it]

{'loss': 3.1124, 'learning_rate': 1.25e-05, 'epoch': 0.25}


 30%|███       | 60/200 [01:31<03:52,  1.66s/it]

{'loss': 2.9834, 'learning_rate': 1.5e-05, 'epoch': 0.3}


 35%|███▌      | 70/200 [01:48<03:36,  1.67s/it]

{'loss': 2.5583, 'learning_rate': 1.75e-05, 'epoch': 0.35}


 40%|████      | 80/200 [02:04<03:08,  1.57s/it]

{'loss': 2.7037, 'learning_rate': 2e-05, 'epoch': 0.4}


 45%|████▌     | 90/200 [02:19<02:48,  1.53s/it]

{'loss': 2.493, 'learning_rate': 2.25e-05, 'epoch': 0.45}


 50%|█████     | 100/200 [02:34<02:29,  1.49s/it]

{'loss': 2.1608, 'learning_rate': 2.5e-05, 'epoch': 0.5}


 55%|█████▌    | 110/200 [02:49<02:13,  1.48s/it]

{'loss': 2.3393, 'learning_rate': 2.7500000000000004e-05, 'epoch': 0.55}


 60%|██████    | 120/200 [03:04<01:57,  1.47s/it]

{'loss': 1.7075, 'learning_rate': 3e-05, 'epoch': 0.6}


 65%|██████▌   | 130/200 [03:20<01:47,  1.53s/it]

{'loss': 1.7962, 'learning_rate': 3.2500000000000004e-05, 'epoch': 0.65}


 70%|███████   | 140/200 [03:36<01:35,  1.58s/it]

{'loss': 1.4009, 'learning_rate': 3.5e-05, 'epoch': 0.7}


 75%|███████▌  | 150/200 [03:50<01:07,  1.34s/it]

{'loss': 1.8175, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.75}


 80%|████████  | 160/200 [04:02<00:46,  1.15s/it]

{'loss': 1.5389, 'learning_rate': 4e-05, 'epoch': 0.8}


 85%|████████▌ | 170/200 [04:13<00:33,  1.13s/it]

{'loss': 1.275, 'learning_rate': 4.25e-05, 'epoch': 0.85}


 90%|█████████ | 180/200 [04:25<00:22,  1.12s/it]

{'loss': 1.4409, 'learning_rate': 4.5e-05, 'epoch': 0.9}


 95%|█████████▌| 190/200 [04:36<00:11,  1.13s/it]

{'loss': 1.3328, 'learning_rate': 4.75e-05, 'epoch': 0.95}


100%|██████████| 200/200 [04:50<00:00,  1.51s/it]***** Running Evaluation *****
  Num examples = 200
  Batch size = 4


{'loss': 0.9774, 'learning_rate': 0.0, 'epoch': 1.0}


                                                 
100%|██████████| 200/200 [04:56<00:00,  1.51s/it]Saving model checkpoint to ./results\checkpoint-200
Configuration saved in ./results\checkpoint-200\config.json


{'eval_loss': 0.8584941625595093, 'eval_runtime': 6.5193, 'eval_samples_per_second': 30.678, 'eval_steps_per_second': 7.67, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-200\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-200 (score: 0.8584941625595093).
100%|██████████| 200/200 [05:02<00:00,  1.51s/it]
Configuration saved in bert_crime_model.bin\config.json


{'train_runtime': 302.2254, 'train_samples_per_second': 2.647, 'train_steps_per_second': 0.662, 'train_loss': 2.247146849632263, 'epoch': 1.0}


Model weights saved in bert_crime_model.bin\pytorch_model.bin
tokenizer config file saved in bert_crime_tokenizer\tokenizer_config.json
Special tokens file saved in bert_crime_tokenizer\special_tokens_map.json
***** Running Prediction *****
  Num examples = 200
  Batch size = 4
100%|██████████| 50/50 [00:06<00:00,  7.47it/s]

BERT Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       0.98      1.00      0.99        57
           3       0.20      1.00      0.33         1
           4       0.79      1.00      0.88        11
           5       0.85      1.00      0.92        11
           6       0.73      0.79      0.76        24
           7       0.00      0.00      0.00         3
           8       1.00      1.00      1.00        26
           9       0.00      0.00      0.00         1
          10       0.69      1.00      0.82        27
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00         2
          13       0.00      0.00      0.00         4
          14       1.00      1.00      1.00         4
          15       1.00      1.00      1.00         1
          16       0.78      0.78      0.78         9
          17       0.00      0.00      0.00         1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
