# In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import requests

class CrimeDataset(Dataset):
    """Map-style dataset that tokenizes one text sample per lookup.

    Args:
        texts: Sequence of input texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class ids, aligned positionally with
            ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face
            ``BertTokenizer``).
        max_len: ``max_length`` passed to the tokenizer for padding and
            truncation.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialize both inputs as plain lists so that __getitem__ always
        # indexes by *position*. A pandas Series indexed with ``series[i]``
        # looks up the index *label* i instead; after a shuffled train/test
        # split that either raises KeyError or silently pairs a text with
        # the wrong label.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return it with its label.

        Returns:
            dict with keys ``input_ids``, ``attention_mask``,
            ``token_type_ids`` (each the tokenizer output flattened to 1-D)
            and ``labels`` (a scalar ``torch.long`` tensor).
        """
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension; flatten it
        # away so the DataLoader can stack samples into a batch itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def _run_batch(model, batch, device):
    """Move one batch to ``device`` and run a forward pass with labels.

    Returns the model output together with the on-device labels.
    """
    labels = batch['labels'].to(device)
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        token_type_ids=batch['token_type_ids'].to(device),
        labels=labels
    )
    return outputs, labels


def train_model(model, train_loader, val_loader, device, epochs=3):
    """Fine-tune ``model`` and print per-epoch loss and validation metrics.

    Args:
        model: Callable module accepting ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` and returning an object with
            ``loss`` and ``logits`` attributes.
        train_loader: Sized iterable of batches (dicts with the four keys
            above) used for optimization.
        val_loader: Sized iterable of batches used for evaluation.
        device: Torch device the batches are moved to; the model is expected
            to already live there.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Results are printed.
    """
    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are pinned to the values the transformers implementation
    # used by default so the optimization behaves the same.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0
    )

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()

            outputs, _ = _run_batch(model, batch, device)
            loss = outputs.loss
            train_loss += loss.item()

            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        predictions = []
        actual_labels = []

        with torch.no_grad():
            for batch in val_loader:
                outputs, labels = _run_batch(model, batch, device)

                val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                predictions.extend(preds.cpu().numpy())
                actual_labels.extend(labels.cpu().numpy())

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {train_loss / max(len(train_loader), 1)}')
        print(f'Average validation loss: {val_loss / max(len(val_loader), 1)}')
        if actual_labels:
            # Only report when there is validation data to score.
            print('Classification Report:')
            print(classification_report(actual_labels, predictions))

def predict_crime_type(text, model, tokenizer, label_encoder, device, max_len=128):
    """Classify a single text and return its label with the model's confidence.

    Args:
        text: Input to classify; converted with ``str()`` before tokenizing,
            matching what ``CrimeDataset`` does during training.
        model: Callable accepting ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` and returning an object with ``logits``.
        tokenizer: Object exposing ``encode_plus``.
        label_encoder: Object exposing ``inverse_transform`` that maps class
            ids back to labels.
        device: Torch device the encoded tensors are moved to.
        max_len: ``max_length`` used for padding/truncation. Should match the
            value used when the model was trained (default 128).

    Returns:
        Tuple ``(label, confidence)`` where ``confidence`` is the softmax
        probability of the predicted class as a Python float.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        str(text),
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
            token_type_ids=encoding['token_type_ids'].to(device)
        )

        probs = torch.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probs, dim=1)

    # A single text is encoded, so row 0 is the only row in the batch.
    label = label_encoder.inverse_transform(predicted_class.cpu().numpy())[0]
    confidence = probs[0, predicted_class[0]].item()
    return label, confidence

# Add this to your existing code after loading the data
def setup_crime_prediction_model(data):
    """Build, train and return a BERT classifier for crime descriptions.

    The input text for each row is ``"<City> <Location> <Time>"`` and the
    target is the row's ``'Crime Description'``.

    Args:
        data: Table (presumably a pandas DataFrame) with the columns
            ``'City'``, ``'Location'``, ``'Time'`` and ``'Crime Description'``.

    Returns:
        Tuple ``(model, tokenizer, label_encoder, device)``.
    """
    # Prepare the data. Every column is cast to str so non-string values do
    # not break the concatenation, and the result is converted to a plain
    # list: train_test_split keeps a Series' original (now shuffled) index,
    # which does not line up with the positional indexing CrimeDataset uses.
    features = (
        data['City'].astype(str)
        + ' ' + data['Location'].astype(str)
        + ' ' + data['Time'].astype(str)
    ).tolist()
    labels = data['Crime Description']

    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)

    # Split the data
    X_train, X_val, y_train, y_val = train_test_split(
        features, encoded_labels, test_size=0.2, random_state=42
    )

    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_encoder.classes_)
    )

    # Prepare datasets
    train_dataset = CrimeDataset(X_train, y_train, tokenizer)
    val_dataset = CrimeDataset(X_val, y_val, tokenizer)

    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)

    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Train the model
    print("Training the crime prediction model...")
    train_model(model, train_loader, val_loader, device)

    return model, tokenizer, label_encoder, device

# Add this to your main code
if __name__ == "__main__":
    # Imported here because this snippet uses them but the imports at the
    # top of the file do not provide them.
    import json
    import os

    # Your existing code...
    # NOTE: `data` and `output_dir` must be defined by the code above.

    # Set up and train the prediction model
    model, tokenizer, label_encoder, device = setup_crime_prediction_model(data)

    # Save the trained model
    model_path = os.path.join(output_dir, 'crime_prediction_model')
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    # Also persist the id -> label mapping; without it a reloaded model can
    # only produce class ids, not crime descriptions.
    with open(os.path.join(model_path, 'label_classes.json'), 'w', encoding='utf-8') as f:
        json.dump(label_encoder.classes_.tolist(), f, ensure_ascii=False, indent=2)

    # Example prediction
    sample_input = "Mumbai Andheri West 14:00"
    predicted_crime, confidence = predict_crime_type(
        sample_input, model, tokenizer, label_encoder, device
    )
    print("\nSample Prediction:")
    print(f"Input: {sample_input}")
    print(f"Predicted Crime Type: {predicted_crime}")
    print(f"Confidence: {confidence:.2%}")

    # Convenience wrapper around predict_crime_type for the trained model.
    # NOTE: it is defined inside the __main__ guard, so it only exists when
    # this file is run as a script; importing the module does not expose it.
    def predict_crime(city, location, time):
        input_text = f"{city} {location} {time}"
        return predict_crime_type(input_text, model, tokenizer, label_encoder, device)

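# Cell output: ModuleNotFoundError: No module named 'transformers'
# (the Hugging Face library is not installed; install it with `pip install transformers`)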