In [7]:
!pip install pandas folium matplotlib seaborn numpy requests google-api-core google-generativeai

import os
import sys
import pandas as pd
import folium
from folium.plugins import HeatMap
import matplotlib.pyplot as plt
import time
import json
from google.api_core import retry
import google.generativeai as genai
import seaborn as sns
import numpy as np
import requests

# Define the output directory
output_dir = r'./maps'

# Step 1: Create the output directory if it doesn't exist
if not os.path.exists(output_dir):
    try:
        os.makedirs(output_dir)
        print(f"Output directory created: {output_dir}")
    except Exception as e:
        print(f"Error creating output directory: {e}")
        sys.exit(1)
else:
    print(f"Output directory already exists: {output_dir}")

# Step 2: Load the dataset from SheetDB API
API_URL = "https://sheetdb.io/api/v1/uvztmda65pyb0"

def get_data_from_sheetdb():
    response = requests.get(API_URL)
    if response.status_code == 200:
        return pd.DataFrame(response.json())
    else:
        print(f"Error fetching data: {response.status_code}")
        sys.exit(1)

data = get_data_from_sheetdb()
print("Data loaded successfully.")
print(data.head())

# Display column names and check for missing values
print("\nColumns in the dataset:")
print(data.columns)

print("\nMissing values:")
print(data.isnull().sum())

# Step 3: Add latitude and longitude
# Gemini API Key
GOOGLE_API_KEY = "AIzaSyATXE22og8-HoroqLF9J5wlb1l58aHOhhU"

# Configure Google Generative AI
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-pro')

# Implement caching
cache_file = 'city_coordinates_cache.json'
try:
    with open(cache_file, 'r') as f:
        city_coordinates = json.load(f)
except FileNotFoundError:
    city_coordinates = {}

@retry.Retry(predicate=retry.if_exception_type(Exception))
def get_gemini_response(question):
    response = model.generate_content(question)
    return response.text

def get_lat_long_from_gemini(city_name):
    """Function to get latitude and longitude from Gemini API with caching and rate limiting"""
    if city_name in city_coordinates:
        return city_coordinates[city_name]

    try:
        response = get_gemini_response(f"Provide only the latitude and longitude coordinates for {city_name}, India. Format the response as two decimal numbers separated by a comma.")
        lat, lon = map(float, response.split(','))
        city_coordinates[city_name] = (lat, lon)
        
        # Save updated cache
        with open(cache_file, 'w') as f:
            json.dump(city_coordinates, f)
        
        return lat, lon
    except Exception as e:
        print(f"Error fetching coordinates for {city_name}: {e}")
        return None, None

# Populate the city_coordinates dictionary
for city in data['City'].unique():
    if city not in city_coordinates:
        lat, lon = get_lat_long_from_gemini(city)
        if lat and lon:
            city_coordinates[city] = (lat, lon)
        time.sleep(1)  # Add a 1-second delay between API calls

# Add Latitude and Longitude columns
data['Latitude'] = data['City'].map(lambda x: city_coordinates.get(x, (None, None))[0])
data['Longitude'] = data['City'].map(lambda x: city_coordinates.get(x, (None, None))[1])

# Calculate Crime Rate (assuming it's based on the number of crimes per city)
crime_counts = data['City'].value_counts()
data['Crime Rate'] = data['City'].map(crime_counts)

# Prepare data for HeatMap
heat_data = [[row['Latitude'], row['Longitude'], row['Crime Rate']] for index, row in data.iterrows() if pd.notnull(row['Latitude']) and pd.notnull(row['Longitude'])]

# Create a base map centered on India
m = folium.Map(location=[20.5937, 78.9629], zoom_start=5)
HeatMap(heat_data).add_to(m)

# Add HeatMap layer
HeatMap(heat_data).add_to(m)

# Step 4: Save the map to an HTML file
try:
    map_path = os.path.join(output_dir, 'crime_heatmap.html')
    m.save(map_path)
    print(f"Heatmap saved to: {map_path}")
except Exception as e:
    print(f"Error saving heatmap: {e}")

# Step 5: Identify top 10 cities with highest crime rates
top_10_cities = data.groupby('City')['Crime Rate'].mean().nlargest(10).reset_index()

# Create a bar plot for top 10 cities
plt.figure(figsize=(12, 6))
plt.bar(top_10_cities['City'], top_10_cities['Crime Rate'])
plt.title('Top 10 Cities with Highest Crime Rates')
plt.xlabel('City')
plt.ylabel('Crime Rate')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Step 6: Save the top 10 cities plot
try:
    cities_path = os.path.join(output_dir, 'top_10_crime_cities.png')
    plt.savefig(cities_path)
    plt.close()
    print(f"Top 10 crime cities plot saved to: {cities_path}")
except Exception as e:
    print(f"Error saving top 10 crime cities plot: {e}")

# Step 7: Additional analysis based on the new data structure
# Crime type distribution
crime_type_dist = data['Crime Description'].value_counts().nlargest(10)
plt.figure(figsize=(12, 6))
crime_type_dist.plot(kind='bar')
plt.title('Top 10 Most Common Crime Types')
plt.xlabel('Crime Type')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

try:
    crime_type_path = os.path.join(output_dir, 'top_10_crime_types.png')
    plt.savefig(crime_type_path)
    plt.close()
    print(f"Top 10 crime types plot saved to: {crime_type_path}")
except Exception as e:
    print(f"Error saving top 10 crime types plot: {e}")

# Victim age distribution (improved version)
plt.figure(figsize=(12, 8))
sns.histplot(data['Victim Age'].astype(float), bins=20, kde=True, color="skyblue", edgecolor="darkblue")

plt.title("Distribution of Victim Ages", fontsize=16, fontweight='bold')
plt.xlabel("Age", fontsize=12)
plt.ylabel("Frequency", fontsize=12)

# Add summary statistics
mean_age = data['Victim Age'].astype(float).mean()
median_age = data['Victim Age'].astype(float).median()
plt.axvline(mean_age, color='red', linestyle='dashed', linewidth=2, label=f'Mean Age: {mean_age:.1f}')
plt.axvline(median_age, color='green', linestyle='dashed', linewidth=2, label=f'Median Age: {median_age:.1f}')

plt.legend(fontsize=10)

# Add text for additional statistics
plt.text(0.95, 0.95, f"Total Victims: {len(data)}\nStd Dev: {data['Victim Age'].astype(float).std():.2f}", 
         transform=plt.gca().transAxes, ha='right', va='top', 
         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))

plt.tight_layout()

try:
    age_dist_path = os.path.join(output_dir, 'victim_age_distribution_improved.png')
    plt.savefig(age_dist_path)
    plt.close()
    print(f"Improved victim age distribution plot saved to: {age_dist_path}")
except Exception as e:
    print(f"Error saving improved victim age distribution plot: {e}")

print(f"Analysis complete. Check {output_dir} for available visualizations.")
import os
os.getcwd()

from IPython.display import IFrame
IFrame(src="maps/crime_heatmap.html", width=800, height=600)




Output directory already exists: ./maps
Data loaded successfully.
  Timestamp Report Number       City Crime Code Crime Description Victim Age  \
0                       1  Ahmedabad        576    IDENTITY THEFT         16   
1                       2    Chennai        128          HOMICIDE         37   
2                       3   Ludhiana        271        KIDNAPPING         48   
3                       4       Pune        170          BURGLARY         49   
4                       5       Pune        421         VANDALISM         30   

  Victim Gender   Weapon Used   Crime Domain Police Deployed Case Closed  
0             M  Blunt Object  Violent Crime              13          No  
1             M        Poison    Other Crime               9          No  
2             F  Blunt Object    Other Crime              15          No  
3             F       Firearm    Other Crime               1         Yes  
4             F         Other    Other Crime              18         Yes  

Co

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import requests
import os

def load_crime_data():
    """Load crime data from the Google Sheets URL"""
    # Google Sheets URL to CSV export
    SHEET_ID = "1Pjtr4PO1BOBiQRRZoJNbin2RBuFLmy-ryZSx7WtK0CA"
    SHEET_NAME = "Form responses 1"
    url = f"https://docs.google.com/spreadsheets/d/{SHEET_ID}/export?format=csv&gid=630647404"
    
    try:
        # Read the CSV data directly from the URL
        df = pd.read_csv(url)
        
        # Print available columns for debugging
        print("Available columns in the dataset:")
        print(df.columns.tolist())
        
        # Basic data cleaning
        # Remove any completely empty rows
        df = df.dropna(how='all')
        
        # Fill missing values for text columns with 'Unknown'
        text_columns = df.select_dtypes(include=['object']).columns
        df[text_columns] = df[text_columns].fillna('Unknown')
        
        return df
    
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # Updated to use PyTorch's AdamW
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import classification_report
import requests
import os

class CrimeDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Convert pandas Series to list to avoid indexing issues
        self.texts = texts.values if isinstance(texts, pd.Series) else texts
        self.labels = labels.values if isinstance(labels, pd.Series) else labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def train_model(model, train_loader, val_loader, device, epochs=3):
    optimizer = AdamW(model.parameters(), lr=2e-5)
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                labels=labels
            )
            
            loss = outputs.loss
            train_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            
            if (batch_idx + 1) % 100 == 0:
                print(f'Epoch {epoch + 1}, Batch {batch_idx + 1}, Loss: {loss.item():.4f}')
        
        # Validation
        model.eval()
        val_loss = 0
        predictions = []
        actual_labels = []
        
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                labels = batch['labels'].to(device)
                
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    labels=labels
                )
                
                val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                predictions.extend(preds.cpu().numpy())
                actual_labels.extend(labels.cpu().numpy())
        
        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {train_loss / len(train_loader)}')
        print(f'Average validation loss: {val_loss / len(val_loader)}')
        print('Classification Report:')
        print(classification_report(
            actual_labels, 
            predictions, 
            zero_division=0,  # Handle zero-division case
            digits=4  # Increase precision in the report
        ))

def setup_crime_prediction_model(data):
    """Set up and train the BERT model with the available columns"""
    # Print data info for debugging
    print("\nDataset Info:")
    print(data.info())
    
    # Identify available columns for features
    time_col = 'Timestamp'
    crime_type_col = 'Crime Description'
    city_col = 'City'
    
    print("\nUsing columns:")
    print(f"Time column: {time_col}")
    print(f"Crime type column: {crime_type_col}")
    print(f"City column: {city_col}")
    
    # Prepare the features using available columns
    features = data[city_col].astype(str)
    if time_col in data.columns:
        features = features + ' ' + data[time_col].astype(str)
    
    if 'Crime Domain' in data.columns:
        features = features + ' ' + data['Crime Domain'].astype(str)
    
    # Prepare labels
    labels = data[crime_type_col]
    
    # Print class distribution
    print("\nClass distribution:")
    print(labels.value_counts())
    
    # Encode labels
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    
    # Use stratified split to maintain class distribution
    X_train, X_val, y_train, y_val = train_test_split(
        features, 
        encoded_labels, 
        test_size=0.2, 
        random_state=42,
        stratify=encoded_labels  # Ensure balanced split
    )
    
    # Initialize tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(label_encoder.classes_),
        # Add class weights to handle imbalance
        problem_type="single_label_classification"
    )
    
    # Calculate class weights
    class_counts = np.bincount(encoded_labels)
    class_weights = 1. / torch.tensor(class_counts, dtype=torch.float)
    class_weights = class_weights / class_weights.sum()
    class_weights = class_weights.to(device)
    
    # Add class weights to the model's config
    model.config.class_weights = class_weights.tolist()
    
    # Create datasets with balanced sampling
    train_dataset = CrimeDataset(X_train, y_train, tokenizer)
    val_dataset = CrimeDataset(X_val, y_val, tokenizer)
    
    # Calculate sample weights for training data
    train_sample_weights = [class_weights[label].item() for label in y_train]
    
    # Create weighted sampler for training data
    train_sampler = torch.utils.data.WeightedRandomSampler(
        weights=train_sample_weights,
        num_samples=len(train_sample_weights),
        replacement=True
    )
    
    # Create data loaders with weighted sampler for training
    train_loader = DataLoader(
        train_dataset,
        batch_size=16,
        sampler=train_sampler,  # Use weighted sampler
        num_workers=0
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=16,
        shuffle=False,
        num_workers=0
    )
    
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    
    # Train the model
    print("\nTraining the crime prediction model...")
    train_model(model, train_loader, val_loader, device)
    
    return model, tokenizer, label_encoder, device

def predict_crime_type(text, model, tokenizer, label_encoder, device):
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        token_type_ids = encoding['token_type_ids'].to(device)
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        
        probs = torch.softmax(outputs.logits, dim=1)
        predicted_class = torch.argmax(probs, dim=1)
        
    return label_encoder.inverse_transform(predicted_class.cpu().numpy())[0], probs[0][predicted_class].item()

if __name__ == "__main__":
    # Define the output directory
    output_dir = 'crime_prediction_output'
    os.makedirs(output_dir, exist_ok=True)
    
    # Load the data and print column names
    print("Loading crime data...")
    data = load_crime_data()
    print("\nData shape:", data.shape)
    print("\nSample data:")
    print(data.head())
    
    # Set up and train the prediction model
    model, tokenizer, label_encoder, device = setup_crime_prediction_model(data)
    
    # Save the trained model
    model_path = os.path.join(output_dir, 'crime_prediction_model')
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    
    # Example prediction
    sample_input = "Mumbai Andheri West 14:00"
    predicted_crime, confidence = predict_crime_type(
        sample_input, model, tokenizer, label_encoder, device
    )
    print(f"\nSample Prediction:")
    print(f"Input: {sample_input}")
    print(f"Predicted Crime Type: {predicted_crime}")
    print(f"Confidence: {confidence:.2%}")

Loading crime data...
Available columns in the dataset:
['Timestamp', 'Report Number', 'City', 'Crime Code', 'Crime Description', 'Victim Age', 'Victim Gender', 'Weapon Used', 'Crime Domain', 'Police Deployed', 'Case Closed']

Data shape: (40161, 11)

Sample data:
  Timestamp  Report Number       City  Crime Code Crime Description  \
0   Unknown              1  Ahmedabad         576    IDENTITY THEFT   
1   Unknown              2    Chennai         128          HOMICIDE   
2   Unknown              3   Ludhiana         271        KIDNAPPING   
3   Unknown              4       Pune         170          BURGLARY   
4   Unknown              5       Pune         421         VANDALISM   

   Victim Age Victim Gender   Weapon Used   Crime Domain  Police Deployed  \
0          16             M  Blunt Object  Violent Crime               13   
1          37             M        Poison    Other Crime                9   
2          48             F  Blunt Object    Other Crime               15   


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


UnboundLocalError: local variable 'device' referenced before assignment