In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from collections import Counter
import requests


In [158]:
df = pd.read_excel('Cleaned_CrimeAgainstWomen_2022.xlsx')
df.drop(df.columns[2:53], axis=1, inplace=True)

In [159]:
#drop rowns with NaN
df = df.dropna()
#drop the first column
df.drop(df.columns[0], axis=1, inplace=True)

In [160]:
#rename the columns
df.columns = ['District', 'Total_Cases']

In [161]:
def geocode_district(district_name, api_key):
    try:
        url = f"https://graphhopper.com/api/1/geocode?q={district_name}&key={api_key}"
        response = requests.get(url)
        if response.status_code == 200:
            data = response.json()
            if data["hits"]:
                print(district_name, data["hits"][0]["point"]["lat"], data["hits"][0]["point"]["lng"])
                return data["hits"][0]["point"]["lat"], data["hits"][0]["point"]["lng"]
        return None, None
    except:
        return None, None

In [162]:
API_KEY = 'b43ba286-2b19-46c3-8066-4bf24159527f'

In [163]:
df[["Latitude", "Longitude"]] = df["District"].apply(
    lambda x: pd.Series(geocode_district(x, API_KEY))
)

Alluri Sitharama Raju 17.88592535 82.21792795528634
Anakapalli 17.671533099999998 82.77475669632513
Anantapuramu 13.1512291 79.172475
Annamayya 13.896238400000001 78.6534801878519
Bapatla 15.9052608 80.4680501
Chittoor 13.2161224 79.0972819
Dr BR Ambedkar Konaseema 16.58556 82.0185021
East Godavari 16.99566385 81.71543778889063
Eluru 16.7104257 81.1153816
Guntakal Railway 15.172825 77.3664764
Guntur 16.2915189 80.4541588
Kakinada 16.9437385 82.2350607
Krishna 12.5188835 78.2206536
Kurnool 15.8309251 78.0425373
Nandyal 15.4736293 78.4806592
NTR 16.816911750000003 80.25022993069638
Palnadu 16.32310375 79.69750488987478
Parvathipuram Manyam 18.783186 83.4278759
Prakasam 15.66426435 79.47330014039173
Sri Potti Sriramulu Nellore 14.7174388 79.66050570996853
Sri Sathya Sai 14.1497362 77.78291847580479
Srikakulam 18.2949307 83.8938844
Tirupati 13.6316368 79.4231711
Viiayawada Railway 16.5179683 80.6195724
Visakhapatnam 17.6935526 83.2921297
Vizianagaram 18.1141255 83.4114389
West Godavari 16.

In [164]:
df

Unnamed: 0,District,Total_Cases,Latitude,Longitude
1,Alluri Sitharama Raju,138.0,17.885925,82.217928
2,Anakapalli,709.0,17.671533,82.774757
3,Anantapuramu,709.0,13.151229,79.172475
4,Annamayya,611.0,13.896238,78.653480
5,Bapatla,864.0,15.905261,80.468050
...,...,...,...,...
997,Kargil,7.0,34.559305,76.125577
998,Leh,8.0,34.164203,77.584813
1001,Lakshadweep,16.0,10.333731,72.920539
1004,Karaikal,0.0,10.915710,79.837576


In [165]:
df

Unnamed: 0,District,Total_Cases,Latitude,Longitude
1,Alluri Sitharama Raju,138.0,17.885925,82.217928
2,Anakapalli,709.0,17.671533,82.774757
3,Anantapuramu,709.0,13.151229,79.172475
4,Annamayya,611.0,13.896238,78.653480
5,Bapatla,864.0,15.905261,80.468050
...,...,...,...,...
997,Kargil,7.0,34.559305,76.125577
998,Leh,8.0,34.164203,77.584813
1001,Lakshadweep,16.0,10.333731,72.920539
1004,Karaikal,0.0,10.915710,79.837576


In [166]:
original_df = df.copy()

In [168]:
from sklearn.preprocessing import QuantileTransformer

# Quantile scaling to 0-10
qt = QuantileTransformer(n_quantiles=100, output_distribution='uniform')
df["Danger"] = qt.fit_transform(df[["Total_Cases"]]) * 10  # Scale to 0-10

In [175]:
from sklearn.preprocessing import StandardScaler

# Use a DataFrame with column names
X = df[["Latitude", "Longitude"]]  # DataFrame with columns "Latitude" and "Longitude"

# Fit the scaler (retains feature names)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)  # Now knows feature names are "Latitude" and "Longitude"

In [176]:
from sklearn.model_selection import train_test_split

# Split the DataFrame (preserve column names)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit on DataFrame with column names
X_test_scaled = scaler.transform(X_test)

In [181]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, max_depth=5)
rf.fit(X_train, y_train)
print("Random Forest R²:", rf.score(X_test, y_test))

Random Forest R²: 0.17594295675682114


In [182]:
def predict_danger(lat, lng):
    # Create a DataFrame with the same column names
    new_data = pd.DataFrame([[lat, lng]], columns=["Latitude", "Longitude"])
    
    # Scale the input (retains feature names)
    scaled_data = scaler.transform(new_data)  # No warning now!
    
    # Predict danger level
    danger = rf.predict(scaled_data)[0]
    return danger

In [185]:
prediction = predict_danger(34.1526, 77.5771)
print(f"Danger Level: {prediction:.2f}")

Danger Level: 4.04




In [186]:
import joblib

In [187]:
#Export the rf model using joblib
joblib.dump(rf, "CrimeLocationPrediction.pkl")

['CrimeLocationPrediction.pkl']