Data PreProcessing


In [None]:
import pandas as pd
import os
from pathlib import Path

# Load data
project_root = Path(os.getcwd()).resolve().parent
data_path = project_root / "data" / "raw" / "crime_dataset_india.csv"

df = pd.read_csv(data_path)

# Basic cleaning
df.columns = df.columns.str.strip()
df["City"] = df["City"].str.strip().str.title()  # Normalize city names

# Crime count per city
crime_city = df["City"].value_counts().reset_index()
crime_city.columns = ["City", "Crime Count"]

print(crime_city.head())

In [None]:
import pandas as pd
from pathlib import Path

def preprocess_crime_data(raw_path, save_path):
    # Load raw data
    df = pd.read_csv(raw_path)

    # Convert dates & extract year/month
    df['Date of Occurrence'] = pd.to_datetime(df['Date of Occurrence'], errors='coerce')
    df = df.dropna(subset=['Date of Occurrence'])
    df['Year'] = df['Date of Occurrence'].dt.year
    df['Month'] = df['Date of Occurrence'].dt.month

    # Normalize city names
    df['City'] = df['City'].str.strip().str.title()

    # City to State mapping (add or adjust if needed)
    city_to_state = {
        "Ahmedabad": "Gujarat",
        "Chennai": "Tamil Nadu",
        "Ludhiana": "Punjab",
        "Pune": "Maharashtra",
        "Delhi": "Delhi",
        "Mumbai": "Maharashtra",
        "Bangalore": "Karnataka",
        "Hyderabad": "Telangana",
        "Kolkata": "West Bengal",
        "Jaipur": "Rajasthan",
        "Lucknow": "Uttar Pradesh",
        "Kanpur": "Uttar Pradesh",
        "Surat": "Gujarat",
        "Nagpur": "Maharashtra",
        "Agra": "Uttar Pradesh",
        "Visakhapatnam": "Andhra Pradesh",
        "Thane": "Maharashtra",
        "Ghaziabad": "Uttar Pradesh",
        "Indore": "Madhya Pradesh",
        "Patna": "Bihar",
        "Bhopal": "Madhya Pradesh",
        "Meerut": "Uttar Pradesh",
        "Srinagar": "Jammu and Kashmir",
        "Nashik": "Maharashtra",
        "Vasai": "Maharashtra",
        "Varanasi": "Uttar Pradesh",
        "Kalyan": "Maharashtra",
        "Faridabad": "Haryana",
        "Rajkot": "Gujarat",
    }
    df['State'] = df['City'].map(city_to_state)

    # Drop rows with unmapped states
    df = df.dropna(subset=['State'])

    # Aggregate: count crimes by State, Year, Month, Crime Description
    agg_df = (
        df.groupby(['State', 'Year', 'Month', 'Crime Description'])
        .size()
        .reset_index(name='Crime Count')
    )

    # Save processed data for visualization
    save_path.parent.mkdir(parents=True, exist_ok=True)
    agg_df.to_csv(save_path, index=False)

    print(f"Processed data saved to {save_path}")


raw_path = project_root / "data" / "raw" / "crime_dataset_india.csv"
save_path = project_root / "data" / "processed" / "crime_data_processed.csv"

preprocess_crime_data(raw_path, save_path)


Visualization Testing

In [None]:
import json
import pandas as pd
import plotly.express as px

# Load the aggregated data
prepared_data_set = project_root / "data" / "processed" / "crime_data_processed.csv"
df = pd.read_csv(prepared_data_set)


geojson_loc = project_root / "json" / "cleaned.geojson"
with open(geojson_loc) as f:
    india_states = json.load(f)

all_states = [feature["properties"]["st_nm"] for feature in india_states["features"]]
all_states_df = pd.DataFrame({"State": all_states})

# print(india_states['features'][0]['properties'])
# Filter data example

year_filter = 2020
month_filter = 1
crime_filter = 'BURGLARY'

df_filtered = df[
    (df['Year'] == year_filter) &
    (df['Month'] == month_filter) &
    (df['Crime Description'] == crime_filter)
]

merged_df = all_states_df.merge(df_filtered, on="State", how="left").fillna(0)
merged_df["Crime Count"] = merged_df["Crime Count"].astype(int)

fig = px.choropleth(
    merged_df,              # Loading data
    geojson=india_states,   # Loading India states geojson
    locations='State',
    featureidkey='properties.st_nm',       
    color='Crime Count',
    color_continuous_scale='Reds',
    scope='asia',
    title=f'{crime_filter} Cases in {year_filter}-{month_filter:02d}'
)

fig.update_geos(fitbounds="locations", visible=False)
fig.show()


New NCRB dataset 

In [1]:
import pandas as pd
import os
from pathlib import Path

# Load data
project_root = Path(os.getcwd()).resolve().parent
data_path = project_root / "data" / "raw" / "NCRB_Master_2020_2023.xlsx"

df = pd.read_excel(data_path)

df.drop(columns=["Annual Total"])

df_long = df.melt(
    id_vars=["State", "year", "Crime Category"],
    value_vars=["Jan","Feb","Mar","Apr","May","Jun",
                "Jul","Aug","Sep","Oct","Nov","Dec"],
    var_name="Month",
    value_name="Crime Count"

)
df_long.head(10)

Unnamed: 0,State,year,Crime Category,Month,Crime Count
0,Andhra Pradesh,2020,Arson,Jan,0
1,Andhra Pradesh,2020,Assault,Jan,55200
2,Andhra Pradesh,2020,Burglary,Jan,9900
3,Andhra Pradesh,2020,Counter Feiting,Jan,36500
4,Andhra Pradesh,2020,Cyber Crime,Jan,38900
5,Andhra Pradesh,2020,Domestic Violence,Jan,10600
6,Andhra Pradesh,2020,Drug Offence,Jan,17300
7,Andhra Pradesh,2020,Extortion,Jan,35800
8,Andhra Pradesh,2020,FireArm Offence,Jan,1900
9,Andhra Pradesh,2020,Fraud,Jan,16000


In [4]:
df.keys()

Index(['year', 'State', 'Crime Category', 'Jan', 'Feb', 'Mar', 'Apr', 'May',
       'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Annual Total'],
      dtype='object')

In [6]:
ap_jan_2020_total = (
    df_long[
        (df_long["State"] == "Andhra Pradesh") &
        (df_long["year"] == 2023) &
        (df_long["Month"] == 'Jan')
    ]["Crime Count"]
    .sum()
)

print("Total crimes in Andhra Pradesh (Jan 2020):", ap_jan_2020_total)

Total crimes in Andhra Pradesh (Jan 2020): 450100


In [19]:
month_map = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4,
    "May": 5, "Jun": 6, "Jul": 7, "Aug": 8,
    "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12
}

df_long["Month"] = df_long["Month"].map(month_map)

In [20]:
df_long = df_long.rename(columns={
    "year": "Year",
    "Crime Category": "Crime Description"
})

In [21]:
df_long["Crime Description"] = (
    df_long["Crime Description"]
    .str.upper()
    .str.replace(" ", "_", regex=False)
)

In [22]:
df_long = df_long[
    ["State", "Year", "Month", "Crime Description", "Crime Count"]
]

In [23]:
df_long = df_long[df_long["Crime Count"] > 0]

In [24]:
df_long.head(10)

Unnamed: 0,State,Year,Month,Crime Description,Crime Count
1,Andhra Pradesh,2020,1,ASSAULT,55200
2,Andhra Pradesh,2020,1,BURGLARY,9900
3,Andhra Pradesh,2020,1,COUNTER_FEITING,36500
4,Andhra Pradesh,2020,1,CYBER_CRIME,38900
5,Andhra Pradesh,2020,1,DOMESTIC_VIOLENCE,10600
6,Andhra Pradesh,2020,1,DRUG_OFFENCE,17300
7,Andhra Pradesh,2020,1,EXTORTION,35800
8,Andhra Pradesh,2020,1,FIREARM_OFFENCE,1900
9,Andhra Pradesh,2020,1,FRAUD,16000
10,Andhra Pradesh,2020,1,HOMICIDE,6900


In [27]:
df_long["State"] = df_long["State"].replace({
    "Chattisgarh": "Chhattisgarh"
})
df_long["State"] = df_long["State"].replace({
    "Jarkhand": "Jharkhand"
})
df_long["State"] = df_long["State"].replace({
    "Gujarath": "Gujarat"
})
df_long["State"] = df_long["State"].replace({
    "Maharstra": "Maharashtra"
})

In [29]:
print(df_long["State"].unique())

['Andhra Pradesh' 'Arunachal Pradesh' 'Assam' 'Bihar' 'Chhattisgarh' 'Goa'
 'Gujarat' 'Haryana' 'Himachal Pradesh' 'Jammu & Kashmir' 'Jharkhand'
 'Karnataka' 'Kerala' 'Ladakh' 'Madhya Pradesh' 'Maharashtra' 'Manipur'
 'Meghalaya' 'Mizoram' 'Nagaland' 'Odisha' 'Punjab' 'Rajasthan' 'Sikkim'
 'Tamil Nadu' 'Telangana' 'Tripura' 'Uttar Pradesh' 'Uttarakhand'
 'West Bengal']


In [30]:
save_path = project_root / "data" / "processed" / "crime_ncrb_processed.csv"
df_long.to_csv(save_path, index=False)