In [1]:
import pandas as pd 
import numpy as np
from datetime import datetime

In [2]:
# Load the raw crime records; the path is relative to the notebook's working directory.
raw_csv_path = "crime_dataset_india.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows to check how the columns were read.
df.head()

Unnamed: 0,Report Number,Date Reported,Date of Occurrence,Time of Occurrence,City,Crime Code,Crime Description,Victim Age,Victim Gender,Weapon Used,Crime Domain,Police Deployed,Case Closed,Date Case Closed
0,1,02-01-2020 00:00,01-01-2020 00:00,01-01-2020 01:11,Ahmedabad,576,IDENTITY THEFT,16,M,Blunt Object,Violent Crime,13,No,
1,2,01-01-2020 19:00,01-01-2020 01:00,01-01-2020 06:26,Chennai,128,HOMICIDE,37,M,Poison,Other Crime,9,No,
2,3,02-01-2020 05:00,01-01-2020 02:00,01-01-2020 14:30,Ludhiana,271,KIDNAPPING,48,F,Blunt Object,Other Crime,15,No,
3,4,01-01-2020 05:00,01-01-2020 03:00,01-01-2020 14:46,Pune,170,BURGLARY,49,F,Firearm,Other Crime,1,Yes,29-04-2020 05:00
4,5,01-01-2020 21:00,01-01-2020 04:00,01-01-2020 16:51,Pune,421,VANDALISM,30,F,Other,Other Crime,18,Yes,08-01-2020 21:00


In [3]:
# Rename the columns to snake_case (lower-case words joined by underscores).

original_columns = [
    'Report Number',
    'Date Reported',
    'Date of Occurrence',
    'Time of Occurrence',
    'City',
    'Crime Code',
    'Crime Description',
    'Victim Age',
    'Victim Gender',
    'Weapon Used',
    'Crime Domain',
    'Police Deployed',
    'Case Closed',
    'Date Case Closed',
]

# e.g. 'Date of Occurrence' -> 'date_of_occurrence'
snake_case_names = {
    column: column.lower().replace(' ', '_') for column in original_columns
}

# rename() returns a new frame, so the raw `df` stays untouched.
data = df.rename(columns=snake_case_names)

In [4]:
# Use the report number as the row index.
# The guard keeps this cell re-runnable: once 'report_number' has become the
# index it is no longer a column, so an unconditional set_index would raise
# KeyError on a second execution.
if data.index.name != "report_number":
    data = data.set_index("report_number")

# Show a random handful of rows; the fixed seed makes the preview reproducible
# across Restart & Run All.
data.sample(5, random_state=42)

Unnamed: 0_level_0,date_reported,date_of_occurrence,time_of_occurrence,city,crime_code,crime_description,victim_age,victim_gender,weapon_used,crime_domain,police_deployed,case_closed,date_case_closed
report_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
904,07-02-2020 22:00,02-07-2020 15:00,07-02-2020 22:02,Lucknow,441,TRAFFIC VIOLATION,62,M,Other,Traffic Fatality,15,Yes,19-02-2020 22:00
34108,23-11-2023 01:00,11-22-2023 03:00,22-11-2023 11:32,Chennai,491,HOMICIDE,23,M,Other,Other Crime,7,No,
6072,11-09-2020 22:00,09-09-2020 23:00,10-09-2020 14:08,Chennai,201,CYBERCRIME,63,M,Firearm,Other Crime,13,No,
11424,23-04-2021 03:00,04-20-2021 23:00,21-04-2021 06:38,Faridabad,117,VEHICLE - STOLEN,10,M,Explosives,Violent Crime,19,No,
402,18-01-2020 05:00,01-17-2020 17:00,18-01-2020 03:18,Agra,169,BURGLARY,36,F,,Other Crime,12,Yes,29-04-2020 05:00


In [5]:
# Print each column's dtype and non-null count (date columns are still strings here).
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40160 entries, 1 to 40160
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   date_reported       40160 non-null  object
 1   date_of_occurrence  40160 non-null  object
 2   time_of_occurrence  40160 non-null  object
 3   city                40160 non-null  object
 4   crime_code          40160 non-null  int64 
 5   crime_description   40160 non-null  object
 6   victim_age          40160 non-null  int64 
 7   victim_gender       40160 non-null  object
 8   weapon_used         34370 non-null  object
 9   crime_domain        40160 non-null  object
 10  police_deployed     40160 non-null  int64 
 11  case_closed         40160 non-null  object
 12  date_case_closed    20062 non-null  object
dtypes: int64(3), object(10)
memory usage: 4.3+ MB


In [6]:
# List the column labels after renaming.
print("Column Names : {}".format(data.columns))

Column Names : Index(['date_reported', 'date_of_occurrence', 'time_of_occurrence', 'city',
       'crime_code', 'crime_description', 'victim_age', 'victim_gender',
       'weapon_used', 'crime_domain', 'police_deployed', 'case_closed',
       'date_case_closed'],
      dtype='object')


In [7]:
# Report (rows, columns); the double space matches print()'s default separator.
print(f"dataset shape =  {data.shape}")

dataset shape =  (40160, 13)


# Data Cleaning and Preprocessing

In [8]:
# Count the missing values in each column.
data.isna().sum()

date_reported             0
date_of_occurrence        0
time_of_occurrence        0
city                      0
crime_code                0
crime_description         0
victim_age                0
victim_gender             0
weapon_used            5790
crime_domain              0
police_deployed           0
case_closed               0
date_case_closed      20098
dtype: int64

In [9]:
# Accepted layouts; each parser tries its formats in order ('/' first, then '-').
_DAY_FIRST_FORMATS = ("%d/%m/%Y %H:%M", "%d-%m-%Y %H:%M")
_MONTH_FIRST_FORMATS = ("%m/%d/%Y %H:%M", "%m-%d-%Y %H:%M")


def _parse_with_formats(date_str, formats):
    """Shared implementation: parse `date_str` with the first matching format.

    Parameters
    ----------
    date_str : object
        Value taken from a date column. Strings are parsed; datetime-like
        values are passed through; anything else is treated as missing.
    formats : iterable of str
        `strptime` format strings, tried in order.

    Returns
    -------
    datetime.datetime or pd.NaT
        The parsed value, the input itself if it was already datetime-like,
        or `pd.NaT` when the value is missing or matches none of `formats`.
    """
    # Already parsed (pd.Timestamp is a datetime subclass): return unchanged so
    # that re-running the conversion cell is harmless.
    if isinstance(date_str, datetime):
        return date_str
    # None, NaN and any other non-string value cannot be handed to strptime
    # (it would raise TypeError), so they are reported as missing.
    if not isinstance(date_str, str):
        return pd.NaT
    # Tolerate stray whitespace around an otherwise valid value.
    text = date_str.strip()
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    return pd.NaT  # No format matched


# Function to parse mixed date-time formats for day-first formats
def parse_mixed_date_format_date(date_str):
    """Parse a day-first date-time string ('dd/mm/YYYY HH:MM' or 'dd-mm-YYYY HH:MM').

    Returns the parsed datetime, the input unchanged if it is already
    datetime-like, or `pd.NaT` if it is missing or unparseable.
    """
    return _parse_with_formats(date_str, _DAY_FIRST_FORMATS)


# Function to parse mixed date-time formats for month-first formats
def parse_mixed_date_format_month(date_str):
    """Parse a month-first date-time string ('mm/dd/YYYY HH:MM' or 'mm-dd-YYYY HH:MM').

    Returns the parsed datetime, the input unchanged if it is already
    datetime-like, or `pd.NaT` if it is missing or unparseable.
    """
    return _parse_with_formats(date_str, _MONTH_FIRST_FORMATS)


In [10]:
# Convert the date columns from object (string) to datetime.
# date_of_occurrence goes through the month-first parser; the other three
# columns go through the day-first parser.
column_parsers = {
    'date_reported': parse_mixed_date_format_date,
    'date_of_occurrence': parse_mixed_date_format_month,
    'time_of_occurrence': parse_mixed_date_format_date,
    'date_case_closed': parse_mixed_date_format_date,
}
for column_name, parse_value in column_parsers.items():
    data[column_name] = data[column_name].apply(parse_value)

In [11]:
# Re-check dtypes and non-null counts after converting the date columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40160 entries, 1 to 40160
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date_reported       40160 non-null  datetime64[ns]
 1   date_of_occurrence  40160 non-null  datetime64[ns]
 2   time_of_occurrence  40160 non-null  datetime64[ns]
 3   city                40160 non-null  object        
 4   crime_code          40160 non-null  int64         
 5   crime_description   40160 non-null  object        
 6   victim_age          40160 non-null  int64         
 7   victim_gender       40160 non-null  object        
 8   weapon_used         34370 non-null  object        
 9   crime_domain        40160 non-null  object        
 10  police_deployed     40160 non-null  int64         
 11  case_closed         40160 non-null  object        
 12  date_case_closed    20062 non-null  datetime64[ns]
dtypes: datetime64[ns](4), int64(3), object(6)
memory us

In [12]:
# Count the missing values per column again after the date conversion.
print("Null Values : \n")
data.isna().sum()

Null Values : 



date_reported             0
date_of_occurrence        0
time_of_occurrence        0
city                      0
crime_code                0
crime_description         0
victim_age                0
victim_gender             0
weapon_used            5790
crime_domain              0
police_deployed           0
case_closed               0
date_case_closed      20098
dtype: int64

In [13]:
# Spot-check a few parsed rows; the fixed seed makes the preview reproducible
# across Restart & Run All.
data.sample(3, random_state=42)

Unnamed: 0_level_0,date_reported,date_of_occurrence,time_of_occurrence,city,crime_code,crime_description,victim_age,victim_gender,weapon_used,crime_domain,police_deployed,case_closed,date_case_closed
report_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
14615,2021-09-02 05:00:00,2021-08-31 22:00:00,2021-09-01 12:28:00,Mumbai,483,BURGLARY,50,F,Firearm,Other Crime,6,No,NaT
4808,2020-07-20 00:00:00,2020-07-19 07:00:00,2020-07-19 22:11:00,Varanasi,467,BURGLARY,12,F,Blunt Object,Other Crime,14,No,NaT
35930,2024-02-06 17:00:00,2024-02-06 01:00:00,2024-02-06 22:21:00,Kolkata,323,DOMESTIC VIOLENCE,63,F,,Violent Crime,8,No,NaT


In [14]:
# Replace the remaining missing values with explicit placeholders.

# NOTE(review): 'Weapon' is an unusual label for a missing weapon value;
# confirm it is the intended category name before relying on it downstream.
missing_weapon_label = 'Weapon'

# Cases with no closing date get a far-future sentinel timestamp.
# NOTE(review): any duration computed from date_case_closed will be spuriously
# large for these rows — presumably downstream code filters on case_closed; verify.
open_case_sentinel = pd.to_datetime('2050-01-01 00:00:00')

data['weapon_used'] = data['weapon_used'].fillna(missing_weapon_label)
data['date_case_closed'] = data['date_case_closed'].fillna(open_case_sentinel)

In [15]:
# Count the missing values per column after filling the placeholders.
data.isna().sum()

date_reported         0
date_of_occurrence    0
time_of_occurrence    0
city                  0
crime_code            0
crime_description     0
victim_age            0
victim_gender         0
weapon_used           0
crime_domain          0
police_deployed       0
case_closed           0
date_case_closed      0
dtype: int64

In [16]:
# Remove rows whose column values are all identical to an earlier row.
# NOTE(review): report_number is the index at this point and is not part of the
# comparison, so reports that differ only by number would be collapsed — confirm intended.
data = data.drop_duplicates()

In [17]:
# Save the cleaned frame as a pickle file next to the notebook.
data.to_pickle("./crime_dataset.pkl")