# Notebook Goals:
- [] Remove major errors, duplicates, and outliers
- [x] Remove unwanted data field(s)
    * Removed all irrelevant field(s).
- [x] Bring structure
    * Parsed the Date into two columns.
- [x] Handle missing data
    * Filled all NA values with zeroes (0).

In [165]:
import pandas as pd
import numpy as np

In [166]:
# Optimize loading the DataFrame into memory by only requesting the columns that are relevant towards the problem statement.
requested_columns = ["INCIDENT_DATE", "INCIDENT_NUMBER", "LOCATION_DISTRICT", "OFFENSE_DESCRIPTION", "WEAPON_TYPE", "INCIDENT_LOCATION", "ZIP", "LATITUDE", "LONGITUDE"]

lrpd = pd.read_csv("../data/lrpd.csv", usecols=requested_columns)

In [167]:
# Drop all duplicate rows!
prev_shape = lrpd.shape
lrpd = lrpd.drop_duplicates(subset=["INCIDENT_NUMBER"], keep='last')
new_shape = lrpd.shape

In [168]:
diff = prev_shape[0] - new_shape[0]
print(f'Dropped {diff} duplicates!')

Dropped 6369 duplicates!


In [169]:
lrpd.set_index("INCIDENT_NUMBER", inplace=True)

In [170]:
# Fill all the Pandas-handled Missing Data with zeroes (0).
lrpd = lrpd.fillna(0)

In [171]:
lrpd["WEAPON_TYPE"] = lrpd["WEAPON_TYPE"].replace(0, "NO WEAPON")
lrpd["WEAPON_TYPE"] = lrpd["WEAPON_TYPE"].replace("1", "NO WEAPON")

In [172]:
lrpd.loc[lrpd["WEAPON_TYPE"] == "1"]

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1


In [173]:
lrpd.head()

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-092971,08/04/2021 10:21:00 AM,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,15601 KANIS RD,72204.0,0.0,0.0
2017-029450,03/16/2017 06:30:00 PM,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,10801 IRONTON CUTOFF RD,72206.0,34.649362,-92.301959
2017-156453,09/01/2017 12:00:00 PM,53.0,RAPE,NO WEAPON,0,0.0,0.0,0.0
2019-130717,10/17/2019 09:00:00 PM,51.0,RAPE,NO WEAPON,0,0.0,0.0,0.0
2020-073313,07/02/2020 07:10:00 PM,50.0,RAPE,NO WEAPON,0,0.0,0.0,0.0


In [174]:
# Splits the incident_date string into a 3-element tuple.
# [0] -> MM/DD/YYYY
# [1] -> HH:MM:SS
# [2] -> AM/PM
def split_incident_date(incident_date):
    split = incident_date.split(' ')
    return split[0], split[1], split[2]

# Parses the split incident into a Date format (MM/DD/YYYY).
def get_incident_date(incident_date):
    date = split_incident_date(incident_date)
    return date[0]

# Parses the incident into a Time format (HH:MM AM/PM).
def get_incident_time(incident_date):
    time = split_incident_date(incident_date)
    hms = time[1].split(':')
    return hms[0] + ':' + hms[1] + ' ' + time[2]

In [175]:
# Replace INCIDENT_DATE column with more concise columns.
temp_date = lrpd["INCIDENT_DATE"]
lrpd["INCIDENT_DATE"] = temp_date.apply(get_incident_date)
lrpd["INCIDENT_TIME"] = temp_date.apply(get_incident_time)

In [176]:
lrpd.head()

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE,INCIDENT_TIME
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-092971,08/04/2021,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,15601 KANIS RD,72204.0,0.0,0.0,10:21 AM
2017-029450,03/16/2017,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,10801 IRONTON CUTOFF RD,72206.0,34.649362,-92.301959,06:30 PM
2017-156453,09/01/2017,53.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,12:00 PM
2019-130717,10/17/2019,51.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,09:00 PM
2020-073313,07/02/2020,50.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,07:10 PM


In [177]:
# Handle outliers.
# Reference: https://hersanyagci.medium.com/detecting-and-handling-outliers-with-pandas-7adbfcd5cad8
lrpd.describe()

Unnamed: 0,LOCATION_DISTRICT,ZIP,LATITUDE,LONGITUDE
count,81833.0,81833.0,81833.0,81833.0
mean,65.955446,71271.371867,33.596511,-89.350141
std,15.510309,8160.766312,6.157083,16.374495
min,0.0,0.0,0.0,-92.545466
25%,54.0,72204.0,34.689606,-92.387244
50%,63.0,72206.0,34.732894,-92.346238
75%,81.0,72209.0,34.752135,-92.314721
max,93.0,72227.0,34.881691,0.0


In [178]:
Q1 = lrpd.quantile(0.25)
Q3 = lrpd.quantile(0.75)
IQR = Q3 - Q1

In [179]:
lower_limit = Q1 - 1.5 * IQR
upper_limit = Q3 + 1.5 * IQR

In [180]:
# Drop the outliers.
outliers_15_low = (lrpd < lower_limit)
outliers_15_up = (lrpd > upper_limit)

  outliers_15_low = (lrpd < lower_limit)
  outliers_15_up = (lrpd > upper_limit)


In [181]:
# risk assessment
all_crimes = lrpd["OFFENSE_DESCRIPTION"].unique()
all_weapons = lrpd["WEAPON_TYPE"].unique()

In [182]:
all_crimes

array(['THEFT FROM MOTOR VEHICLE', 'THEFT OF MOTOR VEHICLE PARTS', 'RAPE',
       'ALL OTHER LARCENY', 'SHOPLIFTING', 'ROBBERY', 'BURGLARY/B&E',
       'THEFT FROM BUILDING', 'AGGRAVATED ASSAULT', 'MOTOR VEHICLE THEFT',
       'PURSE-SNATCHING', 'MURDER & NONNEGLIGENT MANSLAUGHTER',
       'POCKET-PICKING', 'THEFT FROM COIN-OPERATED MACHINE'], dtype=object)

In [183]:
all_weapons

array(['NO WEAPON', 'PERSONAL WEAPONS (HANDS, FISTS, ETC)', 'UNKNOWN',
       'FIREARM', 'KNIFE/CUTTING INSTRUMENT', 'OTHER', 'BLUNT OBJECT',
       'HANDGUN', 'MOTOR VEHICLE', 'ASPHYXIATION', 'SHOTGUN',
       'FIRE/INCENDIARY DEVICE', 'RIFLE', 'OTHER FIREARM',
       'DRUGS/NARCOTICS', 'POISON', 'EXPLOSIVES'], dtype=object)

In [184]:
violent_crimes = ['RAPE', 'AGGRAVATED ASSAULT', 'ALL OTHER LARCENY', 'ROBBERY', 'BURGLARY/B&E',
                  'MURDER & NONNEGLIGENT MANSLAUGHTER']
nonviolent_crimes = ['THEFT FROM MOTOR VEHICLE', 'MOTOR VEHICLE THEFT', 'THEFT OF MOTOR VEHICLE PARTS',
                     'SHOPLIFTING', 'THEFT FROM BUILDING', 'POCKET-PICKING', 'THEFT FROM COIN-OPERATED MACHINE',
                     'PURSE-SNATCHING']

In [185]:
def determine_crime_type(crime):
    if crime in violent_crimes:
        return 'Violent Crime'
    elif crime in nonviolent_crimes:
        return 'Non-Violent Crime'
    else:
        return 'Crime Type Unknown'

In [186]:
lrpd["CRIME_TYPE"] = lrpd["OFFENSE_DESCRIPTION"].apply(determine_crime_type)

In [187]:
lrpd.loc[(lrpd['CRIME_TYPE'] == 'Violent Crime') | (
        lrpd['WEAPON_TYPE'] != 'NO WEAPON'), "RISK_TYPE"] = "High Risk"
lrpd.loc[(lrpd['CRIME_TYPE'] == 'Violent Crime') & (
        lrpd['WEAPON_TYPE'] != 'NO WEAPON'), "RISK_TYPE"] = "High Risk"
lrpd.loc[(lrpd['CRIME_TYPE'] == 'Non-Violent Crime') & (
        lrpd['WEAPON_TYPE'] != 'NO WEAPON'), "RISK_TYPE"] = "High Risk"
lrpd.loc[(lrpd['CRIME_TYPE'] == 'Non-Violent Crime') & (
        lrpd['WEAPON_TYPE'] == 'NO WEAPON'), "RISK_TYPE"] = "Low Risk"

In [188]:
lrpd[:20]

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE,INCIDENT_TIME,CRIME_TYPE,RISK_TYPE
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-092971,08/04/2021,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,15601 KANIS RD,72204.0,0.0,0.0,10:21 AM,Non-Violent Crime,Low Risk
2017-029450,03/16/2017,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,10801 IRONTON CUTOFF RD,72206.0,34.649362,-92.301959,06:30 PM,Non-Violent Crime,Low Risk
2017-156453,09/01/2017,53.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,12:00 PM,Violent Crime,High Risk
2019-130717,10/17/2019,51.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,09:00 PM,Violent Crime,High Risk
2020-073313,07/02/2020,50.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,07:10 PM,Violent Crime,High Risk
2020-103105,09/05/2020,40.0,RAPE,"PERSONAL WEAPONS (HANDS, FISTS, ETC)",0,0.0,0.0,0.0,11:29 PM,Violent Crime,High Risk
2020-127857,11/01/2020,54.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,12:09 AM,Violent Crime,High Risk
2021-002311,01/10/2021,71.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,10:08 PM,Violent Crime,High Risk
2021-113605,09/16/2021,92.0,RAPE,"PERSONAL WEAPONS (HANDS, FISTS, ETC)",0,0.0,0.0,0.0,02:37 PM,Violent Crime,High Risk
2022-014768,02/07/2022,80.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,10:58 AM,Violent Crime,High Risk


In [189]:
lrpd.query('WEAPON_TYPE == "NO WEAPON" & CRIME_TYPE == "Non-Violent Crime"').head(10)

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE,INCIDENT_TIME,CRIME_TYPE,RISK_TYPE
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-092971,08/04/2021,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,15601 KANIS RD,72204.0,0.0,0.0,10:21 AM,Non-Violent Crime,Low Risk
2017-029450,03/16/2017,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,10801 IRONTON CUTOFF RD,72206.0,34.649362,-92.301959,06:30 PM,Non-Violent Crime,Low Risk
2022-302467,09/03/2022,39.0,SHOPLIFTING,NO WEAPON,1321 MAIN STREET,72202.0,0.0,0.0,04:45 PM,Non-Violent Crime,Low Risk
2021-027276,03/17/2021,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,5010 OPAL ST,72209.0,34.673893,-92.334594,06:12 PM,Non-Violent Crime,Low Risk
2017-158478,12/17/2017,91.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,9911 INTERSTATE 30,72209.0,34.673161,-92.372574,08:00 PM,Non-Violent Crime,Low Risk
2017-122878,10/01/2017,90.0,THEFT FROM BUILDING,NO WEAPON,7500 S UNIVERSITY AVE,72209.0,34.679524,-92.353965,07:32 PM,Non-Violent Crime,Low Risk
2017-056757,05/15/2017,81.0,SHOPLIFTING,NO WEAPON,8824 GEYER SPRINGS RD,72209.0,34.669221,-92.344341,04:00 PM,Non-Violent Crime,Low Risk
2021-110722,09/10/2021,82.0,MOTOR VEHICLE THEFT,NO WEAPON,6100 MITCHELL DR,72209.0,34.67784,-92.345397,09:30 AM,Non-Violent Crime,Low Risk
2017-057898,05/17/2017,90.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,6600 CAROLINA DR,72209.0,34.69088,-92.360054,10:00 PM,Non-Violent Crime,Low Risk
2022-302817,10/16/2022,92.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,36 ANGEL COURT,72103.0,0.0,0.0,11:00 PM,Non-Violent Crime,Low Risk


In [190]:
CrimeTypeGroup = lrpd.groupby("CRIME_TYPE")

In [191]:
CrimeTypeGroup.size()

CRIME_TYPE
Non-Violent Crime    43598
Violent Crime        38235
dtype: int64

In [192]:
RiskTypeGroup = lrpd.groupby("RISK_TYPE")

In [193]:
RiskTypeGroup.size()

RISK_TYPE
High Risk    38435
Low Risk     43398
dtype: int64

In [194]:
lrpd['RISK_TYPE_NUM'] = lrpd.RISK_TYPE.map({'High Risk': 1, 'Low Risk': 0})

In [195]:
lrpd.head(10)

Unnamed: 0_level_0,INCIDENT_DATE,LOCATION_DISTRICT,OFFENSE_DESCRIPTION,WEAPON_TYPE,INCIDENT_LOCATION,ZIP,LATITUDE,LONGITUDE,INCIDENT_TIME,CRIME_TYPE,RISK_TYPE,RISK_TYPE_NUM
INCIDENT_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-092971,08/04/2021,72.0,THEFT FROM MOTOR VEHICLE,NO WEAPON,15601 KANIS RD,72204.0,0.0,0.0,10:21 AM,Non-Violent Crime,Low Risk,0
2017-029450,03/16/2017,82.0,THEFT OF MOTOR VEHICLE PARTS,NO WEAPON,10801 IRONTON CUTOFF RD,72206.0,34.649362,-92.301959,06:30 PM,Non-Violent Crime,Low Risk,0
2017-156453,09/01/2017,53.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,12:00 PM,Violent Crime,High Risk,1
2019-130717,10/17/2019,51.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,09:00 PM,Violent Crime,High Risk,1
2020-073313,07/02/2020,50.0,RAPE,NO WEAPON,0,0.0,0.0,0.0,07:10 PM,Violent Crime,High Risk,1
2020-103105,09/05/2020,40.0,RAPE,"PERSONAL WEAPONS (HANDS, FISTS, ETC)",0,0.0,0.0,0.0,11:29 PM,Violent Crime,High Risk,1
2020-127857,11/01/2020,54.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,12:09 AM,Violent Crime,High Risk,1
2021-002311,01/10/2021,71.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,10:08 PM,Violent Crime,High Risk,1
2021-113605,09/16/2021,92.0,RAPE,"PERSONAL WEAPONS (HANDS, FISTS, ETC)",0,0.0,0.0,0.0,02:37 PM,Violent Crime,High Risk,1
2022-014768,02/07/2022,80.0,RAPE,UNKNOWN,0,0.0,0.0,0.0,10:58 AM,Violent Crime,High Risk,1


In [196]:
# Save the cleaned dataset.
lrpd.to_csv("../data/lrpd-clean.csv")