In [1]:
%matplotlib notebook

In [2]:
# Dependencies.
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

In [3]:
# Locate the input csv files; every one of them sits in the same resources folder.
resources_dir = Path("Resources_2022")
csv_path_5 = resources_dir / "NIBRS_ARRESTEE.csv"
csv_path_14 = resources_dir / "NIBRS_ETHNICITY.csv"
csv_path_17 = resources_dir / "NIBRS_LOCATION_TYPE.csv"
csv_path_18 = resources_dir / "NIBRS_OFFENDER.csv"
csv_path_19 = resources_dir / "NIBRS_OFFENSE.csv"
csv_path_20 = resources_dir / "NIBRS_OFFENSE_TYPE.csv"
csv_path_38 = resources_dir / "REF_RACE.csv"

In [4]:
# Load each csv file into its own DataFrame (same order as the paths were declared).
csv_5, csv_14, csv_17, csv_18, csv_19, csv_20, csv_38 = map(
    pd.read_csv,
    (
        csv_path_5,
        csv_path_14,
        csv_path_17,
        csv_path_18,
        csv_path_19,
        csv_path_20,
        csv_path_38,
    ),
)

In [5]:
# Merge csv files for cross-reference.
# Step 1 (outer): combine csv_5 and csv_19 on incident_id, keeping incidents
# that appear in either table.
# Step 2 (inner): keep only the incidents that also appear in csv_18.
# Column names shared by both sides of a merge receive pandas' default
# _x / _y suffixes, which the later cells refer to.
Incident_ID = csv_5.merge(csv_19, how="outer", on="incident_id")
Incident_ID_2 = Incident_ID.merge(csv_18, how="inner", on="incident_id")



In [7]:
# Keep only the two columns needed to translate an offense code into its category name.
offense_key_columns = ["offense_code", "offense_category_name"]
csv_20 = csv_20.loc[:, offense_key_columns]

In [8]:
# Lookup helper: offense code -> offense category name(s).
def get_offense(ID):
    """Return the offense category name(s) recorded for an offense code.

    Args:
        ID: Value compared against the ``offense_code`` column of ``csv_20``.

    Returns:
        pandas.Series: The ``offense_category_name`` entries of every row whose
        ``offense_code`` equals ``ID``; empty when no row matches.
    """
    code_matches = csv_20["offense_code"] == ID
    return csv_20.loc[code_matches, "offense_category_name"]

In [10]:
# Replace Offense Type ID's with Offense Category descriptions.
# Build the code -> category lookup once and translate the whole column in a
# single vectorized pass. Looking every row up inside iterrows() re-scans
# csv_20 for each row and writes one cell at a time, which is very slow on
# large frames.
# keep="first" mirrors the first-match behaviour of get_offense(...).values[0].
offense_lookup = (
    csv_20.drop_duplicates(subset="offense_code", keep="first")
    .set_index("offense_code")["offense_category_name"]
)
offense_codes = Incident_ID_2["offense_code_y"]

# Fail fast on codes that have no entry in the key table (the row-wise version
# also stopped here, but with an opaque IndexError from an empty lookup).
unknown_offense_codes = offense_codes[
    ~offense_codes.isin(offense_lookup.index)
].unique()
if len(unknown_offense_codes) > 0:
    raise ValueError(
        "offense_code_y contains values with no match in csv_20: "
        f"{unknown_offense_codes.tolist()}"
    )

Incident_ID_2["offense_type"] = offense_codes.map(offense_lookup)

In [11]:
# Keep only the two columns needed to translate a location id into its name.
location_key_columns = ["location_id", "location_name"]
csv_17 = csv_17.loc[:, location_key_columns]

In [12]:
# Lookup helper: location id -> location name(s).
def get_location(ID):
    """Return the location name(s) recorded for a location id.

    Args:
        ID: Value compared against the ``location_id`` column of ``csv_17``.

    Returns:
        pandas.Series: The ``location_name`` entries of every row whose
        ``location_id`` equals ``ID``; empty when no row matches.
    """
    id_matches = csv_17["location_id"] == ID
    return csv_17.loc[id_matches, "location_name"]

In [13]:
# Replace Location ID's with Location Name descriptions.
# Build the id -> name lookup once and translate the whole column in a single
# vectorized pass. Looking every row up inside iterrows() re-scans csv_17 for
# each row and writes one cell at a time, which is very slow on large frames.
# keep="first" mirrors the first-match behaviour of get_location(...).values[0].
location_lookup = (
    csv_17.drop_duplicates(subset="location_id", keep="first")
    .set_index("location_id")["location_name"]
)
location_ids = Incident_ID_2["location_id"]

# Fail fast on ids that have no entry in the key table (the row-wise version
# also stopped here, but with an opaque IndexError from an empty lookup).
unknown_location_ids = location_ids[
    ~location_ids.isin(location_lookup.index)
].unique()
if len(unknown_location_ids) > 0:
    raise ValueError(
        "location_id contains values with no match in csv_17: "
        f"{unknown_location_ids.tolist()}"
    )

Incident_ID_2["location_name"] = location_ids.map(location_lookup)

In [15]:
# Fill missing race ids with 0 (0 is taken to mean Unknown in the Race_Desc key).
# NOTE(review): the displayed sample output shows race_id_y values such as 10;
# confirm that 0 really exists in REF_RACE.csv for this data year.
race_ids = Incident_ID_2["race_id_y"]
Incident_ID_2["race_id_y"] = race_ids.fillna(value=0)

In [16]:
# Cast the race ids to 64-bit integers so they can be matched against the key table later.
Incident_ID_2["race_id_y"] = Incident_ID_2["race_id_y"].astype("int64")

In [17]:
# Fill missing (NaN) and empty-string ethnicity ids with 3
# (3 is taken to mean Unknown in the Ethnicity_Name key).
# NOTE(review): the displayed sample output shows ethnicity_id_y values such as
# 20; confirm that 3 really exists in NIBRS_ETHNICITY.csv for this data year,
# otherwise the filled rows cannot be translated later.
Incident_ID_2["ethnicity_id_y"] = (
    Incident_ID_2["ethnicity_id_y"]
    .fillna(3)
    .replace({"": 3})
)

In [18]:
# Cast the ethnicity ids to 64-bit integers so they can be matched against the key table later.
Incident_ID_2["ethnicity_id_y"] = Incident_ID_2["ethnicity_id_y"].astype("int64")

In [19]:
# Keep only the two columns needed to translate a race id into its description.
race_key_columns = ["race_id", "race_desc"]
csv_38 = csv_38.loc[:, race_key_columns]

In [20]:
# Lookup helper: race id -> race description(s).
def get_race(ID):
    """Return the race description(s) recorded for a race id.

    Args:
        ID: Value compared against the ``race_id`` column of ``csv_38``.

    Returns:
        pandas.Series: The ``race_desc`` entries of every row whose
        ``race_id`` equals ``ID``; empty when no row matches.
    """
    id_matches = csv_38["race_id"] == ID
    return csv_38.loc[id_matches, "race_desc"]

In [21]:
# Replace Race ID's with Race descriptions.
# Translate the whole column through a race_id -> race_desc lookup in one pass.
# The previous row-by-row loop sat inside a bare try/except: the first id
# without a match silently ended the loop and left every later row blank,
# including rows whose id was perfectly valid.
# keep="first" mirrors the first-match behaviour of get_race(...).values[0].
race_lookup = (
    csv_38.drop_duplicates(subset="race_id", keep="first")
    .set_index("race_id")["race_desc"]
)
race_id_values = Incident_ID_2["race_id_y"]
race_has_match = race_id_values.isin(race_lookup.index)

# Only ids that are genuinely absent from the key table keep the "" placeholder.
Incident_ID_2["race"] = race_id_values.map(race_lookup).where(race_has_match, "")

# Surface unmatched ids instead of hiding them.
race_unmatched_count = int((~race_has_match).sum())
if race_unmatched_count:
    print(
        f"{race_unmatched_count} row(s) have a race_id_y with no match in csv_38; "
        "race left blank for those rows."
    )

In [22]:
# Keep only the two columns needed to translate an ethnicity id into its name.
ethnicity_key_columns = ["ethnicity_id", "ethnicity_name"]
csv_14 = csv_14.loc[:, ethnicity_key_columns]

In [23]:
# Lookup helper: ethnicity id -> ethnicity name(s).
def get_ethnicity(ID):
    """Return the ethnicity name(s) recorded for an ethnicity id.

    Args:
        ID: Value compared against the ``ethnicity_id`` column of ``csv_14``.

    Returns:
        pandas.Series: The ``ethnicity_name`` entries of every row whose
        ``ethnicity_id`` equals ``ID``; empty when no row matches.
    """
    id_matches = csv_14["ethnicity_id"] == ID
    return csv_14.loc[id_matches, "ethnicity_name"]

In [24]:
# Replace Ethnicity ID's with Ethnicity descriptions.
# Translate the whole column through an ethnicity_id -> ethnicity_name lookup
# in one pass. The previous row-by-row loop sat inside a bare try/except: the
# first id without a match silently ended the loop and left every later row
# blank, including rows whose id was perfectly valid.
# keep="first" mirrors the first-match behaviour of get_ethnicity(...).values[0].
ethnicity_lookup = (
    csv_14.drop_duplicates(subset="ethnicity_id", keep="first")
    .set_index("ethnicity_id")["ethnicity_name"]
)
ethnicity_id_values = Incident_ID_2["ethnicity_id_y"]
ethnicity_has_match = ethnicity_id_values.isin(ethnicity_lookup.index)

# Only ids that are genuinely absent from the key table keep the "" placeholder.
Incident_ID_2["ethnicity"] = ethnicity_id_values.map(ethnicity_lookup).where(
    ethnicity_has_match, ""
)

# Surface unmatched ids instead of hiding them.
ethnicity_unmatched_count = int((~ethnicity_has_match).sum())
if ethnicity_unmatched_count:
    print(
        f"{ethnicity_unmatched_count} row(s) have an ethnicity_id_y with no match "
        "in csv_14; ethnicity left blank for those rows."
    )

In [26]:
# Drop the columns that are not needed in the exported reference table.
# NOTE(review): the variable is named Crime_Data_2020, but the inputs are read
# from Resources_2022 and the export is written to Data_Ref_2022.csv; consider
# renaming it everywhere it is used.
columns_to_drop = [
    "data_year_x",
    "arrestee_id",
    "arrestee_seq_num",
    "arrest_type_id",
    "multiple_indicator",
    "offense_code_x",
    "age_id_x",
    "age_num_x",
    "sex_code_x",
    "race_id_x",
    "ethnicity_id_x",
    "resident_code",
    "under_18_disposition_code",
    "clearance_ind",
    "age_range_low_num_x",
    "age_range_high_num_x",
    "data_year_y",
    "attempt_complete_flag",
    "num_premises_entered",
    "method_entry_code",
    "data_year",
    "offender_seq_num",
    "age_id_y",
    "age_num_y",
    "age_range_low_num_y",
    "age_range_high_num_y",
]
Crime_Data_2020 = Incident_ID_2.drop(columns=columns_to_drop)

# Preview the first rows of the trimmed table.
Crime_Data_2020.head()

Unnamed: 0,incident_id,arrest_date,offense_id,offense_code_y,location_id,offender_id,sex_code_y,race_id_y,ethnicity_id_y,offense_type,location_name,race,ethnicity
0,147462560,2022-01-10,176908831,280,35,167494978,M,10,20,Stolen Property Offenses,Residence/Home,White,Not Hispanic or Latino
1,159675946,2022-03-19,190927721,220,32,181318718,M,10,20,Burglary/Breaking & Entering,Park/Playground,White,Not Hispanic or Latino
2,159675946,2022-03-19,190927721,220,32,181318721,M,10,20,Burglary/Breaking & Entering,Park/Playground,White,Not Hispanic or Latino
3,159675946,2022-06-21,190927721,220,32,181318718,M,10,20,Burglary/Breaking & Entering,Park/Playground,White,Not Hispanic or Latino
4,159675946,2022-06-21,190927721,220,32,181318721,M,10,20,Burglary/Breaking & Entering,Park/Playground,White,Not Hispanic or Latino


In [27]:
# Write the trimmed table to disk without the DataFrame index column.
output_csv_name = "Data_Ref_2022.csv"
Crime_Data_2020.to_csv(output_csv_name, index=False)