In [1]:
# IPython magic: select matplotlib's interactive "notebook" backend for
# figures rendered by this notebook.
%matplotlib notebook

In [2]:
# Dependencies.
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np

In [3]:
# Establish file paths for csv files.
# Every input is read from the Resources_2015/ directory, given relative to
# the working directory the notebook is run from. The numeric suffix of each
# csv_path_N variable is reused by the csv_N DataFrame loaded from that file.
csv_path_5 = Path("Resources_2015/nibrs_arrestee.csv")
csv_path_14 = Path("Resources_2015/nibrs_ethnicity.csv")
csv_path_17 = Path("Resources_2015/nibrs_location_type.csv")
csv_path_18 = Path("Resources_2015/nibrs_offender.csv")
csv_path_19 = Path("Resources_2015/nibrs_offense.csv")
csv_path_20 = Path("Resources_2015/nibrs_offense_type.csv")
csv_path_38 = Path("Resources_2015/ref_race.csv")

In [4]:
# Load each csv file into its own DataFrame (csv_N is read from csv_path_N).
csv_5 = pd.read_csv(filepath_or_buffer=csv_path_5)    # nibrs_arrestee.csv
csv_14 = pd.read_csv(filepath_or_buffer=csv_path_14)  # nibrs_ethnicity.csv
csv_17 = pd.read_csv(filepath_or_buffer=csv_path_17)  # nibrs_location_type.csv
csv_18 = pd.read_csv(filepath_or_buffer=csv_path_18)  # nibrs_offender.csv
csv_19 = pd.read_csv(filepath_or_buffer=csv_path_19)  # nibrs_offense.csv
csv_20 = pd.read_csv(filepath_or_buffer=csv_path_20)  # nibrs_offense_type.csv
csv_38 = pd.read_csv(filepath_or_buffer=csv_path_38)  # ref_race.csv

In [5]:
# Cross-reference the tables on their shared "incident_id" column.
# Step 1: outer-join arrestee rows (csv_5) with offense rows (csv_19), so an
# incident that appears in only one of the two tables is still kept.
# Columns present in both inputs get pandas' default "_x" (left) / "_y"
# (right) suffixes; later cells refer to columns by those suffixed names.
Incident_ID = csv_5.merge(csv_19, how="outer", on="incident_id")
# Step 2: inner-join with offender rows (csv_18); incidents that have no
# offender row are dropped by this join.
# NOTE(review): both joins are on incident_id only, so an incident with several
# arrestees/offenses/offenders yields one row per combination -- confirm that
# this row multiplication is intended for the downstream counts.
Incident_ID_2 = Incident_ID.merge(csv_18, how="inner", on="incident_id")



In [6]:
# Trim the offense-type table down to the two columns used as an
# ID -> category-name key.
csv_20 = csv_20.loc[:, ["offense_type_id", "offense_category_name"]]

In [7]:
# Lookup helper for Offense Categories.
def get_offense(ID):
    """Return the offense category name(s) recorded for an offense type ID.

    Args:
        ID: Value compared against the "offense_type_id" column of csv_20.

    Returns:
        pandas Series holding the "offense_category_name" of every csv_20 row
        whose "offense_type_id" equals ID; the Series is empty when no row
        matches.
    """
    is_match = csv_20["offense_type_id"].eq(ID)
    return csv_20.loc[is_match, "offense_category_name"]

In [8]:
# Replace Offense Type ID's with Offense Category descriptions.
# Build the ID -> category lookup once and apply it to the whole column with
# Series.map, instead of filtering csv_20 and writing a single cell per row
# inside an iterrows() loop. drop_duplicates keeps the first row per ID, which
# is the entry the row-by-row lookup selected with .values[0].
offense_lookup = (
    csv_20.drop_duplicates(subset="offense_type_id")
    .set_index("offense_type_id")["offense_category_name"]
)
# An ID that is missing from the key (or is NaN) yields NaN in "offense_type"
# rather than aborting the cell with an IndexError.
Incident_ID_2["offense_type"] = Incident_ID_2["offense_type_id_y"].map(offense_lookup)

In [9]:
# Trim the location-type table down to the two columns used as an
# ID -> location-name key.
csv_17 = csv_17.loc[:, ["location_id", "location_name"]]

In [10]:
# Lookup helper for Location Categories.
def get_location(ID):
    """Return the location name(s) recorded for a location ID.

    Args:
        ID: Value compared against the "location_id" column of csv_17.

    Returns:
        pandas Series holding the "location_name" of every csv_17 row whose
        "location_id" equals ID; the Series is empty when no row matches.
    """
    is_match = csv_17["location_id"].eq(ID)
    return csv_17.loc[is_match, "location_name"]

In [11]:
# Replace Location ID's with Location Name descriptions.
# Build the ID -> name lookup once and apply it to the whole column with
# Series.map, instead of filtering csv_17 and writing a single cell per row
# inside an iterrows() loop. drop_duplicates keeps the first row per ID, which
# is the entry the row-by-row lookup selected with .values[0].
location_lookup = (
    csv_17.drop_duplicates(subset="location_id")
    .set_index("location_id")["location_name"]
)
# An ID that is missing from the key (or is NaN) yields NaN in "location_name"
# rather than aborting the cell with an IndexError.
Incident_ID_2["location_name"] = Incident_ID_2["location_id"].map(location_lookup)

In [12]:
# Missing race IDs in "race_id_y" are recoded to 0, the ID that stands for
# "Unknown" in the Race_Desc key.
Incident_ID_2["race_id_y"] = Incident_ID_2["race_id_y"].fillna(value=0)

In [13]:
# Cast "race_id_y" to int64 so the IDs are integers for the later
# ID -> description conversion.
Incident_ID_2 = Incident_ID_2.astype({"race_id_y": np.int64})

In [14]:
# Missing (NaN) and empty-string ethnicity IDs in "ethnicity_id_y" are recoded
# to 3, the ID that stands for "Unknown" in the Ethnicity_Name key.
Incident_ID_2["ethnicity_id_y"] = (
    Incident_ID_2["ethnicity_id_y"]
    .fillna(value=3)
    .replace(to_replace={"": 3})
)

In [15]:
# Cast "ethnicity_id_y" to int64 so the IDs are integers for the later
# ID -> description conversion.
Incident_ID_2 = Incident_ID_2.astype({"ethnicity_id_y": np.int64})

In [16]:
# Trim the race reference table down to the two columns used as an
# ID -> description key.
csv_38 = csv_38.loc[:, ["race_id", "race_desc"]]

In [17]:
# Lookup helper for Race Categories.
def get_race(ID):
    """Return the race description(s) recorded for a race ID.

    Args:
        ID: Value compared against the "race_id" column of csv_38.

    Returns:
        pandas Series holding the "race_desc" of every csv_38 row whose
        "race_id" equals ID; the Series is empty when no row matches.
    """
    is_match = csv_38["race_id"].eq(ID)
    return csv_38.loc[is_match, "race_desc"]

In [18]:
# Replace Race ID's with Race descriptions.
# Build the ID -> description lookup once and apply it to the whole column
# with Series.map. The previous row-by-row loop sat inside a bare `except:`,
# so the first ID without a match silently left that row and every row after
# it blank (and any other error, even an interrupt, was swallowed as well).
# drop_duplicates keeps the first row per ID, which is the entry the
# row-by-row lookup selected with .values[0].
race_lookup = (
    csv_38.drop_duplicates(subset="race_id")
    .set_index("race_id")["race_desc"]
)
# Only IDs that are genuinely absent from the key end up as NaN in "race";
# every other row still receives its description.
Incident_ID_2["race"] = Incident_ID_2["race_id_y"].map(race_lookup)

In [19]:
# Trim the ethnicity table down to the two columns used as an
# ID -> name key.
csv_14 = csv_14.loc[:, ["ethnicity_id", "ethnicity_name"]]

In [20]:
# Lookup helper for Ethnicity descriptions.
def get_ethnicity(ID):
    """Return the ethnicity name(s) recorded for an ethnicity ID.

    Args:
        ID: Value compared against the "ethnicity_id" column of csv_14.

    Returns:
        pandas Series holding the "ethnicity_name" of every csv_14 row whose
        "ethnicity_id" equals ID; the Series is empty when no row matches.
    """
    is_match = csv_14["ethnicity_id"].eq(ID)
    return csv_14.loc[is_match, "ethnicity_name"]

In [21]:
# Replace Ethnicity ID's with Ethnicity descriptions.
# Build the ID -> name lookup once and apply it to the whole column with
# Series.map. The previous row-by-row loop sat inside a bare `except:`, so the
# first ID without a match silently left that row and every row after it blank
# (and any other error, even an interrupt, was swallowed as well).
# drop_duplicates keeps the first row per ID, which is the entry the
# row-by-row lookup selected with .values[0].
ethnicity_lookup = (
    csv_14.drop_duplicates(subset="ethnicity_id")
    .set_index("ethnicity_id")["ethnicity_name"]
)
# Only IDs that are genuinely absent from the key end up as NaN in
# "ethnicity"; every other row still receives its name.
Incident_ID_2["ethnicity"] = Incident_ID_2["ethnicity_id_y"].map(ethnicity_lookup)

In [26]:
# Build the final table by dropping every column that is not part of the
# exported data set. Names ending in "_x"/"_y" are the suffixed duplicates
# produced by the merges on incident_id.
Crime_Data_2015 = Incident_ID_2.drop(
    columns=[
        "arrestee_id",
        "arrestee_seq_num",
        "arrest_num",
        "arrest_date",
        "arrest_type_id",
        "multiple_indicator",
        "offense_type_id_x",
        "age_id_x",
        "age_num_x",
        "sex_code_x",
        "race_id_x",
        "ethnicity_id_x",
        "resident_code",
        "under_18_disposition_code",
        "clearance_ind",
        "ff_line_number_x",
        "age_range_low_num_x",
        "age_range_high_num_x",
        "attempt_complete_flag",
        "num_premises_entered",
        "method_entry_code",
        "ff_line_number_y",
        "offender_seq_num",
        "age_id_y",
        "age_num_y",
        "ff_line_number",
        "age_range_low_num_y",
        "age_range_high_num_y",
    ]
)

# Preview the first rows of the cleaned table.
Crime_Data_2015.head()

Unnamed: 0,incident_id,offense_id,offense_type_id_y,location_id,offender_id,sex_code_y,race_id_y,ethnicity_id_y,offense_type,location_name,race,ethnicity
0,80535041,86175036,51,18,90789928,M,1,2,Assault Offenses,Parking Lot/Garage,White,Not Hispanic or Latino
1,80535041,86175037,5,18,90789928,M,1,2,Destruction/Damage/Vandalism of Property,Parking Lot/Garage,White,Not Hispanic or Latino
2,83435754,91099609,5,18,94045222,M,1,2,Destruction/Damage/Vandalism of Property,Parking Lot/Garage,White,Not Hispanic or Latino
3,83045572,90650976,51,40,93589141,M,1,2,Assault Offenses,School-College/University,White,Not Hispanic or Latino
4,83044664,90650978,16,40,93589145,M,1,2,Drug/Narcotic Offenses,School-College/University,White,Not Hispanic or Latino


In [28]:
# Write the cleaned table to disk; index=False keeps the DataFrame's row index
# out of the exported file.
Crime_Data_2015.to_csv(path_or_buf="Data_Ref_2015.csv", index=False)