## Test API / Data retrieval 

In [3]:
# import dependencies 
import requests 
import pandas as pd
import time 
from datetime import datetime
import numpy as np 
import config 

In [4]:
# API request for crime data 
data_request = requests.get("https://data.austintexas.gov/resource/fdj4-gpfu.json?$limit=100&$where=occ_date between '2018-01-01T00:00:00.000' and '2020-12-31T00:00:00.000'")
# status for API request
data_request.status_code

200

In [675]:
len(r.json())

310981

In [33]:
data_df = pd.DataFrame.from_records(data_request.json())

----------------

## Data 
Cleaning and transforming

In [34]:
data_df.occ_date

0     2018-01-01T00:00:00.000
1     2018-01-01T00:00:00.000
2     2018-01-01T00:00:00.000
3     2018-01-01T00:00:00.000
4     2018-01-01T00:00:00.000
               ...           
95    2018-01-01T00:00:00.000
96    2018-01-01T00:00:00.000
97    2018-01-01T00:00:00.000
98    2018-01-01T00:00:00.000
99    2018-01-01T00:00:00.000
Name: occ_date, Length: 100, dtype: object

In [35]:
# display all column 
[data_df.columns][0:5]

[Index(['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
        'occ_date_time', 'occ_date', 'occ_time', 'rep_date_time', 'rep_date',
        'rep_time', 'location_type', 'address', 'zip_code', 'council_district',
        'sector', 'district', 'pra', 'x_coordinate', 'y_coordinate', 'latitude',
        'longitude', 'location', 'census_tract', 'clearance_status',
        'clearance_date', 'ucr_category', 'category_description'],
       dtype='object')]

In [41]:
#create list of unneeded/redundant columns to drop 
drop_column = ['latitude', 'longitude', 'location',
                    'occ_date', 'occ_time','rep_date','rep_time','category_description', 'address',
                    'ucr_category']

In [42]:
# drop columns )
data_df.drop(drop_column, axis=1, inplace=True)

In [43]:
# review remaining columns data types
data_df.info()
# print remaining number of columns 
data_df.columns.size

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 17 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   incident_report_number  100 non-null    object
 1   crime_type              100 non-null    object
 2   ucr_code                100 non-null    object
 3   family_violence         100 non-null    object
 4   occ_date_time           100 non-null    object
 5   rep_date_time           100 non-null    object
 6   location_type           99 non-null     object
 7   zip_code                99 non-null     object
 8   council_district        99 non-null     object
 9   sector                  99 non-null     object
 10  district                99 non-null     object
 11  pra                     99 non-null     object
 12  x_coordinate            100 non-null    object
 13  y_coordinate            100 non-null    object
 14  census_tract            6 non-null      object
 15  clearan

17

In [44]:
# review data 
data_df.sample(5)

Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,rep_date_time,location_type,zip_code,council_district,sector,district,pra,x_coordinate,y_coordinate,census_tract,clearance_status,clearance_date
37,201811906,FAMILY DISTURBANCE,3400,N,2018-01-01T22:53:00.000,2018-01-01T22:53:00.000,RESIDENCE / HOME,78750,6,AD,6,827,3095390,3095390,,N,2018-01-14T00:00:00.000
59,20205029910,RAPE OF A CHILD,204,N,2018-01-01T13:31:00.000,2020-08-03T13:31:00.000,RESIDENCE / HOME,78751,9,BA,5,332,0,0,,N,2020-08-18T00:00:00.000
14,20185000085,BURGLARY OF VEHICLE,601,N,2018-01-01T17:00:00.000,2018-01-01T22:12:00.000,PARKING LOTS / GARAGE,78704,9,DA,1,439,3113141,3113141,,N,2018-01-03T00:00:00.000
9,201810847,CRIMINAL MISCHIEF,1400,N,2018-01-01T07:20:00.000,2018-01-01T07:20:00.000,PARKING LOTS / GARAGE,78745,2,FR,3,528,3103136,3103136,,N,2018-01-03T00:00:00.000
69,20188000028,BURGLARY OF VEHICLE,601,N,2018-01-01T03:00:00.000,2018-01-02T09:52:00.000,RESIDENCE / HOME,78741,3,HE,2,441,3120119,3120119,,N,2018-01-04T00:00:00.000


In [45]:
data_df.isnull().sum()

incident_report_number     0
crime_type                 0
ucr_code                   0
family_violence            0
occ_date_time              0
rep_date_time              0
location_type              1
zip_code                   1
council_district           1
sector                     1
district                   1
pra                        1
x_coordinate               0
y_coordinate               0
census_tract              94
clearance_status          11
clearance_date            11
dtype: int64

### missing values

In [566]:
stripped_crime_data.isnull().sum()

incident_report_number       0
crime_type                   0
ucr_code                     0
family_violence              0
occ_date_time                0
rep_date_time                0
location_type                1
address                      0
zip_code                   102
council_district           117
sector                      77
district                    80
pra                         90
ucr_category              6070
category_description      6070
location                   166
clearance_status          2622
clearance_date            2622
dtype: int64

In [570]:
round(len(stripped_crime_data.columns) * 0.30)

5

In [572]:
# remove rows with missing values
# test 30% threshold
stripped_crime_data.dropna(thresh=13 , axis=0).isnull().sum()

incident_report_number       0
crime_type                   0
ucr_code                     0
family_violence              0
occ_date_time                0
rep_date_time                0
location_type                1
address                      0
zip_code                     7
council_district            22
sector                       0
district                     0
pra                          0
ucr_category              5991
category_description      5991
location                    76
clearance_status          2583
clearance_date            2583
dtype: int64

In [None]:
#check if priority columns (location_type, offense type) contain NAN, these cannot contain Nan due to table relationships
[( ) & ()]

### datetime formats 

In [241]:
# test visualization of datetime format
import plotly.graph_objects as go
import numpy as np

fig = go.Figure(data=go.Scatter(x=stripped_crime_data['occ_date_time'][20:80], y=stripped_crime_data['ucr_code'][20:80]))
fig.write_html('viz_0.html')

In [335]:

#remove timestamp denotation
date_series = stripped_crime_data.occ_date_time.apply(lambda x: x.replace('T',' '))
date_series[0]

#convert from string to datetime format 
date_string = date_series[0]
date_string

date_object1 = datetime.strptime(date_string,"%Y-%m-%d %H:%M:%S.%f")
date_object1

# access date in from datetime 
date_object2 = date_object1.date()
date_object2

#access time from datetime 
date_object3 = date_object1.time()
date_object3


'2021-02-20 17:00:00.000'

'2021-02-20 17:00:00.000'

datetime.datetime(2021, 2, 20, 17, 0)

datetime.date(2021, 2, 20)

datetime.time(17, 0)

In [None]:
stripped_crime_data['occ_date_time'] = stripped_crime_data['occ_date_time'].apply(lambda x: x.replace('T',' '))
stripped_crime_data['rep_date_time'] = stripped_crime_data['rep_date_time'].apply(lambda x: x.replace('T',' '))
stripped_crime_data['clearance_date'] = stripped_crime_data['clearance_date'].apply(lambda x: x.replace('T',' '))

---------------------------

## Database  
creating dataframes to map database table relationships


### Offense table 

In [22]:
# offense_type table 
offense_type_df = crime_incident_data[['ucr_code','crime_type']].copy()
offense_type_df.drop_duplicates(subset='ucr_code', inplace=True)

In [23]:
# 189 unique offense codes
offense_type_df.head()
offense_type_df.shape

Unnamed: 0,ucr_code,crime_type
0,3400,FAMILY DISTURBANCE
1,2717,CRUELTY TO ANIMALS
2,300,AGG ROBBERY/DEADLY WEAPON
3,1803,POSSESSION OF MARIJUANA
4,4022,IDENTITY THEFT


(42, 2)

In [212]:
offense_type_df.isnull().values.any()

False

In [214]:
#rename test 
offense_type_df.rename(columns={'ucr_code':'offense_code','crime_type':'offense_type'})

Unnamed: 0,offense_code,offense_description
0,601,BURGLARY OF VEHICLE
1,502,BURGLARY NON RESIDENCE
2,1400,CRIMINAL MISCHIEF
3,905,RESISTING ARREST OR SEARCH
4,2716,CRIMINAL TRESPASS
...,...,...
9301,1716,FORCED SODOMY
9572,2718,THEFT BY EXTORTION
9616,907,TAMPERING WITH CONSUMER PROD
9721,2601,OBSCENITY


In [26]:
offense_type_df.copy().set_index('ucr_code', inplace=True)

In [28]:
offense_type_df.head()

Unnamed: 0,ucr_code,crime_type
0,3400,FAMILY DISTURBANCE
1,2717,CRUELTY TO ANIMALS
2,300,AGG ROBBERY/DEADLY WEAPON
3,1803,POSSESSION OF MARIJUANA
4,4022,IDENTITY THEFT


In [18]:
offense_type_df.set_index('ucr_code', inplace=True)

In [None]:
offense_type_df.to_dict()['crime_type']

## location table

In [337]:
stripped_crime_data.head()

Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,rep_date_time,location_type,address,zip_code,council_district,sector,district,pra,ucr_category,category_description,location,clearance_status,clearance_date
0,20215006549,BURGLARY OF VEHICLE,601,N,2021-02-20T17:00:00.000,2021-02-20T23:35:00.000,RESTAURANTS,6319 N IH 35 SVRD NB,78752,4,ID,1,280,23F,Theft,"{'latitude': '30.32530599', 'longitude': '-97....",,
1,20215006547,BURGLARY NON RESIDENCE,502,N,2021-02-20T14:47:00.000,2021-02-20T18:41:00.000,PARKING LOTS / GARAGE,13301 CENTER LAKE DR,78753,7,ED,6,206,220,Burglary,"{'latitude': '30.4088995', 'longitude': '-97.6...",,
2,2021510833,CRIMINAL MISCHIEF,1400,N,2021-02-20T14:41:00.000,2021-02-20T14:41:00.000,CONVENIENCE STORE,10111 N LAMAR BLVD,78753,4,ED,1,240,,,"{'latitude': '30.37078232', 'longitude': '-97....",,
3,2021510758,RESISTING ARREST OR SEARCH,905,N,2021-02-20T14:30:00.000,2021-02-20T16:30:00.000,STREETS / HWY / ROAD / ALLEY,310 E 6TH ST,78701,9,GE,2,381,,,"{'latitude': '30.26755605', 'longitude': '-97....",,
4,2021210510728,CRIMINAL TRESPASS,2716,N,2021-02-20T13:47:00.000,2021-02-20T13:47:00.000,CONVENIENCE STORE,10111 N LAMAR BLVD,78753,4,ED,1,240,,,"{'latitude': '30.37078232', 'longitude': '-97....",,


In [381]:
#43 unique location types out of 1000 rows (by default does not count null values)
stripped_crime_data.location_type.nunique()

43

In [529]:
# create location dataframe with only unique values  
location_df = stripped_crime_data['location_type'].copy()
location_df = location_df.to_frame()
 
location_df.drop_duplicates(inplace=True)

In [530]:
# check for null values, if any remove
location_null_list = location_df[location_df['location_type'].isnull()].index
# print ammount of null values
len(location_null_list)

#dro null value(s)
location_df.drop(location_null_list, axis=0,inplace=True)

1

In [531]:
#create numbered column to be used as unique identifier between tables 
location_df['location_code'] = np.arange(len(location_df))
# rearange dataframe 

In [532]:
location_df.info() 
print('\n')
location_df.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43 entries, 0 to 9981
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   location_type  43 non-null     object
 1   location_code  43 non-null     int32 
dtypes: int32(1), object(1)
memory usage: 860.0+ bytes




Unnamed: 0,location_type,location_code
0,RESTAURANTS,0
1,PARKING LOTS / GARAGE,1
2,CONVENIENCE STORE,2
3,STREETS / HWY / ROAD / ALLEY,3
7,RESIDENCE / HOME,4


In [539]:
# rearange dataframe
location_df = location_df[['location_code','location_type']]
location_df.head()

Unnamed: 0,location_code,location_type
0,0,RESTAURANTS
1,1,PARKING LOTS / GARAGE
2,2,CONVENIENCE STORE
3,3,STREETS / HWY / ROAD / ALLEY
7,4,RESIDENCE / HOME


-----

In [533]:
# creating mapping of location type to use as encoder for main incident dataframe
map_location_df = location_df.copy()
map_location_df.set_index('location_type', inplace=True)

In [534]:
location_mapper = map_location_df.to_dict()['location_code'].copy()
print(location_mapper)

{'RESTAURANTS': 0, 'PARKING LOTS / GARAGE': 1, 'CONVENIENCE STORE': 2, 'STREETS / HWY / ROAD / ALLEY': 3, 'RESIDENCE / HOME': 4, 'RENTAL STORAGE FACILITY': 5, 'DEPARTMENT / DISCOUNT STORE': 6, 'BANKS / SAVINGS & LOAN': 7, 'TRANSPORTATION (AIR / BUS / TRAIN - TERMINALS)': 8, 'HOTEL / MOTEL / ETC.': 9, 'DRUG STORE / DR. OFFICE / HOSPITAL': 10, 'GAS / SERVICE STATIONS': 11, 'SHELTER-MISSION / HOMELESS': 12, 'COMMERCIAL / OFFICE BUILDING': 13, 'OTHER / UNKNOWN': 14, 'SPECIALTY  STORE (TV  FUR ETC.)': 15, 'ARENA / STADIUM / FAIRGROUNDS / COLISEUM': 16, 'LIQUOR STORE': 17, 'INDUSTRIAL SITE': 18, 'GROCERY / SUPERMARKET': 19, 'COMMUNITY CENTER': 20, 'BAR / NIGHT CLUB': 21, 'CHURCH / SYNAGOGUE / TEMPLE / MOSQUE': 22, 'SHOPPING MALL': 23, 'GOVERNMENT / PUBLIC BUILDING': 24, 'CYBERSPACE': 25, 'AMUSEMENT PARK': 26, 'PARK / PLAYGROUND': 27, 'CAMP / CAMPGROUND': 28, 'AUTO DEALERSHIP NEW / USED': 29, 'SCHOOL - ELEMENTARY / SECONDARY': 30, 'CONSTRUCTION SITE': 31, 'SCHOOL - COLLEGE / UNIVERSITY': 32, 

In [551]:
location_mapper[stripped_crime_data['location_type'][0]]

0

In [548]:
# test mapper 
stripped_crime_data[stripped_crime_data.columns[0:7]].head(2)

Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,rep_date_time,location_type
0,20215006549,BURGLARY OF VEHICLE,601,N,2021-02-20T17:00:00.000,2021-02-20T23:35:00.000,RESTAURANTS
1,20215006547,BURGLARY NON RESIDENCE,502,N,2021-02-20T14:47:00.000,2021-02-20T18:41:00.000,PARKING LOTS / GARAGE


In [None]:
# replace and encode location column 
stripped_crime_data['location_type'] = stripped_crime_data['location_type'].apply(lambda x: location_mapper[x])