## ETL Data pipeline

In [1]:
# import dependencies 
import requests 
import pandas as pd
import time 
from datetime import datetime
import numpy as np 
import config 

### Extraction 

In [18]:
# Endpoint for data.austintexas.gov resource fdj4-gpfu, limited to 100 rows whose
# occ_date falls between 2018-01-01 and 2020-12-31. The pieces below are joined
# by implicit string concatenation into a single URL.
url = (
    "https://data.austintexas.gov/resource/fdj4-gpfu.json"
    "?$limit=100"
    "&$where=occ_date between '2018-01-01T00:00:00.000' and '2020-12-31T00:00:00.000'"
)

In [24]:
def extract_data(api_endpoint, parameters=None, timeout=30):
    """Fetch JSON records from an API endpoint and return them as a DataFrame.

    Parameters
    ----------
    api_endpoint : str
        URL the GET request is sent to.
    parameters : dict, optional
        Additional query-string parameters/filters forwarded to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout a stalled connection would hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        Frame built from the records in the JSON response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    data_request = requests.get(url=api_endpoint, params=parameters, timeout=timeout)

    # Fail loudly on an error status instead of building a frame from an error payload.
    data_request.raise_for_status()

    data_df = pd.DataFrame.from_records(data_request.json())

    return data_df

# TODO: print the number of rows retrieved

In [25]:
# Pull the records behind `url` into a DataFrame (no extra query parameters passed).
crime_df = extract_data(url)

In [27]:
# Preview the first two rows of the extracted data.
crime_df.head(2)

Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,occ_date,occ_time,rep_date_time,rep_date,rep_time,...,x_coordinate,y_coordinate,latitude,longitude,location,census_tract,clearance_status,clearance_date,ucr_category,category_description
0,201811444,FAMILY DISTURBANCE,3400,N,2018-01-01T16:58:00.000,2018-01-01T00:00:00.000,1658,2018-01-01T16:58:00.000,2018-01-01T00:00:00.000,1658,...,3093814,3093814,30.19737037,-97.80960579,"{'latitude': '30.19737037', 'longitude': '-97....",,,,,
1,20195024716,CRUELTY TO ANIMALS,2717,N,2018-01-01T14:00:00.000,2018-01-01T00:00:00.000,1400,2019-06-17T10:20:00.000,2019-06-17T00:00:00.000,1020,...,3103707,3103707,30.14716858,-97.77960983,"{'latitude': '30.14716858', 'longitude': '-97....",24.28,N,2019-06-18T00:00:00.000,,


In [41]:
# List every column returned by the API so the ones to keep or drop can be chosen.
crime_df.columns

Index(['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
       'occ_date_time', 'occ_date', 'occ_time', 'rep_date_time', 'rep_date',
       'rep_time', 'location_type', 'address', 'zip_code', 'council_district',
       'sector', 'district', 'pra', 'x_coordinate', 'y_coordinate', 'latitude',
       'longitude', 'location', 'census_tract', 'clearance_status',
       'clearance_date', 'ucr_category', 'category_description'],
      dtype='object')

In [285]:
# Columns that are not carried forward into the working frame.
drop_columns = [
    'latitude', 'longitude', 'location',
    'occ_date', 'occ_time', 'rep_date', 'rep_time',
    'category_description', 'address',
    'ucr_category', 'pra', 'sector', 'census_tract', 'council_district',
]

# drop() returns a new frame, so crime_df itself is left untouched.
test_df = crime_df.drop(columns=drop_columns)

# Null count per remaining column.
test_df.isna().sum()

incident_report_number     0
crime_type                 0
ucr_code                   0
family_violence            0
occ_date_time              0
rep_date_time              0
location_type              1
zip_code                   1
district                   1
x_coordinate               0
y_coordinate               0
clearance_status          11
clearance_date            11
dtype: int64

In [164]:
def drop_f(data_df):
    """Drop, in place, every row that has a null in any required column.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing at least the columns named in ``column_checklist``.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object, mutated so that no required column
        contains a null. Columns outside the checklist may still hold nulls.
    """
    column_checklist = ['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
                        'occ_date_time', 'rep_date_time', 'location_type', 'zip_code',
                        'x_coordinate', 'y_coordinate']
    # The original loop iterated over an undefined name (`column_list`);
    # dropna(subset=...) performs the same per-column row removal in one pass.
    data_df.dropna(subset=column_checklist, inplace=True)
    return data_df

In [284]:
def data_drop(data_df):
    """Drop, in place, rows missing crucial data and report how many were removed.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame containing at least the columns named in ``column_checklist``.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object with every row that had a null in a
        required column removed. A one-row summary frame labelled
        'Total Rows Dropped' is printed as a side effect.
    """
    # Rows are dropped when any of these crucial columns is null.
    column_checklist = ['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
                        'occ_date_time', 'rep_date_time', 'location_type', 'zip_code',
                        'x_coordinate', 'y_coordinate']
    rows_before = len(data_df)
    # The original loop iterated over an undefined name (`column_list`);
    # dropna(subset=...) removes the same rows in a single pass.
    data_df.dropna(subset=column_checklist, inplace=True)
    # A row with several nulls is counted once, matching the sequential drop logic.
    row_counter = rows_before - len(data_df)
    print(pd.DataFrame({'Total Rows Dropped':{0:row_counter}}))
    return data_df

In [286]:
# data_drop mutates test_df in place and also returns it, so this cell both
# cleans test_df and displays the resulting frame.
data_drop(test_df)

   Total Rows Dropped
0                   2


Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,rep_date_time,location_type,zip_code,district,x_coordinate,y_coordinate,clearance_status,clearance_date
0,201811444,FAMILY DISTURBANCE,3400,N,2018-01-01T16:58:00.000,2018-01-01T16:58:00.000,STREETS / HWY / ROAD / ALLEY,78745,1,3093814,3093814,,
1,20195024716,CRUELTY TO ANIMALS,2717,N,2018-01-01T14:00:00.000,2019-06-17T10:20:00.000,RESIDENCE / HOME,78747,7,3103707,3103707,N,2019-06-18T00:00:00.000
2,201811667,AGG ROBBERY/DEADLY WEAPON,300,N,2018-01-01T19:47:00.000,2018-01-01T19:47:00.000,DEPARTMENT / DISCOUNT STORE,78753,1,3128703,3128703,C,2018-01-03T00:00:00.000
3,201810550,POSSESSION OF MARIJUANA,1803,N,2018-01-01T02:51:00.000,2018-01-01T06:49:00.000,STREETS / HWY / ROAD / ALLEY,78744,6,3118581,3118581,,
4,20205045035,IDENTITY THEFT,4022,N,2018-01-01T12:00:00.000,2020-11-06T10:37:00.000,RESIDENCE / HOME,78758,4,3127109,3127109,N,2020-11-06T00:00:00.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,201811617,PUBLIC INTOXICATION,2300,N,2018-01-01T19:45:00.000,2018-01-01T20:59:00.000,GAS / SERVICE STATIONS,78758,4,3128613,3128613,C,2018-01-01T00:00:00.000
96,201810723,ASSAULT BY CONTACT FAM/DATING,902,Y,2018-01-01T03:56:00.000,2018-01-01T04:45:00.000,STREETS / HWY / ROAD / ALLEY,78729,4,3099634,3099634,N,2018-01-04T00:00:00.000
97,201820726,THEFT OF TRAILER,613,N,2018-01-01T12:00:00.000,2018-01-02T17:00:00.000,STREETS / HWY / ROAD / ALLEY,78751,5,3121407,3121407,N,2018-01-05T00:00:00.000
98,201811794,ASSAULT W/INJURY-FAM/DATE VIOL,900,N,2018-01-01T21:25:00.000,2018-01-01T21:49:00.000,RESIDENCE / HOME,78735,6,3068872,3068872,N,2018-01-09T00:00:00.000


In [287]:
# Re-check the per-column null counts after data_drop(test_df).
test_df.isnull().sum()

incident_report_number     0
crime_type                 0
ucr_code                   0
family_violence            0
occ_date_time              0
rep_date_time              0
location_type              0
zip_code                   0
district                   0
x_coordinate               0
y_coordinate               0
clearance_status          11
clearance_date            11
dtype: int64

In [170]:
# Scratch check: compare how many columns are treated as required with how
# many columns test_df currently has.
column_checklist = ['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
                       'occ_date_time', 'rep_date_time', 'location_type', 'zip_code',
                        'x_coordinate', 'y_coordinate']
len(column_checklist)
len(test_df.columns)

10

13

### Transform   

In [183]:
# Scratch: number of missing clearance_date values plus 3.
# NOTE(review): the purpose of the "+ 3" offset is not evident from this notebook — confirm or remove.
test_df.clearance_date.isnull().sum() + 3

14

In [190]:
# Scratch: inspect the nested {column: {index: value}} structure that
# DataFrame.to_dict() produces for the first three rows of two columns.
test_df[['incident_report_number','crime_type']][0:3].to_dict()

{'incident_report_number': {0: '201811444', 1: '20195024716', 2: '201811667'},
 'crime_type': {0: 'FAMILY DISTURBANCE',
  1: 'CRUELTY TO ANIMALS',
  2: 'AGG ROBBERY/DEADLY WEAPON'}}

In [202]:
# Scratch: preview of the one-row summary frame printed by data_drop.
# NOTE(review): in the visible cells `row_counter` is only assigned inside
# functions, so this cell presumably relied on leftover kernel state and would
# raise NameError on a fresh run — confirm or remove.
pd.DataFrame({'Total Rows Dropped':{0:row_counter}})

Unnamed: 0,total rows dropped
0,1


In [None]:
def clean_data(data_df):
    """Return a cleaned copy of the raw incident data.

    Steps, in order:
      1. Drop columns that are not needed downstream.
      2. Drop rows with a null in any crucial column and print how many
         rows were removed.
      3. Rename columns to the names used by the table-building step.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Raw frame containing every column listed in ``drop_columns`` and
        ``column_checklist`` below.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; ``data_df`` itself is not modified.

    Raises
    ------
    KeyError
        If a column named in ``drop_columns`` or ``column_checklist`` is
        absent from ``data_df``.
    """
    # Columns to drop.
    drop_columns = ['latitude', 'longitude', 'location',
                    'occ_date', 'occ_time', 'rep_date', 'rep_time',
                    'category_description', 'address',
                    'ucr_category']

    # Rows are dropped when any of these crucial columns is null.
    column_checklist = ['incident_report_number', 'crime_type', 'ucr_code', 'family_violence',
                        'occ_date_time', 'rep_date_time', 'location_type', 'zip_code',
                        'x_coordinate', 'y_coordinate']

    # New names for selected columns.
    # NOTE(review): 'ocurred_date' looks like a misspelling of 'occurred_date';
    # kept as-is because it defines the output schema — confirm before changing.
    column_names = {'ucr_code':'offense_code','occ_date_time':'ocurred_date',
                    'rep_date_time':'reported_date','crime_type':'offense_type'}

    # drop()/dropna()/rename() each return a new frame, so the caller's
    # data_df is left untouched and the function can be re-run safely.
    cleaned_data_df = data_df.drop(columns=drop_columns)

    rows_before = len(cleaned_data_df)
    cleaned_data_df = cleaned_data_df.dropna(subset=column_checklist)
    row_counter = rows_before - len(cleaned_data_df)

    # Report the number of rows dropped.
    print(pd.DataFrame({'Total Rows Dropped':{0:row_counter}}))

    # TODO: fill in clearance date

    cleaned_data_df = cleaned_data_df.rename(columns=column_names)

    return cleaned_data_df

In [None]:
def create_tables(cleaned_data_df):
    """Split the cleaned incident data into three related tables.

    Parameters
    ----------
    cleaned_data_df : pandas.DataFrame
        Frame containing at least the columns 'offense_code',
        'offense_type' and 'location_type'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(crime_incident_df, location_df, offense_df)`` where

        * ``crime_incident_df`` is a copy of the input with an added integer
          'location_code' column and without the offense columns,
        * ``location_df`` has one row per distinct 'location_type' with
          columns ['location_code', 'location_type'],
        * ``offense_df`` has one row per distinct 'offense_code' with
          columns ['offense_code', 'offense_type'].

        The input frame is not modified.
    """
    # offense_type_table: one row per offense code (first occurrence kept).
    offense_df = (
        cleaned_data_df[['offense_code', 'offense_type']]
        .drop_duplicates(subset='offense_code')
        .reset_index(drop=True)
    )

    # incident_location_table: double brackets keep this a DataFrame, so a
    # code column can be attached (the original worked on a Series).
    location_df = (
        cleaned_data_df[['location_type']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    location_df = location_df.assign(location_code=np.arange(len(location_df)))

    # Mapping location_type -> location_code used to encode the incidents.
    location_mapper = location_df.set_index('location_type')['location_code'].to_dict()

    # Rearrange location_df so the code comes first.
    location_df = location_df[['location_code', 'location_type']]

    # crime_incidents_table: add the encoded location_code column.
    crime_incident_df = cleaned_data_df.copy()
    crime_incident_df['location_code'] = crime_incident_df['location_type'].map(location_mapper)

    # Drop columns (not rows: the original call omitted axis=1).
    # NOTE(review): dropping 'offense_code' leaves the incidents table with no
    # key back to offense_df, while 'location_type' is retained next to
    # 'location_code' — confirm whether 'location_type' was meant here instead.
    drop_column_2 = ['offense_type', 'offense_code']
    crime_incident_df = crime_incident_df.drop(columns=drop_column_2)

    return crime_incident_df, location_df, offense_df

