## ETL Data pipeline

In [351]:
# import dependencies 
import time
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine

import config

### Extraction 

In [18]:
# Query against data.austintexas.gov: at most 100 rows ($limit) whose occ_date
# lies between 2018-01-01 and 2020-12-31 ($where).
url = "https://data.austintexas.gov/resource/fdj4-gpfu.json?$limit=100&$where=occ_date between '2018-01-01T00:00:00.000' and '2020-12-31T00:00:00.000'"

In [430]:
# `parameters` is optional: pass additional API query parameters/filters
def extract_data(api_endpoint, parameters=None, timeout=30):
    """Fetch JSON records from an API endpoint and load them into a DataFrame.

    Parameters
    ----------
    api_endpoint : str
        URL to request.
    parameters : dict, optional
        Extra query parameters/filters, forwarded to ``requests.get`` as ``params``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per record in the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Without an explicit timeout the request can hang on a stalled connection.
    data_request = requests.get(url=api_endpoint, params=parameters, timeout=timeout)
    # Fail loudly on an error response instead of loading the error payload
    # as if it were incident data.
    data_request.raise_for_status()

    data_df = pd.DataFrame.from_records(data_request.json())
    # Report how many rows were retrieved.
    print(pd.DataFrame({'Rows Retrieved': {0: len(data_df)}}))

    return data_df

# extract_data prints the number of rows retrieved

### Transform   

In [431]:
# Keep the extracted frame: the transform cells below operate on `data_df`.
data_df = extract_data(url)
data_df

   Rows Retrieved
0             100


Unnamed: 0,incident_report_number,crime_type,ucr_code,family_violence,occ_date_time,occ_date,occ_time,rep_date_time,rep_date,rep_time,...,x_coordinate,y_coordinate,latitude,longitude,location,census_tract,clearance_status,clearance_date,ucr_category,category_description
0,201811444,FAMILY DISTURBANCE,3400,N,2018-01-01T16:58:00.000,2018-01-01T00:00:00.000,1658,2018-01-01T16:58:00.000,2018-01-01T00:00:00.000,1658,...,3093814,3093814,30.19737037,-97.80960579,"{'latitude': '30.19737037', 'longitude': '-97....",,,,,
1,20195024716,CRUELTY TO ANIMALS,2717,N,2018-01-01T14:00:00.000,2018-01-01T00:00:00.000,1400,2019-06-17T10:20:00.000,2019-06-17T00:00:00.000,1020,...,3103707,3103707,30.14716858,-97.77960983,"{'latitude': '30.14716858', 'longitude': '-97....",24.28,N,2019-06-18T00:00:00.000,,
2,201811667,AGG ROBBERY/DEADLY WEAPON,300,N,2018-01-01T19:47:00.000,2018-01-01T00:00:00.000,1947,2018-01-01T19:47:00.000,2018-01-01T00:00:00.000,1947,...,3128703,3128703,30.36582905,-97.69453919,"{'latitude': '30.36582905', 'longitude': '-97....",,C,2018-01-03T00:00:00.000,120,Robbery
3,201810550,POSSESSION OF MARIJUANA,1803,N,2018-01-01T02:51:00.000,2018-01-01T00:00:00.000,251,2018-01-01T06:49:00.000,2018-01-01T00:00:00.000,649,...,3118581,3118581,30.19247824,-97.73132061,"{'latitude': '30.19247824', 'longitude': '-97....",,,,,
4,20205045035,IDENTITY THEFT,4022,N,2018-01-01T12:00:00.000,2018-01-01T00:00:00.000,1200,2020-11-06T10:37:00.000,2020-11-06T00:00:00.000,1037,...,3127109,3127109,30.38646396,-97.69903136,"{'latitude': '30.38646396', 'longitude': '-97....",,N,2020-11-06T00:00:00.000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,201811617,PUBLIC INTOXICATION,2300,N,2018-01-01T19:45:00.000,2018-01-01T00:00:00.000,1945,2018-01-01T20:59:00.000,2018-01-01T00:00:00.000,2059,...,3128613,3128613,30.38289967,-97.69435755,"{'latitude': '30.38289967', 'longitude': '-97....",,C,2018-01-01T00:00:00.000,,
96,201810723,ASSAULT BY CONTACT FAM/DATING,902,Y,2018-01-01T03:56:00.000,2018-01-01T00:00:00.000,356,2018-01-01T04:45:00.000,2018-01-01T00:00:00.000,445,...,3099634,3099634,30.44549079,-97.78463319,"{'latitude': '30.44549079', 'longitude': '-97....",,N,2018-01-04T00:00:00.000,,
97,201820726,THEFT OF TRAILER,613,N,2018-01-01T12:00:00.000,2018-01-01T00:00:00.000,1200,2018-01-02T17:00:00.000,2018-01-02T00:00:00.000,1700,...,3121407,3121407,30.30172702,-97.71941769,"{'latitude': '30.30172702', 'longitude': '-97....",,N,2018-01-05T00:00:00.000,23H,Theft
98,201811794,ASSAULT W/INJURY-FAM/DATE VIOL,900,N,2018-01-01T21:25:00.000,2018-01-01T00:00:00.000,2125,2018-01-01T21:49:00.000,2018-01-01T00:00:00.000,2149,...,3068872,3068872,30.24674667,-97.88732546,"{'latitude': '30.24674667', 'longitude': '-97....",,N,2018-01-09T00:00:00.000,,


In [428]:
# Scratch copy of data_df, independent of later in-place changes to data_df.
test = data_df.copy()

In [427]:
# NOTE(review): clean_data is defined in a later cell of this notebook; on a
# fresh kernel that cell has to run before this one. The call modifies
# data_df in place (columns dropped/renamed, rows with missing values removed).
clean_data(data_df)

In [424]:
# Display the scratch copy.
test

In [418]:
def clean_data(data_df):
    """Clean the raw incident frame in place and return it.

    Steps, in order:
      1. drop columns that are not loaded downstream;
      2. drop rows missing any required field and print how many were dropped;
      3. fill missing clearance fields with placeholder values;
      4. replace the ``T`` separator in the date-time strings with a space;
      5. rename columns to the names used by the later table-building step.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Raw frame as returned by the extraction step. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data_df`` object, returned for convenience.

    Raises
    ------
    KeyError
        If an expected column is missing, e.g. when the function is run a
        second time on an already cleaned frame.
    """
    # Columns that are not needed downstream.
    drop_columns = ['latitude', 'x_coordinate', 'y_coordinate',
                    'occ_date', 'occ_time', 'rep_date', 'rep_time',
                    'category_description', 'address',
                    'ucr_category', 'census_tract', 'sector', 'pra']
    data_df.drop(columns=drop_columns, inplace=True)

    # Drop rows that are missing any required field.
    column_checklist = ['incident_report_number', 'crime_type', 'ucr_code',
                        'family_violence', 'occ_date_time', 'rep_date_time',
                        'location_type', 'zip_code', 'longitude', 'location',
                        'district']
    rows_before = len(data_df)
    data_df.dropna(subset=column_checklist, inplace=True)
    print(pd.DataFrame({'Rows Dropped': {0: rows_before - len(data_df)}}))

    # Fill in clearance data with placeholders. The result is assigned back to
    # the column: `data_df[col].fillna(..., inplace=True)` acts on a temporary
    # object and does not update the frame under pandas copy-on-write.
    # NOTE(review): '0000-00-00 ...' is not a real calendar date; confirm the
    # target database column accepts it (e.g. is stored as text).
    data_df['clearance_date'] = data_df['clearance_date'].fillna('0000-00-00T00:00:00.000')
    data_df['clearance_status'] = data_df['clearance_status'].fillna('N')

    # Clean date-time columns: 'YYYY-MM-DDTHH:MM:SS.fff' -> 'YYYY-MM-DD HH:MM:SS.fff'.
    for column in ('occ_date_time', 'rep_date_time', 'clearance_date'):
        data_df[column] = data_df[column].str.replace('T', ' ', regex=False)

    # Rename columns.
    column_names = {'ucr_code': 'offense_code', 'occ_date_time': 'occurred_date',
                    'rep_date_time': 'reported_date', 'crime_type': 'offense_type'}
    data_df.rename(columns=column_names, inplace=True)

    return data_df
    


In [359]:
def create_tables(cleaned_data_df):
    """Split the cleaned incident frame into three table-shaped frames.

    Parameters
    ----------
    cleaned_data_df : pandas.DataFrame
        Cleaned frame containing at least ``offense_code``, ``offense_type``
        and ``location_type``. It is not modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(crime_incident_df, location_df, offense_df)`` where

        * ``crime_incident_df`` is the input plus an integer ``location_code``
          column, without ``offense_type`` and ``offense_code``;
        * ``location_df`` has columns ``location_code``, ``location_type``,
          one row per distinct location type, codes ``0..n-1`` in order of
          first appearance;
        * ``offense_df`` has columns ``offense_code``, ``offense_type``,
          one row per distinct offense code (first occurrence kept).
    """
    # Offense lookup table: one row per offense code.
    offense_df = (
        cleaned_data_df[['offense_code', 'offense_type']]
        .drop_duplicates(subset='offense_code')
        .reset_index(drop=True)
    )

    # Location lookup table: select with a list so the result is a DataFrame
    # (a plain Series cannot take the extra `location_code` column).
    location_df = (
        cleaned_data_df[['location_type']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    location_df = location_df.assign(location_code=np.arange(len(location_df)))

    # Mapping location_type -> location_code used to encode the incidents.
    location_mapper = dict(zip(location_df['location_type'],
                               location_df['location_code']))

    # Rearrange so the code comes first.
    location_df = location_df[['location_code', 'location_type']]

    # Incident table with the encoded location_code column.
    crime_incident_df = cleaned_data_df.copy()
    crime_incident_df['location_code'] = crime_incident_df['location_type'].map(location_mapper)

    # Drop the offense columns (by column, not by row label).
    # NOTE(review): with offense_code removed the incidents no longer carry a
    # key into offense_df, and location_type is kept next to location_code --
    # confirm this is the intended schema.
    drop_column_2 = ['offense_type', 'offense_code']
    crime_incident_df = crime_incident_df.drop(columns=drop_column_2)

    return crime_incident_df, location_df, offense_df


In [None]:
def load_data(crime_incident_df=None, location_df=None, offense_df=None):
    """Write the three pipeline tables to the local ``austin_crime`` Postgres database.

    Each frame is written with ``if_exists='replace'``, so an existing table
    of the same name is replaced.

    Parameters
    ----------
    crime_incident_df : pandas.DataFrame, optional
        Written to table ``crime_incidents``.
    location_df : pandas.DataFrame, optional
        Written to table ``incident_location``.
    offense_df : pandas.DataFrame, optional
        Written to table ``offense_type``.

    Any argument left as ``None`` is looked up among the notebook-level
    variables (``crime_incident_df``, ``locations_df`` and ``offense_df``
    respectively), so the argument-free call ``load_data()`` keeps working.

    Raises
    ------
    KeyError
        If an argument is omitted and the matching notebook variable does not exist.
    """
    # Resolve every frame before touching the database, so a missing variable
    # cannot leave the tables half-written.
    notebook_vars = globals()
    if crime_incident_df is None:
        crime_incident_df = notebook_vars['crime_incident_df']
    if location_df is None:
        location_df = notebook_vars['locations_df']
    if offense_df is None:
        offense_df = notebook_vars['offense_df']

    # SQLAlchemy 1.4+ only accepts the 'postgresql://' scheme; 'postgres://' is rejected.
    database = f"postgresql://postgres:{config.db_password}@localhost:5432/austin_crime"
    engine = create_engine(database)

    crime_incident_df.to_sql(name='crime_incidents', con=engine, if_exists='replace')
    location_df.to_sql(name='incident_location', con=engine, if_exists='replace')
    # The offense table is loaded from the offense frame (not the location frame).
    offense_df.to_sql(name='offense_type', con=engine, if_exists='replace')
    
    