## ETL Data pipeline

In [1]:
# import dependencies 
import requests 
import pandas as pd
import time 
from datetime import datetime
import numpy as np 
import config 

### Extraction 

In [18]:
# City of Austin open-data endpoint: at most 100 rows whose occ_date lies
# between 2018-01-01 and 2020-12-31.
url = (
    "https://data.austintexas.gov/resource/fdj4-gpfu.json"
    "?$limit=100"
    "&$where=occ_date between '2018-01-01T00:00:00.000' and '2020-12-31T00:00:00.000'"
)

In [24]:
# optional `parameters` argument carries additional API parameters/filters
def extract_data(api_endpoint, parameters=None, timeout=30):
    """Fetch JSON records from an API endpoint and load them into a DataFrame.

    Args:
        api_endpoint: URL the GET request is sent to.
        parameters: Optional query parameters/filters, forwarded to
            ``requests.get`` as ``params``.
        timeout: Seconds to wait on the server before giving up, forwarded
            to ``requests.get``. Pass ``None`` to wait indefinitely.

    Returns:
        A pandas DataFrame built from the records in the JSON response.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    data_request = requests.get(
        url=api_endpoint, params=parameters, timeout=timeout
    )
    # Fail loudly on an error status instead of turning an error payload
    # into a DataFrame.
    data_request.raise_for_status()

    return pd.DataFrame.from_records(data_request.json())

# TODO: print the number of rows retrieved

In [25]:
# Request the records at `url` and load them into a DataFrame.
crime_df = extract_data(url)

In [350]:
# Copy of the extracted frame, kept separate from crime_df.
test_df = crime_df.copy()

### Transform   

In [300]:
def clean_data(data_df):
    """Clean the raw records and return the cleaned DataFrame.

    Steps, in order: drop unused columns, drop rows that are missing any
    required field (printing how many rows were removed), fill missing
    clearance fields with placeholders, and rename columns.

    The input frame is left untouched; a new frame is returned.

    Args:
        data_df: Raw DataFrame containing every column referenced below.

    Returns:
        The cleaned DataFrame.

    Raises:
        KeyError: If an expected column is missing from ``data_df``.
    """
    # drop columns
    drop_columns = ['latitude', 'x_coordinate', 'y_coordinate',
                    'occ_date', 'occ_time', 'rep_date', 'rep_time',
                    'category_description', 'address', 'ucr_category']
    cleaned_data_df = data_df.drop(columns=drop_columns)

    # drop rows with missing data in any of these columns; this runs
    # before the rename because the checklist uses the original names
    column_checklist = ['incident_report_number', 'crime_type', 'ucr_code',
                        'family_violence', 'occ_date_time', 'rep_date_time',
                        'location_type', 'zip_code', 'longitude', 'location',
                        'district']
    rows_before = len(cleaned_data_df)
    cleaned_data_df = cleaned_data_df.dropna(subset=column_checklist)

    # output frame of amount being dropped
    row_counter = rows_before - len(cleaned_data_df)
    print(pd.DataFrame({'Total Rows Dropped': {0: row_counter}}))

    # fill in clearance data with placeholders
    cleaned_data_df = cleaned_data_df.assign(
        clearance_date=cleaned_data_df['clearance_date'].fillna(
            '0000-00-00T00:00:00.000'),
        clearance_status=cleaned_data_df['clearance_status'].fillna('N'),
    )

    # rename columns
    # NOTE(review): 'ocurred_date' is kept exactly as originally spelled in
    # case other code depends on that column name -- confirm before fixing.
    column_names = {'ucr_code': 'offense_code', 'occ_date_time': 'ocurred_date',
                    'rep_date_time': 'reported_date', 'crime_type': 'offense_type'}

    return cleaned_data_df.rename(columns=column_names)

In [None]:
def create_tables(cleaned_data_df):
    """Split the cleaned records into incident, location and offense tables.

    Args:
        cleaned_data_df: DataFrame with at least ``offense_code``,
            ``offense_type`` and ``location_type`` columns. It is not
            modified.

    Returns:
        A tuple ``(crime_incident_df, location_df, offense_df)``:

        * ``crime_incident_df`` -- the input rows with a ``location_code``
          column added and the two offense columns removed.
        * ``location_df`` -- one row per distinct ``location_type`` with
          columns ``location_code`` (0, 1, 2, ... in order of first
          appearance) and ``location_type``; index reset.
        * ``offense_df`` -- one row per distinct ``offense_code`` with its
          ``offense_type``; index reset.
    """
    # offense_type_table
    offense_df = (
        cleaned_data_df[['offense_code', 'offense_type']]
        .drop_duplicates(subset='offense_code')
        .reset_index(drop=True)
    )

    # create incident_location_table
    location_df = (
        cleaned_data_df[['location_type']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    location_df = location_df.assign(location_code=np.arange(len(location_df)))

    # create mapping for location_code column
    location_mapper = dict(
        zip(location_df['location_type'], location_df['location_code'])
    )

    # rearrange location_df
    location_df = location_df[['location_code', 'location_type']]

    # crime_incidents_table
    # NOTE(review): offense_code is dropped here as in the original, which
    # leaves no column linking an incident to offense_df -- confirm whether
    # 'location_type' was meant to be dropped instead.
    crime_incident_df = cleaned_data_df.drop(
        columns=['offense_type', 'offense_code']
    )
    # create encoded location_code column
    crime_incident_df['location_code'] = (
        crime_incident_df['location_type'].map(location_mapper)
    )

    return crime_incident_df, location_df, offense_df
