In [None]:
#Import Library
#Standard library
import sys
from urllib.parse import quote

#SQLAlchemy relatives
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

#general
import pandas as pd

#Config relatives (the project root must be on sys.path before config is imported)
sys.path.insert(1,('../..'))
sys.path
import config

# Extract

In [None]:
#Import Dataset
# Read the raw police use-of-force CSV into `data`; the transform cells derive their tables from this frame.
data=pd.read_csv('../data/external/police_force.csv')

# Transform

In [None]:
#show data columns
# Bare expression: the notebook renders the column index of `data` as the cell output.
data.columns

In [None]:
#add year, month, day, hour into dataset
# ResponseDate is split on the first space into a date part and a time part,
# the date part on '/' (year, month, day) and the time part on ':' (hour).
# The vectorised .str accessor splits each value once and propagates NaN for
# missing or malformed timestamps instead of raising inside a per-row lambda.
# All derived columns stay strings, so later comparisons such as
# data['year'] == '1970' keep working.
response_parts = data['ResponseDate'].str.split(' ')
data['date'] = response_parts.str[0]
data['time'] = response_parts.str[1]

date_parts = data['date'].str.split('/')
data['year'] = date_parts.str[0]
data['month'] = date_parts.str[1]
data['day'] = date_parts.str[2]
data['hour'] = data['time'].str.split(':').str[0]

In [None]:
# Drop the columns flagged as duplicates; none of them is referenced by the
# transform cells visible in this notebook.
redundant_columns = ['X', 'Y', 'CenterX', 'CenterY', 'DateAdded']
data = data.drop(columns=redundant_columns)

In [None]:
# Remove every record whose parsed year is the string '1970' (too old).
rows_from_1970 = data.index[data['year'] == '1970']
data = data.drop(index=rows_from_1970)

### subject_tb

In [None]:
# Build the subject table: select the subject-related source columns and give
# them their database column names (OBJECTID becomes subject_id).
subject_column_names = {
    'OBJECTID': 'subject_id',
    'Race': 'race',
    'Sex': 'sex',
    'EventAge': 'age',
    'SubjectInjury': 'has_injury',
    'SubjectRole': 'role',
    'SubjectRoleNumber': 'role_number',
    'TypeOfResistance': 'resistance',
}
subject_tb = data[list(subject_column_names)].rename(columns=subject_column_names)

# Fill missing values by reassignment. `frame[col].fillna(..., inplace=True)`
# acts on a temporary Series: it warns in pandas 2.x and does nothing under
# Copy-on-Write, which would leave NaN in place and break the int casts below.
# has_injury is intentionally left untouched, as in the original cell.
subject_tb = subject_tb.fillna({
    'age': 0,
    'role': 'no data',
    'role_number': -1,
    'resistance': 'no data',
    'sex': 'no data',
    'race': 'no data',
})
subject_tb['sex'] = subject_tb['sex'].replace('not recorded', 'no data')

# Role labels: trim whitespace, abbreviate 'PERSON IN CRISIS' to 'PIC' and turn
# the literal text 'null' into 'no data' (plain substring replacement).
subject_tb['role'] = (
    subject_tb['role']
    .astype(str)
    .str.strip()
    .str.replace('PERSON IN CRISIS', 'PIC', regex=False)
    .str.replace('null', 'no data', regex=False)
)

# Safe to cast once the missing values carry the sentinels 0 / -1.
subject_tb = subject_tb.astype({'age': 'int', 'role_number': 'int'})

In [None]:
# Fold the remaining "unknown"-style labels into the shared 'no data' marker.
# resistance is lower-cased and trimmed first; the trailing replace matches
# whole cell values, not substrings.
subject_tb = subject_tb.assign(
    race=subject_tb['race'].replace(['Unknown', 'not recorded'], 'no data'),
    sex=subject_tb['sex'].replace('Unknown', 'no data'),
    resistance=subject_tb['resistance']
    .map(lambda text: text.lower().strip())
    .replace('null', 'no data'),
)
subject_tb.head()

In [None]:
# Write the subject table to CSV; index=False keeps the DataFrame index out of the file.
subject_tb.to_csv('../data/subject_tb.csv',index=False)

### city_tb

In [None]:
# Single-row city lookup table: the dataset covers one city, stored with id 1.
city_tb = pd.DataFrame(
    data=[[1, 'Minneapolis']],
    columns=['city_id', 'city_name'],
    index=[0],
)
city_tb.head()

In [None]:
# Write the city table to CSV; index=False keeps the DataFrame index out of the file.
city_tb.to_csv('../data/city_tb.csv',index=False)

### city_summary_tb

In [None]:
# One row per distinct (year, TotalCityCallsForYear) pair, numbered from 1 in
# order of first appearance and tagged with the single city id.
city_summary_tb = (
    data[['year', 'TotalCityCallsForYear']]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'TotalCityCallsForYear': 'total_calls'})
)
city_summary_tb.insert(0, 'city_summary_id', range(1, len(city_summary_tb) + 1))
city_summary_tb.insert(1, 'city_id', 1)
city_summary_tb.head()

In [None]:
# Write the city summary table to CSV; index=False keeps the DataFrame index out of the file.
city_summary_tb.to_csv('../data/city_summary_tb.csv',index=False)

### precinct_tb

In [None]:
# Map each precinct name to a 1-based id in order of first appearance.
# Missing precincts are labelled 'no data' *before* taking the unique values,
# so the mapping no longer depends on NaN sitting at a hard-coded position
# (formerly precinct_list[-4]) in the unique() output.
precinct_list = list(data['Precinct'].fillna('no data').unique())
precinct_dict = {
    precinct_name: precinct_id
    for precinct_id, precinct_name in enumerate(precinct_list, start=1)
}

In [None]:
# Precinct lookup table: one row per distinct precinct name (missing -> "no data")
# paired with the id recorded for it in precinct_dict.
unique_precincts = data['Precinct'].drop_duplicates().fillna("no data").reset_index(drop=True)
precinct_tb = pd.DataFrame({
    'precinct_id': unique_precincts.map(lambda name: precinct_dict[name]),
    'precinct_name': unique_precincts,
})
precinct_tb.head()

In [None]:
# Write the precinct table to CSV; index=False keeps the DataFrame index out of the file.
precinct_tb.to_csv('../data/precinct_tb.csv',index=False)

### precinct_summary_tb

In [None]:
# Per-record precinct frame: every source row keeps its precinct name
# (missing -> "no data") next to the matching id from precinct_dict.
# The index is renumbered 0..n-1.
record_precincts = data['Precinct'].fillna("no data").reset_index(drop=True)
precinct_raw = pd.DataFrame({
    'precinct_id': record_precincts.map(lambda name: precinct_dict[name]),
    'precinct_name': record_precincts,
})

In [None]:
# Attach each record's year and a unit counter to its precinct.
# precinct_raw carries a fresh 0..n-1 index, whereas `data` still has its
# original row labels (with gaps where the 1970 rows were dropped). A
# label-based join would therefore pair precincts with the wrong years and
# leave NaN years behind, so the year values are attached by position.
precinct_summary_tb_b4 = precinct_raw.assign(
    year=data['year'].to_numpy(),
    count=1,
)

In [None]:
# Number of records per (precinct, year), ordered by the sorted group keys.
precinct_year_groups = precinct_summary_tb_b4.groupby(['precinct_id', 'precinct_name', 'year'])
total = precinct_year_groups['count'].sum()
total_list = total.tolist()

In [None]:
# Yearly call totals per precinct, one row per (precinct_id, year), sorted by
# those keys and numbered from 1.
# The totals are aggregated here instead of being pasted in by position from
# the global `total_list`: that name is reused by the neighborhood section, so
# running cells out of order could attach the wrong counts or fail on a
# length mismatch.
precinct_summary_tb = (
    precinct_summary_tb_b4
    .dropna()
    .groupby(['precinct_id', 'year'], as_index=False)['count']
    .sum()
    .rename(columns={'count': 'total_calls'})
)
precinct_summary_tb.insert(0, 'precinct_summary_id', range(1, len(precinct_summary_tb) + 1))
precinct_summary_tb.head()

In [None]:
# Write the precinct summary table to CSV; index=False keeps the DataFrame index out of the file.
precinct_summary_tb.to_csv('../data/precinct_summary_tb.csv',index=False)

### neighborhood_tb

In [None]:
# Distinct neighborhood names (missing -> 'no data') in order of first
# appearance, each mapped to a 1-based id.
neighborhood_list = data['Neighborhood'].fillna('no data').drop_duplicates().tolist()
neighborhood_dict = {
    neighborhood_name: position
    for position, neighborhood_name in enumerate(neighborhood_list, start=1)
}

In [None]:
# Neighborhood lookup table: one (id, name) row per entry of neighborhood_dict.
neighborhood_tb = pd.DataFrame(
    [(neighborhood_id, name) for name, neighborhood_id in neighborhood_dict.items()],
    columns=['neighborhood_id', 'neighborhood_name'],
)

In [None]:
# Preview the first rows of the neighborhood lookup table.
neighborhood_tb.head()

In [None]:
# Write the neighborhood table to CSV; index=False keeps the DataFrame index out of the file.
neighborhood_tb.to_csv('../data/neighborhood_tb.csv',index=False)

### neighborhood_summary_tb

In [None]:
# Per-record frame with the year, a unit counter and the neighborhood id
# (missing neighborhoods are looked up under 'no data').
# The original cell also called fillna('no data') on its own line and threw
# the result away; that dead statement is removed.
neighborhood_summary_raw = (
    data[['Neighborhood', 'year']]
    .assign(
        count=1,
        neighborhood_id=lambda frame: frame['Neighborhood']
        .fillna('no data')
        .map(lambda name: neighborhood_dict[name]),
    )
    .drop(columns='Neighborhood')
)

In [None]:
# Number of records per (neighborhood, year), ordered by the sorted group keys.
neighborhood_year_groups = neighborhood_summary_raw.groupby(['neighborhood_id', 'year'])
total = neighborhood_year_groups['count'].sum()
total_list = total.tolist()

In [None]:
# Yearly call totals per neighborhood, one row per (neighborhood_id, year),
# sorted by those keys and numbered from 1.
# The totals are aggregated here instead of being pasted in by position from
# the global `total_list`: that name is shared with the precinct section, so
# running cells out of order could attach the wrong counts or fail on a
# length mismatch.
neighborhood_summary_tb = (
    neighborhood_summary_raw
    .groupby(['neighborhood_id', 'year'], as_index=False)['count']
    .sum()
    .rename(columns={'count': 'total_calls'})
)
neighborhood_summary_tb.insert(
    0, 'neighborhood_summary_id', range(1, len(neighborhood_summary_tb) + 1)
)
neighborhood_summary_tb.head()

In [None]:
# Write the neighborhood summary table to CSV; index=False keeps the DataFrame index out of the file.
neighborhood_summary_tb.to_csv('../data/neighborhood_summary_tb.csv',index=False)

### force_categories_tb

In [None]:
# Force category lookup table: distinct lower-cased force types (missing ->
# "no data"), numbered from 1 in order of first appearance.
# Names are lower-cased BEFORE de-duplication so values that differ only in
# letter case collapse into one category instead of producing duplicate rows.
force_category_names = (
    data['ForceType']
    .fillna("no data")
    .str.lower()
    .drop_duplicates()
    .reset_index(drop=True)
)
force_categories_tb = pd.DataFrame({
    'force_category_id': range(1, len(force_category_names) + 1),
    'category': force_category_names,
})
force_categories_tb.head()

In [None]:
# Write the force categories table to CSV; index=False keeps the DataFrame index out of the file.
force_categories_tb.to_csv('../data/force_categories_tb.csv',index=False)

### police_force_tb

In [None]:
# Lookup from category name (second column of force_categories_tb) to its
# force_category_id (first column).
force_dict = dict(zip(force_categories_tb.iloc[:, 1], force_categories_tb.iloc[:, 0]))

In [None]:
# Display the category-name -> force_category_id mapping.
force_dict

In [None]:
# Police force table: one row per source record, numbered from 1, carrying the
# force identifiers, the looked-up category id and the subject's OBJECTID.
# Every missing value in the selected columns becomes "no data".
police_force_tb = (
    data[['ForceType', 'OBJECTID', 'PoliceUseOfForceID', 'ForceReportNumber', 'ForceTypeAction']]
    .fillna("no data")
    .assign(
        force_category_id=lambda frame: frame['ForceType'].map(
            lambda force_type: force_dict[force_type.lower()]
        )
    )
    .reset_index(drop=True)
    .rename(columns={
        'PoliceUseOfForceID': 'force_number',
        'ForceTypeAction': 'force_action',
        'ForceReportNumber': "force_report_number",
        'OBJECTID': 'subject_id',
    })
)
police_force_tb.insert(0, 'police_force_id', range(1, len(police_force_tb) + 1))
police_force_tb = police_force_tb[[
    'police_force_id', 'force_number', 'force_category_id',
    'force_action', 'force_report_number', 'subject_id',
]]
# A force action recorded as the text '0' is treated as missing.
police_force_tb['force_action'] = police_force_tb['force_action'].replace('0', 'no data')

In [None]:
# Normalise force_action to lower case.
# The original cell followed this with .replace('no data', 'no data'), which
# maps a value onto itself and changes nothing, so it is removed.
# NOTE(review): that replace may have been meant to target a different
# placeholder value; confirm against the raw data.
police_force_tb['force_action'] = police_force_tb['force_action'].apply(lambda action: action.lower())
police_force_tb.head()

In [None]:
# Write the police force table to CSV; index=False keeps the DataFrame index out of the file.
police_force_tb.to_csv('../data/police_force_tb.csv',index=False)

### case_tb

In [None]:
# Case table: one row per source record with its call details, location,
# date parts and the ids of the related city, precinct, neighborhood and
# police-force row.
case_tb = data[['CaseNumber', 'Is911Call', 'Problem', 'PrimaryOffense', 'ResponseDate',
                'CenterLatitude', 'CenterLongitude', 'Precinct', 'Neighborhood']].rename(columns={
    'CaseNumber': 'case_number', 'Is911Call': 'is_911_call',
    'Problem': 'problem', 'PrimaryOffense': 'primary_offense', 'CenterLatitude': 'latitude',
    'CenterLongitude': 'longitude', 'Precinct': 'precinct_id', 'Neighborhood': 'neighborhood_id'})

# Split ResponseDate once: the date part before the first space, then the time part.
response_parts = case_tb['ResponseDate'].str.split(' ')
date_text = response_parts.str[0]
date_parts = date_text.str.split('/')
case_tb['year'] = date_parts.str[0]
case_tb['month'] = date_parts.str[1]
case_tb['day'] = date_parts.str[2]
# `date` is stored with the '/' separators removed.
case_tb['date'] = date_text.str.replace('/', '', regex=False)

# Hour as an integer; a missing or unparsable hour becomes -1. The original
# cell filled with the string "-1" after an int() conversion, which could never
# see a missing value and would have mixed strings into an integer column.
hour_text = response_parts.str[1].str.split(':').str[0]
case_tb['hour'] = pd.to_numeric(hour_text, errors='coerce').fillna(-1).astype(int)

# Missing values are filled by reassignment: `frame[col].fillna(..., inplace=True)`
# acts on a temporary Series and is a no-op under pandas Copy-on-Write, which
# would leave NaN in place and make the id lookups below raise KeyError.
case_tb['neighborhood_id'] = (
    case_tb['neighborhood_id'].fillna('no data').apply(lambda name: neighborhood_dict[name])
)
case_tb['precinct_id'] = (
    case_tb['precinct_id'].fillna('no data').apply(lambda name: precinct_dict[name])
)
case_tb['problem'] = case_tb['problem'].fillna('no data')
case_tb['primary_offense'] = case_tb['primary_offense'].fillna("no data")
case_tb['city_id'] = 1

# case_id and police_force_id are both the 1-based row number, so a case row
# lines up with the police_force_tb row built from the same source record.
case_tb = case_tb.reset_index(drop=True)
row_numbers = range(1, len(case_tb) + 1)
case_tb['case_id'] = row_numbers
case_tb['police_force_id'] = row_numbers

case_tb = case_tb[['case_id', 'case_number', 'is_911_call', 'problem', 'primary_offense', 'date', 'latitude',
                   'longitude', 'city_id', 'precinct_id', 'neighborhood_id', 'police_force_id',
                   'year', 'month', 'day', 'hour']]

In [None]:
# Text clean-up: problem descriptions are lower-cased, primary offenses are
# stripped of surrounding whitespace.
case_tb = case_tb.assign(
    problem=case_tb['problem'].map(str.lower),
    primary_offense=case_tb['primary_offense'].map(str.strip),
)
case_tb.head()

In [None]:
# Write the case table to CSV; index=False keeps the DataFrame index out of the file.
case_tb.to_csv('../data/case_tb.csv',index=False)

# Load

In [None]:
# Database credentials are taken from the imported `config` module.
username, password = config.username, config.password

In [None]:
# Create SQL Engine
# The credentials are percent-encoded before being placed in the URL so that
# characters such as '@', ':' or '/' in the username or password cannot be
# mistaken for URL delimiters. Values without special characters are unchanged.
safe_username = quote(str(username), safe='')
safe_password = quote(str(password), safe='')
engine = create_engine(f"postgresql://{safe_username}:{safe_password}@localhost:5432/police_force")

# Reflect the schema that already exists in PostgreSQL
# NOTE(review): `reflect=True` is the legacy spelling; newer SQLAlchemy releases
# expect `Base.prepare(autoload_with=engine)`. Confirm the installed version
# before changing it.
Base = automap_base()
Base.prepare(engine, reflect = True)

# Show the existing table names
Base.classes.keys()

In [None]:
# Database table name -> CSV file written by the transform section.
# Insertion order is the order in which the load loop processes the tables.
csv_files = dict([
    ('city', 'city_tb.csv'),
    ('city_summary', 'city_summary_tb.csv'),
    ('precinct', 'precinct_tb.csv'),
    ('precinct_summary', 'precinct_summary_tb.csv'),
    ('neighborhood', 'neighborhood_tb.csv'),
    ('neighborhood_summary', 'neighborhood_summary_tb.csv'),
    ('force_categories', 'force_categories_tb.csv'),
    ('subject', 'subject_tb.csv'),
    ('police_force', 'police_force_tb.csv'),
    ('case', 'case_tb.csv'),
])

In [None]:
# Import dataset into database
# Each CSV listed in csv_files is appended to the table of the same name, in
# dictionary order. A failed insert is reported together with the underlying
# error and the loop moves on to the next table.
# The frame read from disk gets its own name so the raw `data` frame used by
# the transform cells is not overwritten when this cell runs.
path = '../data/'
for table, file in csv_files.items():
    full_path = path + file
    table_frame = pd.read_csv(full_path)
    try:
        print(f'{table} is loading.')
        table_frame.to_sql(name=table, con=engine, if_exists='append', index=False)
        print(f'\t {table} import successed.')
    except Exception as error:
        # Catch Exception rather than using a bare except so KeyboardInterrupt
        # still stops the load, and show why the import failed.
        print(f'!!!!Failed to import {table}: {error!r}')