In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go 

### Reading in Tables

In [19]:
# need admin, other, and Location Type
admin_df = pd.read_csv('../Data/2020 Domestic Violence - Administrative Data.csv', encoding='ISO-8859-1',skiprows=7)
other_df = pd.read_csv('../Data/2020 Domestic Violence - Other Data.csv', encoding='ISO-8859-1',skiprows=7)
# Location Type is a quarterly csv, compiling
location_df = pd.DataFrame()
for i in range(1,5):
    if location_df.empty:
        location_df = pd.read_csv('../Data/2020 Q1 Domestic Violence - Location Type.csv', encoding='ISO-8859-1',skiprows=6)
    else:
        L = pd.read_csv(f'../Data/2020 Q{i} Domestic Violence - Location Type.csv', encoding='ISO-8859-1',skiprows=6)
        location_df = pd.concat([location_df, L], axis=0)
        location_df.reset_index(drop=True, inplace=True)


In [11]:
admin_df.shape

(69299, 9)

### Functions

In [22]:
# there are missing entries from one df to another...because date entry *facepalm with monitor
# verifying dates are the same formate across df
def change_date_Y_y(x):
    try:
        dt = datetime.strptime(x, "%m/%d/%y")
    except ValueError:
        dt = datetime.strptime(x, "%m/%d/%Y")
    return dt.strftime("%m/%d/%y")

# specfically want to get rid of decimal places if they occur, and fix scientific notation 
# which is not recognized as the same from one table to another in long form. Long form is searchable in the og csvs, so
# keep long form
def change_incNum_long_noDec(x):
    # will need to fix this first
    # fix scientific notation to long form, ex: other ref '2.02007E+11' -> '20200700000'
    if '+' in x:
        x, n = x.split('E+')
        # mult by power of 10, then convert back to str w/o '.0' tail
        x = str(float(x)*10**int(n))[:-2]
    # drop decimals if present, ex: admin ref '202001000000.00' -> '202001000000'
    return x.split('.')[0]
    


### Cleaning Tables, Creating keys, etc.

In [24]:
# formatting incident date feature across tables
admin_df['Incident Date'] = admin_df['Incident Date'].apply(change_date_Y_y)
other_df['Incident Date'] = other_df['Incident Date'].apply(change_date_Y_y)
location_df['Incident Date'] = location_df['Incident Date'].apply(change_date_Y_y)

# formatting Incident Numbers feature across tables
admin_df['Incident Number'] = admin_df['Incident Number'].apply(change_incNum_long_noDec)
other_df['Incident Number'] = other_df['Incident Number'].apply(change_incNum_long_noDec)
location_df['Incident Number'] = location_df['Incident Number'].apply(change_incNum_long_noDec)

# creating key that will be identifiable across df
admin_df['Incident_Key'] = admin_df['ORI'] + '<>' + admin_df['Incident Number'] + '<>' + admin_df['Incident Date']
location_df['Incident_Key'] = location_df['ORI'] + '<>' + location_df['Incident Number'] + '<>' + location_df['Incident Date']
other_df['Incident_Key'] = other_df['ORI'] + '<>' + other_df['Incident Number'] + '<>' + other_df['Incident Date']
admin_df.drop_duplicates('Incident_Key', inplace=True)
# Issue: format inconsistencies with ORIs keeps keys from working,
# ex: 'TN0150100<>2.02001E+11<>01/02/20' vs 'TN0150100<>202001000000.00<>01/02/20'

# location_df.drop_duplicates('Incident_Key', inplace=True)
print('admin: ',len(admin_df), '| other: ',len(other_df), '| location: ',len(location_df))

(69299, 9)
(69299, 9)
(59976, 10)
admin:  59976 | other:  69299 | location:  118124


In [42]:
# isn't the most efficient. Takes 5 min 39 sec to run. b

mismatching = {
    'other':[],
    'location':[],
    'both':[],
}
i, j = 0, len(admin_df)
for key in admin_df.Incident_Key.unique():
    i += 1
    if i%1000 == 0:
        print(f'{i} of {j} records checked')
    O = other_df.loc[other_df.Incident_Key == key]
    L = location_df.loc[location_df.Incident_Key == key]
    check = O.empty and L.empty
    if not check:
        continue
    elif check:
        mismatching['both'].append(key)
    # elif O.empty: # turns out it's literally all or nothing for the mismatches
    #     mismatching['other'].append(key)
    # elif L.empty:
    #     mismatching['location'].append(key)
    
    

1000 of 59976 records checked
2000 of 59976 records checked
3000 of 59976 records checked
4000 of 59976 records checked
5000 of 59976 records checked
6000 of 59976 records checked
7000 of 59976 records checked
8000 of 59976 records checked
9000 of 59976 records checked
10000 of 59976 records checked
11000 of 59976 records checked
12000 of 59976 records checked
13000 of 59976 records checked
14000 of 59976 records checked
15000 of 59976 records checked
16000 of 59976 records checked
17000 of 59976 records checked
18000 of 59976 records checked
19000 of 59976 records checked
20000 of 59976 records checked
21000 of 59976 records checked
22000 of 59976 records checked
23000 of 59976 records checked
24000 of 59976 records checked
25000 of 59976 records checked
26000 of 59976 records checked
27000 of 59976 records checked
28000 of 59976 records checked
29000 of 59976 records checked
30000 of 59976 records checked
31000 of 59976 records checked
32000 of 59976 records checked
33000 of 59976 re

In [68]:
to_replace = {} # bad admin_df.Incident_Key : good Incident_Key

for badkey in mismatching['both']:
    # identify the correct key in location
    ori, num, dt = badkey.split('<>')
    L_key = location_df.loc[(location_df['Incident Date'] == dt) & (location_df.ORI == ori)].Incident_Key.values[0]
    O_key = other_df.loc[(other_df['Incident Date'] == dt) & (other_df.ORI == ori)].Incident_Key.values[0]
    
    # verify they match
    if L_key != O_key:
        print(badkey, L_key, O_key)
    else:
        to_replace[badkey] = L_key
    
# after analyzing the anomoly there are 2 instances in each of the tables that use the L key and the O key, but L puts reverse order from O hence the mismatch
# keeping with consistent error type 
to_replace['TN0410000<>201211000000<>12/11/20'] = 'TN0410000<>201210999999<>12/11/20'

TN0410000<>201211000000<>12/11/20 TN0410000<>201211-1055<>12/11/20 TN0410000<>201210999999<>12/11/20


In [70]:
# now all Incident_Keys match across tables! 
admin_df.replace(to_replace=to_replace, inplace=True)

In [71]:
# backing up these tables
admin_df.to_csv('../Data/Mart2/admin.csv')
other_df.to_csv('../Data/Mart2/other.csv')
location_df.to_csv('../Data/Mart2/location_compiled.csv')

In [None]:
# Features: 
# -City
# -County
# -Agency Name
# -Pri Location
# -Pop of City / County 
# -Rates per 1k, 10k, 100k residents
# -Weigth (essentially how many offenses were reported in this incident)