In [73]:
import sys
sys.path.append("../../")

import itertools
import operator
import django
import pylab as pl
import re

from django.contrib.gis.measure import D

from django.db import connections
from firecares.firestation.models import FireDepartment, FireStation
from firecares.utils import dictfetchall
from pandas import DataFrame

%matplotlib inline

django.setup()

def hamming(str1,str2):
    """Return the number of positions at which two sequences differ.

    Only the overlapping prefix is compared: when the inputs have different
    lengths, the extra tail of the longer one is ignored (this matches the
    previous ``itertools.imap`` based behaviour).

    Implemented with ``zip`` instead of ``itertools.imap`` so the helper runs
    unchanged on both Python 2 and Python 3 (``imap`` exists only on Python 2).
    """
    return sum(1 for left, right in zip(str1, str2) if left != right)

# Report how many stations currently have no department assigned.
print "Number of unmatched fire stations:", FireStation.objects.filter(department__isnull=True).count()

# Raw DB-API cursor on the 'default' Django connection, for the SQL strings below.
cursor = connections['default'].cursor()

# Raw SQL: stations that have a geometry but no department. The station
# location is returned as WKT after transforming to SRID 900913.
queryUnmatchedStations = """
select a.name as firestation_name, 
b.usgsstructuredata_ptr_id as firestation_id, 
b.department_id as dep_id, 
ST_ASTEXT(ST_TRANSFORM(d.geom,900913)) as firestation_location
from firestation_usgsstructuredata a
inner join firestation_firestation b
  on a.id=b.usgsstructuredata_ptr_id
join firecares_core_address d 
  on b.station_address_id=d.id
where b.department_id is null and d.geom is not null
"""

# NOTE(review): despite the variable name and the label printed below, this
# queryset selects stations that ALREADY have a department
# (department__isnull=False). The matching loop later in this cell reads
# fireStation.department_id as the expected answer, so these rows appear to
# serve as a labelled validation set rather than as truly unmatched stations.
unmatched_stations_with_geom = FireStation.objects.filter(department__isnull=False, 
                                                station_address__geom__isnull=False,
                                                )


print 'Unmatched Stations with Geometry:', unmatched_stations_with_geom.count()
print

# Raw SQL: candidate departments for one station. Keeps departments whose
# headquarters is within 100 miles of the station (the `<= 100` filter; the
# 0.000621371 factor converts metres to miles) and whose name is within a
# levenshtein distance of 30 of the station name.
# Placeholders, in order: station WKT, station name, station WKT, station WKT,
# station name.
queryNearbyDepartments = """
select c.name as department_name, c.id as dep_id,
ST_DISTANCE(ST_TRANSFORM(e.geom,900913),ST_GEOMFROMTEXT(%s,900913)) * 0.000621371 as dis_miles,
levenshtein(c.name,%s) as dis_name,
ST_INTERSECTS(ST_TRANSFORM(c.geom,900913),ST_GEOMFROMTEXT(%s,900913)) as dep_intersects
from firestation_firedepartment c 
inner join firecares_core_address e 
  on c.headquarters_address_id=e.id
where e.geom is not null and
ST_DISTANCE(ST_TRANSFORM(e.geom,900913),ST_GEOMFROMTEXT(%s,900913)) * 0.000621371 <= 100 and
levenshtein(c.name,%s) <= 30; 
"""

# Raw SQL: assign a department to a station.
# Placeholders, in order: department id, station usgsstructuredata_ptr_id.
queryUpdateFireStation = """
update firestation_firestation 
set department_id = %s
where firestation_firestation.usgsstructuredata_ptr_id = %s
"""

# Raw SQL: department name plus the station count recorded in
# usfa_census_national, joined on state and fdid.
queryQualityControl = """
select fd.name as fd_name, usfa.number_of_stations as num_stations 
from firestation_firedepartment fd 
inner join usfa_census_national usfa 
 on fd.state=usfa."hq state" and fd.fdid=usfa.fdid;
"""

#cursor.execute(queryUnmatchedStations)
#desc= cursor.description
# don't load all of the stations into memory
#fireStations = dictfetchall(cursor)
#print "Number of Stations:", cursor.rowcount

#select a.name as firestation_name, 
#b.usgsstructuredata_ptr_id as firestation_id, 
#b.department_id as dep_id, 
#ST_ASTEXT(ST_TRANSFORM(d.geom,900913)) as firestation_location

totalMatchedDepartments = 0
totalFalseMatches = 0
matchedDict = dict()

for fireStation in unmatched_stations_with_geom:
    
    title = "# Searching for departmens for Station: {0} #".format(fireStation.name)
    
    print '#' * len(title)
    print title
    #print '#' * len(title)
    
    

    
    rep = { "Station": "", 
            " Engine": "", 
            " Truck": "",
            " Ladder": "",
            " Quint": "",
            " Squirt": "",
            " Ambulance": "",
            " Service": "",
            " District": "",
            " and": "",
            " Emergency": "",
            " Medical": "",
            " Services": "",
            " Headquarters" : "",
            " City" : "",
            "City " : "",
            " of" : ""} 
    rep = dict((re.escape(k), v) for k, v in rep.iteritems())
    pattern = re.compile("|".join(rep.keys()))
    stationName = pattern.sub(lambda m: rep[re.escape(m.group(0))], fireStation.name)
    stationName = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", stationName)
    stationName =  re.sub(' +',' ', stationName)
    stationName = stationName.strip()
    print 'Station Name after word replacement: {0}'.format(stationName)
    
    # This should be close to the queryNearbyDepartments query
    nearby_departments = FireDepartment.objects.filter(headquarters_address__geom__distance_lte=(fireStation.station_address.geom, D(mi=30)))\
    .distance(fireStation.station_address.geom)\
    .extra(select={'dis_name': "select levenshtein(firestation_firedepartment.name, %s)", 'dis_sound': "select similarity(firestation_firedepartment.name, %s)"}, select_params=(stationName,stationName))\
    .order_by('distance','dis_name')

    distanceRatio = 0
    closestDepID = 0
    closestDepName = ''
    if not nearby_departments:
        continue

    
    maxDepartmentScore = list()
    #print 'Nearby Department Count: {0}'.format(nearby_departments.count())
    for n, fireDepartment in enumerate(nearby_departments):
        
        if n == 10:
            break
        departmentDistance = 30
        departmentJurisdiction = 50
        exactName = False

        if fireDepartment.distance is not None:
            departmentDistance = fireDepartment.distance.mi
        else:
            departmentDistance = fireDepartment.headquarters_address.geom.distance(fireStation.station_address.geom) * 0.000621371 
        
       
        #if there is a exact match in name more than likely this is the department
        if fireDepartment.name == stationName:
            closestDepID = fireDepartment.id
            closestDepName = fireDepartment.name
            exactName = True
            break
            
        #The maximum return from levenshtein will be the length of the longer string
        # so to create a true 0-1 ratio find the longer string name
        minName = min(len(stationName),len(fireDepartment.name))
        longestName = max(len(stationName),len(fireDepartment.name))
        minDistance = longestName - minName
        
        #if the strings are the same length then the hamming distance is the upperbound
        #if minName == longestName:
            #longestName = hamming(stationName,fireDepartment.name)
            #if(longestName == 0):
                #longestName = minName
        
        #lower bound of levenshtein is at least difference of strings
        #to create zero to one ratio must subtract minimum distances
        fireDepartment.dis_name -= minDistance
        departmentRatio = ((1 - departmentDistance / 30) * 70) + (1 - fireDepartment.dis_name / longestName) * 50 + (fireDepartment.dis_sound * 20)
        
 
        if departmentRatio >= 135:
            closestDepID = fireDepartment.id
            closestDepName = fireDepartment.name
            fireDepartment.distance = departmentDistance
            maxDepartmentScore.append(fireDepartment)
        #print 'comparing firestation {0} to firedepartment {1}'.format(stationName, fireDepartment.name)
        #print 'department distance: {0} mi'.format(departmentDistance)
        #print 'department name distance: {0}'.format(fireDepartment.dis_name / minName)
        #print 'distance ratio: {0}, department ratio: {1}'.format(distanceRatio, departmentRatio)

        if departmentRatio > distanceRatio:
            distanceRatio = departmentRatio
            smallestDistance = departmentDistance
            smallestNameDistance = fireDepartment.dis_name
            closestDepID = fireDepartment.id
            closestDepName = fireDepartment.name
        
        #print

    #fireStation.department_id = closestDepID
    
    if len(maxDepartmentScore) > 1 and exactName == False:
        print 'More than one max department'
        print '#' * len(title)
        print 'Max Departments'
        print '#' * len(title)
        smallestNameDistance = 0
        for maxDepartment in maxDepartmentScore:
            print 'Department name is {0}, distance: {1}, name distance: {2}, similarity: {3}'.format(maxDepartment.name, maxDepartment.distance, maxDepartment.dis_name,maxDepartment.dis_sound)
            if maxDepartment.dis_sound > smallestNameDistance:
                closestDepID = maxDepartment.id
                closestDepName = maxDepartment.name
                smallestNameDistance = maxDepartment.dis_sound
    
    matchedDict[closestDepName] = matchedDict.get(closestDepName, 0) + 1
    #print '#' * len(title)        
    print 'Matched with: {0}'.format(closestDepName)
    if closestDepID == fireStation.department_id:
        totalMatchedDepartments += 1
        #print 'Correctly Matched Department'
    else:
        totalFalseMatches += 1;
        correctDepartment = FireDepartment.objects.get(id=fireStation.department_id)
        print 'Correct Department is {0}'.format(correctDepartment.name)
        print 'Correct Department location is {0}'.format(correctDepartment.headquarters_address.geom)
        print 'Firestation location is {0}'.format(fireStation.station_address.geom)
        print 'Distance from station is {0}'.format(fireStation.station_address.geom.distance(correctDepartment.headquarters_address.geom) * 0.000621371 )
    #print 'matched:', matchedDict
    #print "#############################################"
    #print
    #print
    

print 'Correctly Matched Department Count: {0}'.format(totalMatchedDepartments)
print 'Incorrectly Matched Department Count: {0}'.format(totalFalseMatches)

#cursor.execute(queryQualityControl)
#qualityControl = dictfetchall(cursor)

#totalStationMatches = 0
#totalIncorrectStations = 0
#for fireDepartment in qualityControl:
    #if matchedDict.get(fireDepartment['fd_name']) != None:
       # if matchedDict[fireDepartment['fd_name']] == fireDepartment['num_stations']:
        #    totalStationMatches += 1
        #else:
         #   totalIncorrectStations += 1        
#print "Station Matches:", totalStationMatches
#print "Incorrect Station Counts:", totalIncorrectStations
     

Number of unmatched fire stations: 50723
Unmatched Stations with Geometry: 1847

####################################################################################################
# Searching for departmens for Station: Anchorage Fire Department Battalion 1 North Side Station 1 #
Station Name after word replacement: Anchorage Fire Department Battalion North Side
Matched with: Anchorage Fire Department
####################################################################################################
# Searching for departmens for Station: Anchorage Fire Department Battalion 1 North Side Station 3 #
Station Name after word replacement: Anchorage Fire Department Battalion North Side
Matched with: Anchorage Fire Department
####################################################################################################
# Searching for departmens for Station: Anchorage Fire Department Battalion 1 North Side Station 5 #
Station Name after word replacement: Anchorage Fire Department Ba

In [None]:
import sys
sys.path.append("../../")

import django
import pylab as pl

from django.db import connections
from firecares.firestation.models import FireDepartment, FireStation
from firecares.utils import dictfetchall
from pandas import DataFrame

%matplotlib inline

django.setup()

# Report how many stations currently have no department assigned.
print "Number of unmatched fire stations:", FireStation.objects.filter(department__isnull=True).count()

# Raw DB-API cursor on the 'default' Django connection; every query in this
# cell is executed through it.
cursor = connections['default'].cursor()

# Raw SQL: stations that have a geometry but no department. The station
# location is returned as WKT after transforming to SRID 900913.
queryUnmatchedStations = """
select a.name as firestation_name, 
b.usgsstructuredata_ptr_id as firestation_id, 
b.department_id as dep_id, 
ST_ASTEXT(ST_TRANSFORM(d.geom,900913)) as firestation_location
from firestation_usgsstructuredata a
inner join firestation_firestation b
  on a.id=b.usgsstructuredata_ptr_id
join firecares_core_address d 
  on b.station_address_id=d.id
where b.department_id is null and d.geom is not null
"""
# Raw SQL: candidate departments for one station. Keeps departments whose
# headquarters is within 100 miles of the station (the `<= 100` filter; the
# 0.000621371 factor converts metres to miles) and whose name is within a
# levenshtein distance of 30 of the station name.
# Placeholders, in order: station WKT, station name, station WKT, station WKT,
# station name.
queryNearbyDepartments = """
select c.name as department_name, c.id as dep_id,
ST_DISTANCE(ST_TRANSFORM(e.geom,900913),ST_GEOMFROMTEXT(%s,900913)) * 0.000621371 as dis_miles,
levenshtein(c.name,%s) as dis_name,
ST_INTERSECTS(ST_TRANSFORM(c.geom,900913),ST_GEOMFROMTEXT(%s,900913)) as dep_intersects
from firestation_firedepartment c 
inner join firecares_core_address e 
  on c.headquarters_address_id=e.id
where e.geom is not null and
ST_DISTANCE(ST_TRANSFORM(e.geom,900913),ST_GEOMFROMTEXT(%s,900913)) * 0.000621371 <= 100 and
levenshtein(c.name,%s) <= 30; 
"""

# Raw SQL: assign a department to a station.
# Placeholders, in order: department id, station usgsstructuredata_ptr_id.
# NOTE(review): defined here but not executed anywhere in this cell.
queryUpdateFireStation = """
update firestation_firestation 
set department_id = %s
where firestation_firestation.usgsstructuredata_ptr_id = %s
"""

# Raw SQL: department name plus the station count recorded in
# usfa_census_national, joined on state and fdid.
queryQualityControl = """
select fd.name as fd_name, usfa.number_of_stations as num_stations 
from firestation_firedepartment fd 
inner join usfa_census_national usfa 
 on fd.state=usfa."hq state" and fd.fdid=usfa.fdid;
"""

cursor.execute(queryUnmatchedStations)
fireStations = dictfetchall(cursor)
print "Number of Stations:", cursor.rowcount

matchedDict = dict()
for fireStation in fireStations:
    params = [fireStation['firestation_location'],
              fireStation['firestation_name'],
              fireStation['firestation_location'],
              fireStation['firestation_location'],
              fireStation['firestation_name']]
    cursor.execute(queryNearbyDepartments,params)
    nearbyDepartments = dictfetchall(cursor)
    distanceRatio = 0
    closestDepID = 0
    closestDepName = ''
    if cursor.rowcount == 0:
        continue
    for fireDepartment in nearbyDepartments:
        departmentRatio = 1 + (1 - fireDepartment['dis_miles'] / 100) * 50  + (1 - fireDepartment['dis_name'] / 30) * 50
        if fireDepartment['dep_intersects'] == True:
            departmentRatio = departmentRatio * 125            
        if departmentRatio > distanceRatio:
            distanceRatio = departmentRatio
            closestDepID = fireDepartment['dep_id']
            closestDepName = fireDepartment['department_name']
    fireStation['dep_id'] = closestDepID
    if matchedDict.get(closestDepName) == None:
        matchedDict[closestDepName] = 1
    else:
        matchedDict[closestDepName] = matchedDict[closestDepName] + 1

cursor.execute(queryQualityControl)
qualityControl = dictfetchall(cursor)

totalStationMatches = 0
totalIncorrectStations = 0
for fireDepartment in qualityControl:
    if matchedDict.get(fireDepartment['fd_name']) != None:
        if matchedDict[fireDepartment['fd_name']] == fireDepartment['num_stations']:
            totalStationMatches += 1
        else:
            totalIncorrectStations += 1
            
print "Station Matches:", totalStationMatches
print "Incorrect Station Counts:", totalIncorrectStations
     

In [None]:
cursor = connections['default'].cursor()
query = """
select c.name as department_name, a.name as department_name, levenshtein(c.name, replace(a.name, 'Station', '')) 
from firestation_usgsstructuredata a 
inner join firestation_firestation b
  on a.id=b.usgsstructuredata_ptr_id 
join firestation_firedepartment c 
  on b.department_id=c.id;
"""
cursor.execute(query)
results = dictfetchall(cursor)
df = DataFrame(results)

# distribution of levenshtein distances
pl.xlabel("Levenshtein Distance")
pl.ylabel("Count of records")
print df['levenshtein'].hist()

print df['levenshtein'].describe()



In [None]:
cursor = connections['default'].cursor()
query = """
select c.name as department_name, a.name as department_name, levenshtein(c.name, a.name) 
from firestation_usgsstructuredata a 
inner join firestation_firestation b
  on a.id=b.usgsstructuredata_ptr_id 
join firestation_firedepartment c 
  on b.department_id=c.id;
"""
cursor.execute(query)
results = dictfetchall(cursor)
df = DataFrame(results)
print df['levenshtein'].hist()
print df['levenshtein'].describe()


In [None]:
### replace 'Station' and numeric characters
cursor = connections['default'].cursor()
query = """
select c.name as department_name, a.name as department_name, levenshtein(c.name, substring(replace(a.name, 'Station', '') from '^[a-zA-Z -]+')) 
from firestation_usgsstructuredata a 
inner join firestation_firestation b
  on a.id=b.usgsstructuredata_ptr_id 
join firestation_firedepartment c 
  on b.department_id=c.id;
"""
cursor.execute(query)
results = dictfetchall(cursor)
df = DataFrame(results)

# distribution of levenshtein distances
pl.xlabel("Levenshtein Distance")
pl.ylabel("Count of records")
print df['levenshtein'].hist()
print df['levenshtein'].describe()


In [None]:
query = """
select c.name as department_name, a.name as department_name, levenshtein(c.name, a.name), ST_DISTANCE(ST_Transform(e.geom, 900913), ST_Transform(d.geom, 900913)) as dis 
from firestation_usgsstructuredata a 
inner join firestation_firestation b 
  on a.id=b.usgsstructuredata_ptr_id 
join firestation_firedepartment c 
  on b.department_id=c.id 
join firecares_core_address d 
  on b.station_address_id=d.id
join firecares_core_address e 
  on c.headquarters_address_id=e.id
where e.geom is not null and c.geom is not null;  
  ;
"""
cursor.execute(query)
results = dictfetchall(cursor)
df = DataFrame(results)

# distribution of levenshtein distances
pl.suptitle("Distribution of distance from headquarters address")
pl.xlabel("Distance from headquarters address")
pl.ylabel("Count of records")

# convert meters to miles.
df['dis_mi'] = df['dis'] * 0.000621371
print df['dis_mi'].hist()
print df['dis_mi'].describe()