# Purpose of this program 
1. Calculate the distances between Sources and Outcomes.
2. Determine the closest Source to each Outcome.
3. Assign that closest Source to the Outcome.

We will be using Haversine formula to calculate the distance between the Sources and Outcomes.

We limit the search for the closest Source to the Sources located in the same state as the Outcome (the join in this notebook is made on `StateAbbr`).

This program will be used on the Superfund, Cumulative Risk, and Air Quality data.


In [1]:
import pandas as pd
# Import the Numpy dependency
import numpy as np
# Import the Haversine formula package.  Calculates distances using lat & lon
import haversine as hs
from haversine import Unit

Bring in the Outcome data (500 Cities), keeping only the locational information for now.

In [2]:
# Load the cleaned 500 Cities (outcome) table and preview its first rows
city_file_path = "../ProcessedData/500_City_cleaned.csv"
citydf = pd.read_csv(filepath_or_buffer=city_file_path)
citydf.head(n=5)

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,Population2010,Insurance,HighBloodPressure,Cancer,Asthma,HeartDisease,AnnualCheckUps,Smokes,MentalHealthIssues,Latitude,Longitude,newFIPS,CountyFIPS
0,AL,Birmingham,107000,1073003200,0107000-01073003200,931.0,26.8,57.0,6.0,14.4,11.1,78.2,29.6,22.0,33.509402,-86.885908,1073,1073
1,AL,Birmingham,107000,1073003300,0107000-01073003300,947.0,21.4,55.2,6.8,12.9,9.7,80.1,22.6,16.7,33.517126,-86.891382,1073,1073
2,AL,Birmingham,107000,1073010500,0107000-01073010500,114.0,23.9,60.5,7.1,13.6,11.1,80.7,26.5,18.6,33.436379,-86.912892,1073,1073
3,AL,Birmingham,107000,1073010701,0107000-01073010701,74.0,19.8,24.9,2.7,11.2,2.4,66.2,18.1,16.1,33.473886,-86.814649,1073,1073
4,AL,Birmingham,107000,1073010801,0107000-01073010801,168.0,7.3,33.0,7.4,8.9,4.5,73.7,8.5,8.8,33.514098,-86.746697,1073,1073


In [3]:
# Keep only the identifying and locational columns of each tract.
# .copy() makes the subset an independent frame, so the column that a later
# cell adds to citydf is not an assignment on a slice of the loaded table
# (this is the pattern pandas flags with SettingWithCopyWarning).
citydf = citydf[['StateAbbr','PlaceName','PlaceFIPS','TractFIPS','Place_TractID',
                 'CountyFIPS','Latitude','Longitude']].copy()

In [4]:
# Build a "<PlaceName>_<StateAbbr>" label for every tract
citydf['city_state'] = citydf['PlaceName'] + "_" + citydf['StateAbbr']

In [5]:
# Summary statistics per Place_TractID; the count column shows how many rows share each id
citydf.groupby('Place_TractID').describe()

Unnamed: 0_level_0,PlaceFIPS,PlaceFIPS,PlaceFIPS,PlaceFIPS,PlaceFIPS,PlaceFIPS,PlaceFIPS,PlaceFIPS,TractFIPS,TractFIPS,...,Latitude,Latitude,Longitude,Longitude,Longitude,Longitude,Longitude,Longitude,Longitude,Longitude
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Place_TractID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0107000-01073003200,1.0,107000.0,,107000.0,107000.0,107000.0,107000.0,107000.0,1.0,1.073003e+09,...,33.509402,33.509402,1.0,-86.885908,,-86.885908,-86.885908,-86.885908,-86.885908,-86.885908
0107000-01073003300,1.0,107000.0,,107000.0,107000.0,107000.0,107000.0,107000.0,1.0,1.073003e+09,...,33.517126,33.517126,1.0,-86.891382,,-86.891382,-86.891382,-86.891382,-86.891382,-86.891382
0107000-01073010500,1.0,107000.0,,107000.0,107000.0,107000.0,107000.0,107000.0,1.0,1.073010e+09,...,33.436379,33.436379,1.0,-86.912892,,-86.912892,-86.912892,-86.912892,-86.912892,-86.912892
0107000-01073010701,1.0,107000.0,,107000.0,107000.0,107000.0,107000.0,107000.0,1.0,1.073011e+09,...,33.473886,33.473886,1.0,-86.814649,,-86.814649,-86.814649,-86.814649,-86.814649,-86.814649
0107000-01073010801,1.0,107000.0,,107000.0,107000.0,107000.0,107000.0,107000.0,1.0,1.073011e+09,...,33.514098,33.514098,1.0,-86.746697,,-86.746697,-86.746697,-86.746697,-86.746697,-86.746697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5566000-55101001505,1.0,5566000.0,,5566000.0,5566000.0,5566000.0,5566000.0,5566000.0,1.0,5.510100e+10,...,42.771772,42.771772,1.0,-87.785185,,-87.785185,-87.785185,-87.785185,-87.785185,-87.785185
5584250-55133202902,1.0,5584250.0,,5584250.0,5584250.0,5584250.0,5584250.0,5584250.0,1.0,5.513320e+10,...,43.022614,43.022614,1.0,-88.193942,,-88.193942,-88.193942,-88.193942,-88.193942,-88.193942
5584250-55133203804,1.0,5584250.0,,5584250.0,5584250.0,5584250.0,5584250.0,5584250.0,1.0,5.513320e+10,...,43.016192,43.016192,1.0,-88.295400,,-88.295400,-88.295400,-88.295400,-88.295400,-88.295400
5613900-56021000402,1.0,5613900.0,,5613900.0,5613900.0,5613900.0,5613900.0,5613900.0,1.0,5.602100e+10,...,41.120901,41.120901,1.0,-104.773244,,-104.773244,-104.773244,-104.773244,-104.773244,-104.773244


Reading in Source and limiting it to select cities

In [6]:
# Load the cleaned air-quality (source) table and preview its first rows
input_file_path = "../ProcessedData/AirQuality_cleaned.csv"
aqdf = pd.read_csv(filepath_or_buffer=input_file_path)
aqdf.head(n=5)

Unnamed: 0,SiteId,Latitude,Longitude,Sample Duration,PM2.5_Exceptional,PM2.5_ExceedCount,PM2.5_Max,State,County,City,index,Ozone_Exceptional,Ozone_ExceedCount,Ozone_Max,StateAbbr,city_state
0,11,33.553056,-86.815,24-HR BLK AVG,0,0.0,26.6,Alabama,Jefferson,Birmingham,69,0.0,1.0,0.078,AL,Birmingham_AL
1,15,33.499722,-86.924167,24 HOUR,0,0.0,22.0,Alabama,Jefferson,Birmingham,306,,,,AL,Birmingham_AL
2,17,33.521427,-86.844112,24 HOUR,0,0.0,23.1,Alabama,Jefferson,Birmingham,411,,,,AL,Birmingham_AL
3,25,34.68547,-86.58816,24 HOUR,0,0.0,18.5,Alabama,Madison,Huntsville,500,0.0,0.0,0.07,AL,Huntsville_AL
4,29,32.412811,-86.263394,24 HOUR,0,0.0,20.8,Alabama,Montgomery,Montgomery,564,0.0,0.0,0.06,AL,Montgomery_AL


In [7]:
# Keep only the site identifier, its state abbreviation and its coordinates
aqdf2 = aqdf.loc[:, ['StateAbbr', 'SiteId', 'Longitude', 'Latitude']]

In [8]:
# Summary statistics of the numeric columns of the site subset
aqdf2.describe()

Unnamed: 0,SiteId,Longitude,Latitude
count,324.0,324.0,324.0
mean,1188.222222,-97.489493,37.482421
std,776.866892,16.597165,5.278893
min,11.0,-157.871171,21.303382
25%,481.0,-112.07526,33.517029
50%,1165.5,-93.719149,37.875397
75%,1846.0,-83.912782,41.116006
max,2595.0,-71.0543,61.3267


Joining data on StateAbbr (each site is paired with every tract in the same state)

In [9]:
# Pair every air-quality site with every tract that shares its state abbreviation,
# then discard rows that lack a state or a tract-side city_state label
outerdf = pd.merge(aqdf2, citydf, how='outer', left_on='StateAbbr', right_on='StateAbbr')
outerdf = outerdf.dropna(subset=['StateAbbr', 'city_state'])
outerdf.describe()

Unnamed: 0,SiteId,Longitude_x,Latitude_x,PlaceFIPS,TractFIPS,CountyFIPS,Latitude_y,Longitude_y
count,23560.0,23560.0,23560.0,23566.0,23566.0,23566.0,23566.0,23566.0
mean,955.969652,-105.275691,35.141522,2074561.0,20426220000.0,20426.033523,34.766271,-104.640982
std,822.157189,15.701804,4.373967,1815838.0,18239150000.0,18239.166553,4.202529,15.34309
min,11.0,-157.871171,21.303382,15003.0,1073003000.0,1073.0,21.281244,-158.112465
25%,304.0,-119.691218,32.774262,644000.0,6059076000.0,6059.0,32.776174,-117.984959
50%,441.0,-111.872222,34.669739,684200.0,6099004000.0,6099.0,33.988256,-111.641931
75%,1817.0,-92.042908,37.936013,3921000.0,39151710000.0,39151.0,37.70411,-91.620675
max,2595.0,-71.0543,61.3267,5613900.0,56021000000.0,56021.0,61.217709,-70.965134


In [10]:
# Preview the site/tract pairs; *_x columns come from the site table, *_y from the tract table
outerdf.head()

Unnamed: 0,StateAbbr,SiteId,Longitude_x,Latitude_x,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state
0,AL,11.0,-86.815,33.553056,Birmingham,107000,1073003200,0107000-01073003200,1073,33.509402,-86.885908,Birmingham_AL
1,AL,11.0,-86.815,33.553056,Birmingham,107000,1073003300,0107000-01073003300,1073,33.517126,-86.891382,Birmingham_AL
2,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010500,0107000-01073010500,1073,33.436379,-86.912892,Birmingham_AL
3,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010701,0107000-01073010701,1073,33.473886,-86.814649,Birmingham_AL
4,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010801,0107000-01073010801,1073,33.514098,-86.746697,Birmingham_AL


In [11]:
# Pack each (latitude, longitude) pair into a tuple: the *_x columns belong to
# the air-quality site, the *_y columns to the census tract
outerdf['Sourcecoor'] = [*zip(outerdf['Latitude_x'].tolist(), outerdf['Longitude_x'].tolist())]
outerdf['Tractcoor'] = [*zip(outerdf['Latitude_y'].tolist(), outerdf['Longitude_y'].tolist())]

In [12]:
# Function from https://github.com/ashutoshb418/Foodies-Visualization/blob/master/Foodies_Chain.ipynb
def distance_from(loc1, loc2):
    """Return the haversine distance between two points, rounded to 5 decimals.

    loc1, loc2 -- coordinates handed unchanged to hs.haversine; the caller in
    this notebook supplies (latitude, longitude) tuples.

    The result is expressed in hs.haversine's default unit (kilometres
    according to the haversine package documentation -- TODO confirm for the
    installed version).
    """
    return round(hs.haversine(loc1, loc2), 5)

In [13]:
# Distance between the site and the tract of every row, in row order
outerdf['dist'] = [
    distance_from(source, tract)
    for source, tract in zip(outerdf['Sourcecoor'], outerdf['Tractcoor'])
]

In [14]:
# Smallest site distance found for each tract (rows are grouped by Place_TractID),
# broadcast back onto every row of that tract
outerdf['mindist'] = outerdf.groupby('Place_TractID')['dist'].transform('min')
outerdf.head()

Unnamed: 0,StateAbbr,SiteId,Longitude_x,Latitude_x,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state,Sourcecoor,Tractcoor,dist,mindist
0,AL,11.0,-86.815,33.553056,Birmingham,107000,1073003200,0107000-01073003200,1073,33.509402,-86.885908,Birmingham_AL,"(33.553056, -86.815)","(33.5094018502, -86.8859081961)",8.17071,3.70702
1,AL,11.0,-86.815,33.553056,Birmingham,107000,1073003300,0107000-01073003300,1073,33.517126,-86.891382,Birmingham_AL,"(33.553056, -86.815)","(33.5171261108, -86.8913819749)",8.12909,3.60344
2,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010500,0107000-01073010500,1073,33.436379,-86.912892,Birmingham_AL,"(33.553056, -86.815)","(33.4363786806, -86.9128923072)",15.83428,7.12068
3,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010701,0107000-01073010701,1073,33.473886,-86.814649,Birmingham_AL,"(33.553056, -86.815)","(33.473886155, -86.8146487762)",8.80336,5.95055
4,AL,11.0,-86.815,33.553056,Birmingham,107000,1073010801,0107000-01073010801,1073,33.514098,-86.746697,Birmingham_AL,"(33.553056, -86.815)","(33.514097853, -86.7466971362)",7.67109,7.67109


In [15]:
# Keep, for every tract, only the row(s) whose site lies at the minimum distance
df = outerdf.loc[outerdf['dist'].eq(outerdf['mindist'])]


In [16]:
# The site-side coordinate columns are not carried any further
df = df.drop(columns=['Sourcecoor', 'Longitude_x', 'Latitude_x'])
# Report the number of fully duplicated rows, remove them, then confirm none remain
print(f"Duplicate entries: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 88
Duplicate entries: 0


In [17]:
# Reduce the table to the tract -> assigned site pairs
df2 = df.loc[:, ['Place_TractID', 'SiteId']]
df2.describe()

Unnamed: 0,SiteId
count,1372.0
mean,1202.796647
std,788.360509
min,11.0
25%,417.0
50%,1166.0
75%,1943.0
max,2595.0


In [18]:
# Count repeated (Place_TractID, SiteId) pairs, drop them, then confirm none remain
print(f"Duplicate entries: {df2.duplicated().sum()}")
df2 = df2.drop_duplicates()
print(f"Duplicate entries: {df2.duplicated().sum()}")

Duplicate entries: 0
Duplicate entries: 0


In [19]:
# Re-attach the tract's descriptive columns to each tract -> site pair
city = pd.merge(df2, citydf, how='left', on='Place_TractID')
city.head()


Unnamed: 0,Place_TractID,SiteId,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,CountyFIPS,Latitude,Longitude,city_state
0,0107000-01073010801,11.0,AL,Birmingham,107000,1073010801,1073,33.514098,-86.746697,Birmingham_AL
1,0107000-01073010803,11.0,AL,Birmingham,107000,1073010803,1073,33.522909,-86.710262,Birmingham_AL
2,0107000-01073010805,11.0,AL,Birmingham,107000,1073010805,1073,33.495279,-86.698718,Birmingham_AL
3,0107000-01073011207,11.0,AL,Birmingham,107000,1073011207,1073,33.671885,-86.677251,Birmingham_AL
4,0107000-01073011803,11.0,AL,Birmingham,107000,1073011803,1073,33.625258,-86.699861,Birmingham_AL


In [20]:
# Attach the measurements of the assigned site to every tract row.
# StateAbbr and city_state exist on both sides, so pandas suffixes them _x (tract) / _y (site).
AQ = pd.merge(
    city[['StateAbbr','PlaceName','city_state','PlaceFIPS','TractFIPS','CountyFIPS','Place_TractID','SiteId']],
    aqdf,
    how='left',
    on='SiteId',
)
AQ.columns

Index(['StateAbbr_x', 'PlaceName', 'city_state_x', 'PlaceFIPS', 'TractFIPS',
       'CountyFIPS', 'Place_TractID', 'SiteId', 'Latitude', 'Longitude',
       'Sample Duration', 'PM2.5_Exceptional', 'PM2.5_ExceedCount',
       'PM2.5_Max', 'State', 'County', 'City', 'index', 'Ozone_Exceptional',
       'Ozone_ExceedCount', 'Ozone_Max', 'StateAbbr_y', 'city_state_y'],
      dtype='object')

In [21]:
# Report how many rows of the merged table are exact repeats, then keep one copy of each
print(f"Duplicate entries: {AQ.duplicated().sum()}")
finalAQ = AQ.drop_duplicates()

Duplicate entries: 48


In [22]:
# Remove the site-side copies of the columns present on both sides of the merge
newdf = finalAQ.drop(columns=['StateAbbr_y', 'city_state_y'])

# Give the tract-side copies their plain names back
df = newdf.rename(columns={'StateAbbr_x': 'StateAbbr', 'city_state_x': 'city_state'})

# Identifying columns first, followed by the PM2.5 and ozone measurements
finaldf = df.loc[:, ['StateAbbr','PlaceName','city_state','PlaceFIPS','TractFIPS',
                     'CountyFIPS','Place_TractID','SiteId', 'PM2.5_Exceptional',
                     'PM2.5_ExceedCount', 'PM2.5_Max','Ozone_Exceptional',
                     'Ozone_ExceedCount', 'Ozone_Max']]

finaldf.columns

Index(['StateAbbr', 'PlaceName', 'city_state', 'PlaceFIPS', 'TractFIPS',
       'CountyFIPS', 'Place_TractID', 'SiteId', 'PM2.5_Exceptional',
       'PM2.5_ExceedCount', 'PM2.5_Max', 'Ozone_Exceptional',
       'Ozone_ExceedCount', 'Ozone_Max'],
      dtype='object')

In [28]:
# Keep exactly one row per tract.
# groupby(...).last() takes the last *non-null* value of each column separately,
# so a tract that still has several rows could end up with measurements stitched
# together from different rows. drop_duplicates(keep='last') keeps the last row
# of each tract whole instead.
finaldf = (
    finaldf
    .dropna(subset=['Place_TractID'])      # groupby ignored rows without a key
    .drop_duplicates(subset='Place_TractID', keep='last')
    .sort_values('Place_TractID')          # groupby returned the tracts in sorted order
    .reset_index(drop=True)
)
# Same column layout as before: the tract id first, the other columns in their existing order
finaldf = finaldf[['Place_TractID'] + [col for col in finaldf.columns if col != 'Place_TractID']]
finaldf.head()

Unnamed: 0,Place_TractID,StateAbbr,PlaceName,city_state,PlaceFIPS,TractFIPS,CountyFIPS,SiteId,PM2.5_Exceptional,PM2.5_ExceedCount,PM2.5_Max,Ozone_Exceptional,Ozone_ExceedCount,Ozone_Max
0,0107000-01073003200,AL,Birmingham,Birmingham_AL,107000,1073003200,1073,15.0,0,0.0,22.0,,,
1,0107000-01073003300,AL,Birmingham,Birmingham_AL,107000,1073003300,1073,15.0,0,0.0,22.0,,,
2,0107000-01073010500,AL,Birmingham,Birmingham_AL,107000,1073010500,1073,15.0,0,0.0,22.0,,,
3,0107000-01073010701,AL,Birmingham,Birmingham_AL,107000,1073010701,1073,17.0,0,0.0,23.1,,,
4,0107000-01073010801,AL,Birmingham,Birmingham_AL,107000,1073010801,1073,11.0,0,0.0,26.6,0.0,1.0,0.078


In [31]:
# Saving Cleaned data into csv
# index=False leaves the DataFrame's row index out of the written file
output_file_path = "../ProcessedData/FinalAirQuality.csv"
finaldf.to_csv(output_file_path, index=False)

end of code

In [30]:
# Check that no census tract (Place_TractID) appears more than once in the final table
Place_TractIDdf = finaldf['Place_TractID']
print(f"Duplicate entries: {Place_TractIDdf.duplicated().sum()}")

Duplicate entries: 0
