# Purpose of this program 
1. Calculate the distances between Sources and Outcomes
2. Determine to closest Source to Outcome
3. Assign that Source to the closest Outcome.

We will be using Haversine formula to calculate the distance between the Sources and Outcomes.

We will be limiting the search for the closest to those Sources that are in the same City as the Outcomes.

This program will be used on both the Superfund, Cumlivative Risk, and Air Quality data.


In [1]:
import pandas as pd
# Import the Numpy dependency
import numpy as np
# Import the Haversine formula package.  Calculates distances using lat & log
import haversine as hs
from haversine import Unit

Bring in Outcome data (500 City) only keeping locational information for now

In [2]:
# Bringing in 500 city data 
city_file_path = "../ProcessedData/500_City_cleaned.csv"
citydf = pd.read_csv(city_file_path)
citydf.head()

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,Population2010,Insurance,HighBloodPressure,Cancer,Asthma,HeartDisease,AnnualCheckUps,Smokes,MentalHealthIssues,Latitude,Longitude,newFIPS,CountyFIPS
0,AL,Birmingham,107000,1073003200,0107000-01073003200,931.0,26.8,57.0,6.0,14.4,11.1,78.2,29.6,22.0,33.509402,-86.885908,1073,1073
1,AL,Birmingham,107000,1073003300,0107000-01073003300,947.0,21.4,55.2,6.8,12.9,9.7,80.1,22.6,16.7,33.517126,-86.891382,1073,1073
2,AL,Birmingham,107000,1073010500,0107000-01073010500,114.0,23.9,60.5,7.1,13.6,11.1,80.7,26.5,18.6,33.436379,-86.912892,1073,1073
3,AL,Birmingham,107000,1073010701,0107000-01073010701,74.0,19.8,24.9,2.7,11.2,2.4,66.2,18.1,16.1,33.473886,-86.814649,1073,1073
4,AL,Birmingham,107000,1073010801,0107000-01073010801,168.0,7.3,33.0,7.4,8.9,4.5,73.7,8.5,8.8,33.514098,-86.746697,1073,1073


In [3]:
citydf = citydf[['StateAbbr','PlaceName','PlaceFIPS','TractFIPS','Place_TractID',
                 'CountyFIPS','Latitude','Longitude']]

In [4]:
citydf['city_state'] = citydf.PlaceName + "_" +  citydf.StateAbbr

Reading in Source and limiting it to select cities

In [5]:
# Saving Cleaned data into csv
output_file_path = "../ProcessedData/AirQuality_cleaned.csv"
aqdf = pd.read_csv(output_file_path)
aqdf.head()

Unnamed: 0,Site Num,Latitude,Longitude,PM2.5_Exceptional,PM2.5_ExceedCount,PM2.5_Max,State,County,City,Ozone_Exceptional,Ozone_ExceedCount,Ozone_Max,StateAbbr,city_state
0,23,33.553056,-86.815,0,0.0,22.7,Alabama,Jefferson,Birmingham,0.0,1.0,0.078,AL,Birmingham_AL
1,23,33.553056,-86.815,0,0.0,22.7,Alabama,Jefferson,Birmingham,0.0,0.0,0.063,AL,Birmingham_AL
2,23,33.553056,-86.815,0,0.0,22.7,Alabama,Jefferson,Birmingham,0.0,2.0,0.076,AL,Birmingham_AL
3,23,33.553056,-86.815,0,0.0,22.7,Alabama,Jefferson,Birmingham,0.0,2.0,0.071,AL,Birmingham_AL
4,23,33.553056,-86.815,0,0.0,22.7,Alabama,Jefferson,Birmingham,0.0,4.0,0.078,AL,Birmingham_AL


In [6]:
aqdf2 = aqdf[['StateAbbr','Site Num', 'Longitude', 'Latitude']]

In [7]:
aqdf2.describe()

Unnamed: 0,Site Num,Longitude,Latitude
count,8279.0,8279.0,8279.0
mean,148.431574,-99.045964,38.453421
std,641.162964,16.556488,4.511202
min,1.0,-157.871171,21.303382
25%,4.0,-111.975524,35.614131
50%,7.0,-95.38769,39.741694
75%,14.0,-85.387908,41.206321
max,9997.0,-71.0543,61.3267


Joining data on County_FIPS

In [8]:
outerdf = aqdf2.merge(citydf, how = 'outer', left_on="StateAbbr",  right_on = 'StateAbbr')
outerdf = outerdf.dropna(subset = ['StateAbbr'])
outerdf = outerdf.dropna(subset = ['city_state'])
outerdf.describe()

Unnamed: 0,Site Num,Longitude_x,Latitude_x,PlaceFIPS,TractFIPS,CountyFIPS,Latitude_y,Longitude_y
count,598171.0,598171.0,598171.0,598177.0,598177.0,598177.0,598177.0,598177.0
mean,199.77587,-111.179071,36.573391,1366339.0,13304000000.0,13303.803742,35.441696,-109.99385
std,714.684229,14.9865,3.613219,1417599.0,14218190000.0,14218.210867,3.489228,14.306046
min,1.0,-157.871171,21.303382,15003.0,1073003000.0,1073.0,21.281244,-158.112465
25%,5.0,-121.849783,34.10002,633182.0,6037980000.0,6037.0,33.749172,-118.488619
50%,7.0,-119.691218,37.482934,664000.0,6071009000.0,6071.0,34.109169,-117.506268
75%,12.0,-97.712891,38.102507,1269700.0,12099010000.0,12099.0,37.768917,-97.611118
max,9997.0,-71.0543,61.3267,5613900.0,56021000000.0,56021.0,61.217709,-70.965134


In [9]:
outerdf.head()

Unnamed: 0,StateAbbr,Site Num,Longitude_x,Latitude_x,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state
0,AL,23.0,-86.815,33.553056,Birmingham,107000,1073003200,0107000-01073003200,1073,33.509402,-86.885908,Birmingham_AL
1,AL,23.0,-86.815,33.553056,Birmingham,107000,1073003300,0107000-01073003300,1073,33.517126,-86.891382,Birmingham_AL
2,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010500,0107000-01073010500,1073,33.436379,-86.912892,Birmingham_AL
3,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010701,0107000-01073010701,1073,33.473886,-86.814649,Birmingham_AL
4,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010801,0107000-01073010801,1073,33.514098,-86.746697,Birmingham_AL


In [10]:
outerdf['Sourcecoor'] = list(zip(outerdf.Latitude_x, outerdf.Longitude_x))
outerdf['Tractcoor'] = list(zip(outerdf.Latitude_y, outerdf.Longitude_y))

In [11]:
# Function from https://github.com/ashutoshb418/Foodies-Visualization/blob/master/Foodies_Chain.ipynb
def distance_from(loc1,loc2): 
    dist=hs.haversine(loc1,loc2)
    return round(dist,5)

In [12]:
# Calculate distances
outerdf['dist']=outerdf.apply(lambda row: distance_from(row.Sourcecoor,row.Tractcoor), axis = 1)

In [13]:
# Determine the min for each Superfund site
outerdf['mindist'] = outerdf.groupby(outerdf.Place_TractID).dist.transform('min')
outerdf.head()

Unnamed: 0,StateAbbr,Site Num,Longitude_x,Latitude_x,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state,Sourcecoor,Tractcoor,dist,mindist
0,AL,23.0,-86.815,33.553056,Birmingham,107000,1073003200,0107000-01073003200,1073,33.509402,-86.885908,Birmingham_AL,"(33.553056, -86.815)","(33.5094018502, -86.8859081961)",8.17071,3.70702
1,AL,23.0,-86.815,33.553056,Birmingham,107000,1073003300,0107000-01073003300,1073,33.517126,-86.891382,Birmingham_AL,"(33.553056, -86.815)","(33.5171261108, -86.8913819749)",8.12909,3.60344
2,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010500,0107000-01073010500,1073,33.436379,-86.912892,Birmingham_AL,"(33.553056, -86.815)","(33.4363786806, -86.9128923072)",15.83428,7.12068
3,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010701,0107000-01073010701,1073,33.473886,-86.814649,Birmingham_AL,"(33.553056, -86.815)","(33.473886155, -86.8146487762)",8.80336,5.95055
4,AL,23.0,-86.815,33.553056,Birmingham,107000,1073010801,0107000-01073010801,1073,33.514098,-86.746697,Birmingham_AL,"(33.553056, -86.815)","(33.514097853, -86.7466971362)",7.67109,7.67109


In [14]:
# Limit to the closest site
df = outerdf[outerdf.dist == outerdf.mindist]


In [15]:
df = df.drop(['Sourcecoor','Longitude_x','Latitude_x'], axis=1)
# Find duplicates entries
print(f"Duplicate entries: {df.duplicated().sum()}")
df = df.drop_duplicates()
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 34242
Duplicate entries: 0


In [16]:
df2 = df[['Place_TractID','Site Num']]
df2.describe()

Unnamed: 0,Site Num
count,1372.0
mean,712.574344
std,1698.647862
min,1.0
25%,16.0
50%,38.0
75%,1003.0
max,9812.0


In [17]:
AQ = df2.merge(citydf, on = 'Place_TractID', how = 'left')
AQ.head()

Unnamed: 0,Place_TractID,Site Num,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,CountyFIPS,Latitude,Longitude,city_state
0,0107000-01073010801,23.0,AL,Birmingham,107000,1073010801,1073,33.514098,-86.746697,Birmingham_AL
1,0107000-01073010803,23.0,AL,Birmingham,107000,1073010803,1073,33.522909,-86.710262,Birmingham_AL
2,0107000-01073010805,23.0,AL,Birmingham,107000,1073010805,1073,33.495279,-86.698718,Birmingham_AL
3,0107000-01073011207,23.0,AL,Birmingham,107000,1073011207,1073,33.671885,-86.677251,Birmingham_AL
4,0107000-01073011803,23.0,AL,Birmingham,107000,1073011803,1073,33.625258,-86.699861,Birmingham_AL


In [23]:
finalAQ = AQ[['StateAbbr','PlaceName','city_state','PlaceFIPS','TractFIPS','CountyFIPS','Place_TractID','Site Num']].merge(aqdf, on='Site Num', how = 'left')
finalAQ.columns

Index(['StateAbbr_x', 'PlaceName', 'city_state_x', 'PlaceFIPS', 'TractFIPS',
       'CountyFIPS', 'Place_TractID', 'Site Num', 'Latitude', 'Longitude',
       'PM2.5_Exceptional', 'PM2.5_ExceedCount', 'PM2.5_Max', 'State',
       'County', 'City', 'Ozone_Exceptional', 'Ozone_ExceedCount', 'Ozone_Max',
       'StateAbbr_y', 'city_state_y'],
      dtype='object')

In [24]:
# Drop repeat columns
newdf = finalAQ.drop(['StateAbbr_y', 'city_state_y'], axis=1)

# Rename
df = newdf.rename(columns={'StateAbbr_x': 'StateAbbr',
    'city_state_x': 'city_state'})

# reorder

finaldf = df[['StateAbbr','PlaceName','city_state','PlaceFIPS','TractFIPS',
              'CountyFIPS','Place_TractID','Site Num', 'PM2.5_Exceptional', 
              'PM2.5_ExceedCount', 'PM2.5_Max','Ozone_Exceptional', 
              'Ozone_ExceedCount', 'Ozone_Max']]

In [25]:
# Saving Cleaned data into csv
output_file_path = "../ProcessedData/FinalAirQuality.csv"
finaldf.to_csv(output_file_path, index=False)

end of code