# Purpose of this program 
1. Calculate the distances between Sources and Outcomes
2. Determine to closest Source to Outcome
3. Assign that Source to the closest Outcome.

We will be using Haversine formula to calculate the distance between the Sources and Outcomes.

We will be limiting the search for the closest to those Sources that are in the same City as the Outcomes.

This program will be used on both the Superfund, Cumlivative Risk, and Air Quality data.


In [1]:
import pandas as pd
# Import the Numpy dependency
import numpy as np
# Import the Haversine formula package.  Calculates distances using lat & log
import haversine as hs
from haversine import Unit

Bring in Outcome data (500 City) only keeping locational information for now

In [2]:
# Bringing in 500 city data 
city_file_path = "../ProcessedData/500_City_cleaned.csv"
citydf = pd.read_csv(city_file_path)
citydf.head()

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,Population2010,Insurance,HighBloodPressure,Cancer,Asthma,HeartDisease,AnnualCheckUps,Smokes,MentalHealthIssues,Latitude,Longitude,newFIPS,CountyFIPS
0,AL,Birmingham,107000,1073003200,0107000-01073003200,931.0,26.8,57.0,6.0,14.4,11.1,78.2,29.6,22.0,33.509402,-86.885908,1073,1073
1,AL,Birmingham,107000,1073003300,0107000-01073003300,947.0,21.4,55.2,6.8,12.9,9.7,80.1,22.6,16.7,33.517126,-86.891382,1073,1073
2,AL,Birmingham,107000,1073010500,0107000-01073010500,114.0,23.9,60.5,7.1,13.6,11.1,80.7,26.5,18.6,33.436379,-86.912892,1073,1073
3,AL,Birmingham,107000,1073010701,0107000-01073010701,74.0,19.8,24.9,2.7,11.2,2.4,66.2,18.1,16.1,33.473886,-86.814649,1073,1073
4,AL,Birmingham,107000,1073010801,0107000-01073010801,168.0,7.3,33.0,7.4,8.9,4.5,73.7,8.5,8.8,33.514098,-86.746697,1073,1073


In [3]:
citydf = citydf[['StateAbbr','PlaceName','PlaceFIPS','TractFIPS','Place_TractID',
                 'CountyFIPS','Latitude','Longitude']]

In [4]:
citydf['city_state'] = citydf.PlaceName + "_" +  citydf.StateAbbr

Reading in Source and limiting it to select cities

In [5]:
# Superfund data
input_file_path2 = "../ProcessedData/Superfund.csv"
sfdf = pd.read_csv(input_file_path2)
sfdf.head()

Unnamed: 0,Site_EPA_ID,City,County_FIPS,County,State,StateAbbr,Longitude,Latitude
0,ALN000410750,Birmingham,1073,Jefferson,Alabama,AL,-86.799671,33.557464
1,AL7210020742,Huntsville,1089,Madison,Alabama,AL,-86.673055,34.646944
2,AL0001058056,Montgomery,1101,Montgomery,Alabama,AL,-86.306,32.382
3,ALD007454085,Montgomery,1101,Montgomery,Alabama,AL,-86.3791,32.3761
4,AK6214522157,Anchorage,2020,Anchorage Municipality,Alaska,AK,-149.7,61.258333


In [6]:
sfdf = sfdf[['Site_EPA_ID', 'County_FIPS', 'Longitude', 'Latitude']]

In [7]:
sfdf.describe()

Unnamed: 0,County_FIPS,Longitude,Latitude
count,238.0,238.0,238.0
mean,26248.495798,-97.65236,37.193208
std,17754.618059,16.913497,5.692965
min,1073.0,-149.7,25.8152
25%,12003.0,-117.34725,32.95528
50%,19622.0,-91.183277,37.3915
75%,45842.5,-83.231844,41.239169
max,56021.0,-74.085561,61.258333


Joining data on County_FIPS

In [8]:
outerdf = sfdf.merge(citydf, how = 'outer', left_on="County_FIPS",  right_on = 'CountyFIPS')
outerdf = outerdf.dropna(subset = ['Site_EPA_ID'])
outerdf = outerdf.dropna(subset = ['city_state'])
outerdf.describe()

Unnamed: 0,County_FIPS,Longitude_x,Latitude_x,PlaceFIPS,TractFIPS,CountyFIPS,Latitude_y,Longitude_y
count,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0,2472.0
mean,25271.216828,-103.131363,34.620644,2562018.0,25271490000.0,25271.216828,34.636244,-103.129501
std,19655.830676,15.22838,5.256095,1956955.0,19655800000.0,19655.830676,5.225091,15.198392
min,1073.0,-149.7,25.8152,107000.0,1073003000.0,1073.0,25.767793,-149.89126
25%,6071.0,-118.167434,29.798331,659962.0,6071002000.0,6071.0,29.973337,-118.090898
50%,17031.0,-97.401494,34.098061,1714000.0,17031800000.0,17031.0,33.981344,-97.056833
75%,48201.0,-90.040165,37.4012,4835000.0,48201250000.0,48201.0,37.369669,-90.009454
max,56021.0,-74.085561,61.258333,5613900.0,56021000000.0,56021.0,61.217709,-74.066961


In [9]:
outerdf.head()

Unnamed: 0,Site_EPA_ID,County_FIPS,Longitude_x,Latitude_x,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state
0,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073003000.0,0107000-01073003200,1073.0,33.509402,-86.885908,Birmingham_AL
1,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073003000.0,0107000-01073003300,1073.0,33.517126,-86.891382,Birmingham_AL
2,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073010000.0,0107000-01073010500,1073.0,33.436379,-86.912892,Birmingham_AL
3,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073011000.0,0107000-01073010701,1073.0,33.473886,-86.814649,Birmingham_AL
4,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073011000.0,0107000-01073010801,1073.0,33.514098,-86.746697,Birmingham_AL


In [10]:
outerdf['SFcoor'] = list(zip(outerdf.Latitude_x, outerdf.Longitude_x))
outerdf['Tractcoor'] = list(zip(outerdf.Latitude_y, outerdf.Longitude_y))

In [11]:
# Function from https://github.com/ashutoshb418/Foodies-Visualization/blob/master/Foodies_Chain.ipynb
def distance_from(loc1,loc2): 
    dist=hs.haversine(loc1,loc2)
    return round(dist,5)

In [12]:
# Calculate distances
outerdf['dist']=outerdf.apply(lambda row: distance_from(row.SFcoor,row.Tractcoor), axis = 1)

In [13]:
# Determine the min for each Superfund site
outerdf['mindist'] = outerdf.groupby(outerdf.Site_EPA_ID).dist.transform('min')
outerdf.head()

Unnamed: 0,Site_EPA_ID,County_FIPS,Longitude_x,Latitude_x,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state,SFcoor,Tractcoor,dist,mindist
0,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073003000.0,0107000-01073003200,1073.0,33.509402,-86.885908,Birmingham_AL,"(33.557464, -86.799671)","(33.5094018502, -86.8859081961)",9.6152,6.88186
1,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073003000.0,0107000-01073003300,1073.0,33.517126,-86.891382,Birmingham_AL,"(33.557464, -86.799671)","(33.5171261108, -86.8913819749)",9.61098,6.88186
2,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073010000.0,0107000-01073010500,1073.0,33.436379,-86.912892,Birmingham_AL,"(33.557464, -86.799671)","(33.4363786806, -86.9128923072)",17.0735,6.88186
3,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073011000.0,0107000-01073010701,1073.0,33.473886,-86.814649,Birmingham_AL,"(33.557464, -86.799671)","(33.473886155, -86.8146487762)",9.39661,6.88186
4,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073011000.0,0107000-01073010801,1073.0,33.514098,-86.746697,Birmingham_AL,"(33.557464, -86.799671)","(33.514097853, -86.7466971362)",6.88186,6.88186


In [14]:
# Limit to the closest site
df = outerdf[outerdf.dist == outerdf.mindist]


In [15]:
df['SFCount'] = df.groupby(df.TractFIPS).dist.transform('count')
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Site_EPA_ID,County_FIPS,Longitude_x,Latitude_x,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,Place_TractID,CountyFIPS,Latitude_y,Longitude_y,city_state,SFcoor,Tractcoor,dist,mindist,SFCount
4,ALN000410750,1073.0,-86.799671,33.557464,AL,Birmingham,107000.0,1073011000.0,0107000-01073010801,1073.0,33.514098,-86.746697,Birmingham_AL,"(33.557464, -86.799671)","(33.514097853, -86.7466971362)",6.88186,6.88186,1
29,AL7210020742,1089.0,-86.673055,34.646944,AL,Huntsville,137000.0,1089011000.0,0137000-01089011022,1089.0,34.699003,-86.710815,Huntsville_AL,"(34.646944, -86.673055)","(34.6990029313, -86.7108145743)",6.74037,6.74037,1
31,AL0001058056,1101.0,-86.306,32.382,AL,Montgomery,151000.0,1101005000.0,0151000-01101005301,1101.0,32.406769,-86.246702,Montgomery_AL,"(32.382, -86.306)","(32.4067693075, -86.246702078)",6.21154,6.21154,1
42,ALD007454085,1101.0,-86.3791,32.3761,AL,Montgomery,151000.0,1101006000.0,0151000-01101005901,1101.0,32.293589,-86.353215,Montgomery_AL,"(32.3761, -86.3791)","(32.2935892389, -86.3532149081)",9.49164,9.49164,1
43,AK6214522157,2020.0,-149.7,61.258333,AK,Anchorage,203000.0,2020001000.0,0203000-02020001100,2020.0,61.217709,-149.89126,Anchorage_AK,"(61.258333, -149.7)","(61.2177091891, -149.891260329)",11.18583,11.18583,1


In [16]:
newdf = df[['Place_TractID', 'SFCount']]
newdf.describe()

Unnamed: 0,SFCount
count,235.0
mean,2.276596
std,1.812847
min,1.0
25%,1.0
50%,1.0
75%,3.0
max,8.0


In [17]:
SF = newdf.merge(citydf, on = 'Place_TractID', how = 'left')
SF.head()

Unnamed: 0,Place_TractID,SFCount,StateAbbr,PlaceName,PlaceFIPS,TractFIPS,CountyFIPS,Latitude,Longitude,city_state
0,0107000-01073010801,1,AL,Birmingham,107000,1073010801,1073,33.514098,-86.746697,Birmingham_AL
1,0137000-01089011022,1,AL,Huntsville,137000,1089011022,1089,34.699003,-86.710815,Huntsville_AL
2,0151000-01101005301,1,AL,Montgomery,151000,1101005301,1101,32.406769,-86.246702,Montgomery_AL
3,0151000-01101005901,1,AL,Montgomery,151000,1101005901,1101,32.293589,-86.353215,Montgomery_AL
4,0203000-02020001100,1,AK,Anchorage,203000,2020001100,2020,61.217709,-149.89126,Anchorage_AK


In [21]:
finalSF = SF[['city_state','Place_TractID','SFCount']]
finalSF.head()

Unnamed: 0,city_state,Place_TractID,SFCount
0,Birmingham_AL,0107000-01073010801,1
1,Huntsville_AL,0137000-01089011022,1
2,Montgomery_AL,0151000-01101005301,1
3,Montgomery_AL,0151000-01101005901,1
4,Anchorage_AK,0203000-02020001100,1


In [22]:
finalSF = finalSF.sort_values(["Place_TractID", "SFCount"],
               axis = 0, ascending = True,
               na_position = "first")
finalSF = finalSF.groupby(finalSF.Place_TractID).last()
finalSF = finalSF.reset_index()
finalSF.head()

Unnamed: 0,Place_TractID,city_state,SFCount
0,0107000-01073010801,Birmingham_AL,1
1,0137000-01089011022,Huntsville_AL,1
2,0151000-01101005301,Montgomery_AL,1
3,0151000-01101005901,Montgomery_AL,1
4,0203000-02020001100,Anchorage_AK,1


In [23]:
# Saving Cleaned data into csv
output_file_path = "../ProcessedData/FinalSuperFund.csv"
finalSF.to_csv(output_file_path, index=False)