# Notebook for data collection and pre processing


#### First collecting the US cities

- The data is collected from simplemaps (https://simplemaps.com/data/world-cities) which provides data for all the cities in the world.

- The data is then filtered to get only the US cities.

In [1]:
import pandas as pd
import numpy as np

data = "/Users/alexandreribeiro/Downloads/simplemaps_worldcities_basicv1.77/worldcities.csv"

df = pd.read_csv(data)

df

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6897,139.6922,Japan,JP,JPN,Tōkyō,primary,37732000.0,1392685764
1,Jakarta,Jakarta,-6.1750,106.8275,Indonesia,ID,IDN,Jakarta,primary,33756000.0,1360771077
2,Delhi,Delhi,28.6100,77.2300,India,IN,IND,Delhi,admin,32226000.0,1356872604
3,Guangzhou,Guangzhou,23.1300,113.2600,China,CN,CHN,Guangdong,admin,26940000.0,1156237133
4,Mumbai,Mumbai,19.0761,72.8775,India,IN,IND,Mahārāshtra,admin,24973000.0,1356226629
...,...,...,...,...,...,...,...,...,...,...,...
47863,Munha-dong,Munha-dong,39.3813,127.2517,"Korea, North",KP,PRK,Kangwŏn,,,1408979215
47864,Sil-li,Sil-li,39.4880,125.4640,"Korea, North",KP,PRK,P’yŏngnam,,,1408767958
47865,Muan,Muan,34.9897,126.4714,"Korea, South",KR,KOR,Jeonnam,admin,,1410001061
47866,Hongseong,Hongseong,36.6009,126.6650,"Korea, South",KR,KOR,Chungnam,admin,,1410822139


In [2]:
# Focus on entries with on the country United States

df_usa = df[df['country'] == 'United States']

df_usa

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
11,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,18908608.0,1840034016
33,Los Angeles,Los Angeles,34.1141,-118.4068,United States,US,USA,California,,11922389.0,1840020491
55,Chicago,Chicago,41.8375,-87.6866,United States,US,USA,Illinois,,8497759.0,1840000494
88,Miami,Miami,25.7840,-80.2101,United States,US,USA,Florida,,6080145.0,1840015149
90,Houston,Houston,29.7860,-95.3885,United States,US,USA,Texas,,5970127.0,1840020925
...,...,...,...,...,...,...,...,...,...,...,...
46360,Whitehouse,Whitehouse,32.2222,-95.2210,United States,US,USA,Texas,,8512.0,1840022077
46378,Altoona,Altoona,44.8029,-91.4385,United States,US,USA,Wisconsin,,8506.0,1840002295
46379,Old Forge,Old Forge,41.3704,-75.7409,United States,US,USA,Pennsylvania,,8506.0,1840003387
46387,West Earl,West Earl,40.1260,-76.1774,United States,US,USA,Pennsylvania,,8504.0,1840152739


In [3]:
# Checking missing values

df_usa.isnull().sum()

city             0
city_ascii       0
lat              0
lng              0
country          0
iso2             0
iso3             0
admin_name       0
capital       5273
population       0
id               0
dtype: int64

In [4]:
# check dupplicates

df_usa.duplicated().sum()

0

In [5]:
# drop columns

df_usa.drop(columns=['city_ascii', 'iso2', 'iso3', 'capital', 'id'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_usa.drop(columns=['city_ascii', 'iso2', 'iso3', 'capital', 'id'], inplace=True)


In [6]:
df_usa

Unnamed: 0,city,lat,lng,country,admin_name,population
11,New York,40.6943,-73.9249,United States,New York,18908608.0
33,Los Angeles,34.1141,-118.4068,United States,California,11922389.0
55,Chicago,41.8375,-87.6866,United States,Illinois,8497759.0
88,Miami,25.7840,-80.2101,United States,Florida,6080145.0
90,Houston,29.7860,-95.3885,United States,Texas,5970127.0
...,...,...,...,...,...,...
46360,Whitehouse,32.2222,-95.2210,United States,Texas,8512.0
46378,Altoona,44.8029,-91.4385,United States,Wisconsin,8506.0
46379,Old Forge,41.3704,-75.7409,United States,Pennsylvania,8506.0
46387,West Earl,40.1260,-76.1774,United States,Pennsylvania,8504.0


In [7]:
# export to csv

df_usa.to_csv('usa_cities.csv', index=False)

In [8]:
# using just city and lat and lng to make a new dataframe

df_gee = df_usa[['city', 'lat', 'lng']]

In [9]:
df_gee

Unnamed: 0,city,lat,lng
11,New York,40.6943,-73.9249
33,Los Angeles,34.1141,-118.4068
55,Chicago,41.8375,-87.6866
88,Miami,25.7840,-80.2101
90,Houston,29.7860,-95.3885
...,...,...,...
46360,Whitehouse,32.2222,-95.2210
46378,Altoona,44.8029,-91.4385
46379,Old Forge,41.3704,-75.7409
46387,West Earl,40.1260,-76.1774


In [10]:
# export to csv

df_gee.to_csv('usa_cities_gee.csv', index=False)

### Importing datasets obtained from Google Earth Engine

- The datasets were obtained from Google Earth Engine through 5 batches of 1000 cities each.

- These datasets contain the greenspaces area inside the city limits.

In [11]:
# Load the datasets from the uploaded files

batch_0 = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/drive-download-20240815T084459Z-001/GreenSpaceAreas_Batch_0.csv')
batch_1000 = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/drive-download-20240815T084459Z-001/GreenSpaceAreas_Batch_1000.csv')
batch_2000 = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/drive-download-20240815T084459Z-001/GreenSpaceAreas_Batch_2000.csv')
batch_3000 = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/drive-download-20240815T084459Z-001/GreenSpaceAreas_Batch_3000.csv')
batch_4000 = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/drive-download-20240815T084459Z-001/GreenSpaceAreas_Batch_4000.csv')

# Combine the datasets into one dataframe

combined_data_new = pd.concat([batch_0, batch_1000, batch_2000, batch_3000, batch_4000])

# Check for null values in the combined new dataset

null_values_new = combined_data_new.isnull().sum()
print(null_values_new)


system:index         0
city                 0
greenSpaceArea_m2    0
.geo                 0
dtype: int64


#### Merging the Combined Data with the Original Population Data:

In [12]:
# Merge the original dataset with the combined green space data using both 'city' and 'admin_name' to ensure correct matching

merged_data_corrected = df_usa.merge(combined_data_new[['city', 'greenSpaceArea_m2']], on='city', how='inner')

# Remove potential duplicates by keeping the first occurrence

merged_data_corrected = merged_data_corrected.drop_duplicates(subset=['city', 'admin_name'])

# Display basic information about the merged dataset and the first few rows

merged_data_corrected.sample(5)

Unnamed: 0,city,lat,lng,country,admin_name,population,greenSpaceArea_m2
3607,Fresno,29.5357,-95.4696,United States,Texas,23608.0,33908290.0
910,Buckeye,33.4314,-112.6429,United States,Arizona,95042.0,64783950.0
982,Bloomington,44.8306,-93.3151,United States,Minnesota,89244.0,39937800.0
360,Irving,32.8583,-96.9702,United States,Texas,254962.0,45492750.0
4747,El Dorado,33.2184,-92.664,United States,Arkansas,17606.0,318272300.0


In [13]:
#Convert the 'greenSpaceArea_m2' column to km2, with 2 decimal digits

merged_data_corrected['greenSpaceArea_km2'] = (merged_data_corrected['greenSpaceArea_m2'] / 1000000).round(2)



In [14]:
usa_df = merged_data_corrected.copy()

# Checking how many cities have green space area = 0

usa_df[usa_df['greenSpaceArea_km2'] == 0].shape

# Drop rows with green space area = 0

usa_df = usa_df[usa_df['greenSpaceArea_km2'] != 0]

#### Checking for cities that overlap

- Eliminate cities that are less 10 km apart from each other, in order to maintain accuracy of the greenspace area.

In [15]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

# Function to calculate the Haversine distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    
    a = np.sin(delta_phi / 2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    return R * c

# Calculate pairwise distances between all cities
def calculate_distances(df):
    coords = df[['lat', 'lng']].values
    dist_matrix = pdist(coords, metric=lambda u, v: haversine(u[0], u[1], v[0], v[1]))
    dist_matrix_square = squareform(dist_matrix)
    return pd.DataFrame(dist_matrix_square, index=df['city'], columns=df['city'])

# Calculate the distance matrix for the merged dataset
distance_matrix = calculate_distances(usa_df)

# Identify city pairs that are within a certain distance threshold (e.g., 10 km)
threshold_distance = 10  # in kilometers
close_cities = (distance_matrix < threshold_distance) & (distance_matrix > 0)

# Extract the close city pairs manually
close_cities_pairs = close_cities.stack()

# Manually create a DataFrame without reset_index
close_cities_pairs_df = pd.DataFrame({
    'City1': close_cities_pairs.index.get_level_values(0),
    'City2': close_cities_pairs.index.get_level_values(1),
    'IsClose': close_cities_pairs.values
})

# Filter out rows where cities are the same
close_cities_pairs_df = close_cities_pairs_df[close_cities_pairs_df['City1'] != close_cities_pairs_df['City2']]

# Keep only the close city pairs
close_cities_pairs_df = close_cities_pairs_df[close_cities_pairs_df['IsClose']]

# Remove duplicates where the same pair appears twice (e.g., City1-City2 and City2-City1)
close_cities_pairs_df['sorted_pair'] = close_cities_pairs_df.apply(lambda x: tuple(sorted((x['City1'], x['City2']))), axis=1)
close_cities_pairs_df = close_cities_pairs_df.drop_duplicates(subset=['sorted_pair']).drop(columns=['sorted_pair', 'IsClose'])

# Display the result
close_cities_pairs_df.sample(5)


Unnamed: 0,City1,City2
9889880,Manor,Columbia
7796097,Citrus Park,Northdale
11008887,Seminole,Bardmoor
12542778,Massapequa Park,North Lindenhurst
3178514,South Hill,Graham


In [16]:
import folium

# Initialize a map centered on the United States
m = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# Plot each city in the close_cities_pairs_df on the map
for city in pd.concat([close_cities_pairs_df['City1'], close_cities_pairs_df['City2']]).unique():
    city_data = usa_df[usa_df['city'] == city].iloc[0]
    folium.Marker(
        location=[city_data['lat'], city_data['lng']],
        popup=city,
    ).add_to(m)

# Draw lines between close cities
for _, row in close_cities_pairs_df.iterrows():
    city1 = usa_df[usa_df['city'] == row['City1']].iloc[0]
    city2 = usa_df[usa_df['city'] == row['City2']].iloc[0]
    
    folium.PolyLine(
        locations=[[city1['lat'], city1['lng']], [city2['lat'], city2['lng']]],
        color='blue'
    ).add_to(m)

# Save the map to an HTML file or display it
m.save('close_cities_map_filtered.html')

# If running in a notebook, display the map directly
m



In [52]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

# Function to calculate the Haversine distance

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    
    a = np.sin(delta_phi / 2.0)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    
    return R * c

# Calculate pairwise distances between all cities

def calculate_distances(df):
    coords = df[['lat', 'lng']].values
    dist_matrix = pdist(coords, metric=lambda u, v: haversine(u[0], u[1], v[0], v[1]))
    dist_matrix_square = squareform(dist_matrix)
    return pd.DataFrame(dist_matrix_square, index=df['city'], columns=df['city'])

# Calculate the distance matrix for the dataset

distance_matrix = calculate_distances(usa_df)

# Identify cities that have no other city within a 10 km radius

threshold_distance = 10  # in kilometers
isolation_matrix = (distance_matrix > threshold_distance) | (distance_matrix == 0)

# Identify isolated cities

isolated_cities = isolation_matrix.all(axis=1)

# Filter out the isolated cities

isolated_cities_df = usa_df[usa_df['city'].isin(isolated_cities[isolated_cities].index)]

# Display the isolated cities

print(isolated_cities_df.shape)
isolated_cities_df.head()


(1874, 8)


Unnamed: 0,city,lat,lng,country,admin_name,population,greenSpaceArea_m2,greenSpaceArea_km2
3,Miami,25.784,-80.2101,United States,Florida,6080145.0,35572800.0,35.57
5,Houston,29.786,-95.3885,United States,Texas,5970127.0,52571630.0,52.57
6,Dallas,32.7935,-96.7667,United States,Texas,5830932.0,79909370.0,79.91
11,Atlanta,33.7628,-84.422,United States,Georgia,5180179.0,147335200.0,147.34
12,Washington,38.9047,-77.0163,United States,District of Columbia,5116378.0,73151160.0,73.15


In [53]:
isolated_cities_df.describe()

Unnamed: 0,lat,lng,population,greenSpaceArea_m2,greenSpaceArea_km2
count,1874.0,1874.0,1874.0,1874.0,1874.0
mean,37.582064,-92.181681,75880.48,104512500.0,104.51254
std,4.880382,15.502346,348506.0,96496040.0,96.495888
min,19.5828,-159.3564,8513.0,5180.484,0.01
25%,34.03915,-98.074525,11923.0,16189940.0,16.19
50%,38.21305,-88.3628,18176.5,78089150.0,78.09
75%,41.2273,-81.22165,37518.25,180676500.0,180.6775
max,48.9502,-69.6657,6080145.0,365581500.0,365.58


In [54]:
# Drop rows with green space area = 0

drop_rows = isolated_cities_df[isolated_cities_df['greenSpaceArea_km2'] == 0].shape

# Drop colum greenspace area m2

isolated_cities_df.drop(columns=['greenSpaceArea_m2'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isolated_cities_df.drop(columns=['greenSpaceArea_m2'], inplace=True)


In [55]:
# Drop rows where the green space area is 0

isolated_cities_df = isolated_cities_df[isolated_cities_df['greenSpaceArea_km2'] != 0]

isolated_cities_df

Unnamed: 0,city,lat,lng,country,admin_name,population,greenSpaceArea_km2
3,Miami,25.7840,-80.2101,United States,Florida,6080145.0,35.57
5,Houston,29.7860,-95.3885,United States,Texas,5970127.0,52.57
6,Dallas,32.7935,-96.7667,United States,Texas,5830932.0,79.91
11,Atlanta,33.7628,-84.4220,United States,Georgia,5180179.0,147.34
12,Washington,38.9047,-77.0163,United States,District of Columbia,5116378.0,73.15
...,...,...,...,...,...,...,...
8537,Severance,40.5265,-104.8650,United States,Colorado,8526.0,44.14
8542,Sallisaw,35.4606,-94.8072,United States,Oklahoma,8524.0,221.03
8543,Lewistown,40.5964,-77.5730,United States,Pennsylvania,8522.0,2.91
8545,Evergreen,48.2308,-114.2700,United States,Montana,8514.0,0.96


In [21]:
isolated_cities_df.admin_name.nunique()

49

In [22]:
# Group table by admin_name and sum the green space area and ranking by the highest green space area

isolated_cities_df_grouped = isolated_cities_df.groupby('admin_name').agg({'greenSpaceArea_km2': 'sum'}).sort_values(by='greenSpaceArea_km2', ascending=False)

isolated_cities_df_grouped

Unnamed: 0_level_0,greenSpaceArea_km2
admin_name,Unnamed: 1_level_1
North Carolina,14208.38
Georgia,12801.55
Florida,12104.88
California,10577.73
Texas,10536.98
Tennessee,10455.51
Alabama,10268.32
Missouri,9677.69
Kentucky,7911.71
New Jersey,7475.31


In [23]:
# How many cities do we have per admin_name

isolated_cities_df_grouped['city_count'] = isolated_cities_df['admin_name'].value_counts()

isolated_cities_df_grouped

Unnamed: 0_level_0,greenSpaceArea_km2,city_count
admin_name,Unnamed: 1_level_1,Unnamed: 2_level_1
North Carolina,14208.38,67
Georgia,12801.55,70
Florida,12104.88,79
California,10577.73,146
Texas,10536.98,143
Tennessee,10455.51,55
Alabama,10268.32,47
Missouri,9677.69,57
Kentucky,7911.71,41
New Jersey,7475.31,56


## Loading the Public Health Indicators datasets

- The datasets were obtained from CDC (Centers for Disease Control and Prevention) and contain information about the public health indicators for each state in the United States (https://cdi.cdc.gov/?location=ALL&category=TOB&indicators=TOB04).

#### Obesity

- The dataset contains information about the percentage of adults who are obese in each state in the United States.

- https://www.cdc.gov/obesity/php/data-research/adult-obesity-prevalence-maps.html

In [24]:
# importing obesity data

obesity_data = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/obesity-rate-US.csv')

In [25]:
# Changing the column name Prevalence to obesity_rate and state to lower_case

obesity_data.rename(columns={'Prevalence': 'obesity_rate', 'State': 'state'}, inplace=True)

# Drop the column '95% CI'

obesity_data.drop(columns=['95% CI'], inplace=True)

obesity_data.head()

Unnamed: 0,state,obesity_rate
0,Alabama,38.3
1,Alaska,32.1
2,Arizona,33.2
3,Arkansas,37.4
4,California,28.1


#### Smoking

- The datasets contain information about the percentage of adults who are obese.

- https://cdi.cdc.gov/?location=ALL&category=TOB&indicators=TOB04

In [26]:
# importing smoking data

smoking_data = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/smoking-rates-us.csv')

In [27]:
# Maintaing only LocationDesc and DataValueDisplay columns

smoking_data = smoking_data[['LocationDesc', 'DataValueDisplay']]

# Changing column names "LocationDesc" to "state" and "DataValueDisplay" to "smoking_rate"

smoking_data.rename(columns={'LocationDesc': 'state', 'DataValueDisplay': 'smoking_rate'}, inplace=True)

smoking_data.head()

Unnamed: 0,state,smoking_rate
0,United States,13.2
1,Alabama,16.2
2,Alaska,16.2
3,Arizona,13.1
4,Arkansas,19.9


#### Physical Activity

- Adults that follow the CDC's guidelines for physical activity, like at least 150 minutes of moderate-intensity physical activity a week, such as 30 minutes a day, 5 days a week.

- https://cdi.cdc.gov/?location=ALL&category=NPAW&indicators=NPW09

In [70]:
# Importing Exercising data

exercising_data = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/phisical-exercise-us.csv')

In [71]:
# Maintaing only LocationDesc and DataValueDisplay columns

exercising_data = exercising_data[['LocationDesc', 'DataValueDisplay']]

# Changing column names "LocationDesc" to "state" and "DataValueDisplay" to "exercising_rate"

exercising_data.rename(columns={'LocationDesc': 'state', 'DataValueDisplay': 'exercising_rate'}, inplace=True)

exercising_data



Unnamed: 0,state,exercising_rate
0,United States,51.1
1,Alabama,45.9
2,Alaska,57.4
3,Arizona,54.0
4,Arkansas,46.8
5,California,54.8
6,Colorado,59.2
7,Connecticut,53.7
8,Delaware,51.7
9,District of Columbia,56.5


In [68]:
# what are the states that have missing values in the obesity data

exercising_data.exercising_rate.isnull().unique()

array([False,  True])

#### Chronic Diseases

- The datasets contain information about the percentage of adults who have been diagnosed with 2 or more chronic diseases.

- https://cdi.cdc.gov/?location=ALL&category=HEA&indicators=HEA05

In [48]:
# importing Chronic Disease data

chronic_data = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/chronic-diseases-us.csv')

chronic_data.head()

Unnamed: 0,LocationAbbr,LocationDesc,LocationID,TopicID,DataSource,DataSourceUrl,Question,QuestionID,StratificationCategoryId,StratificationCategory,...,HighConfidenceLimit,DataValueDisplay,LowConfidenceLimitDisplay,HighConfidenceLimitDisplay,DataValueUnit,DataValueType,DataValueTypeID,DataValueFootnote,DataValueFootnoteSymbol,DisplayOrder
0,US,United States,59,HEA,BRFSS,https://www.cdc.gov/brfss,2 or more chronic conditions among adults,HEA05,OVERALL,Overall,...,17.3,17.0,16.8,17.3,%,Age-adjusted Prevalence,AGEADJPREV,,,4
1,AL,Alabama,1,HEA,BRFSS,https://www.cdc.gov/brfss,2 or more chronic conditions among adults,HEA05,OVERALL,Overall,...,23.9,22.5,21.1,23.9,%,Age-adjusted Prevalence,AGEADJPREV,,,4
2,AK,Alaska,2,HEA,BRFSS,https://www.cdc.gov/brfss,2 or more chronic conditions among adults,HEA05,OVERALL,Overall,...,16.2,15.1,14.1,16.2,%,Age-adjusted Prevalence,AGEADJPREV,,,4
3,AZ,Arizona,4,HEA,BRFSS,https://www.cdc.gov/brfss,2 or more chronic conditions among adults,HEA05,OVERALL,Overall,...,17.9,16.8,15.8,17.9,%,Age-adjusted Prevalence,AGEADJPREV,,,4
4,AR,Arkansas,5,HEA,BRFSS,https://www.cdc.gov/brfss,2 or more chronic conditions among adults,HEA05,OVERALL,Overall,...,23.7,22.4,21.1,23.7,%,Age-adjusted Prevalence,AGEADJPREV,,,4


In [49]:
# Maintaing only LocationDesc and DataValueDisplay columns

chronic_data = chronic_data[['LocationDesc', 'DataValueDisplay']]

# Changing column names "LocationDesc" to "state" and "DataValueDisplay" to "chronic_rate"

chronic_data.rename(columns={'LocationDesc': 'state', 'DataValueDisplay': 'chronic_rate'}, inplace=True)

chronic_data.head()

Unnamed: 0,state,chronic_rate
0,United States,17.0
1,Alabama,22.5
2,Alaska,15.1
3,Arizona,16.8
4,Arkansas,22.4


#### Life Expectancy

- The datasets contain information about the life expectancy in each state in the United States.

- https://cdi.cdc.gov/?location=ALL&category=HEA&indicators=HEA06

In [32]:
# importing life expectancy data

life_expectancy_data = pd.read_csv('/Users/alexandreribeiro/Desktop/us_cities/life-expectancy-us.csv')

In [33]:
# Maintaing only LocationDesc and DataValueDisplay columns

life_expectancy_data = life_expectancy_data[['LocationDesc', 'DataValueDisplay']]

# Changing column names "LocationDesc" to "state" and "DataValueDisplay" to "life_expectancy"

life_expectancy_data.rename(columns={'LocationDesc': 'state', 'DataValueDisplay': 'life_expectancy'}, inplace=True)

life_expectancy_data.head()

Unnamed: 0,state,life_expectancy
0,United States,77.0
1,Alabama,73.2
2,Alaska,76.6
3,Arizona,76.3
4,Arkansas,73.8


##### Merge the main dataset with the public health indicators datasets

In [57]:
# df_usa lower case columns and change admin_name to state

isolated_cities_df.columns = isolated_cities_df.columns.str.lower()

isolated_cities_df.rename(columns={'admin_name': 'state'}, inplace=True)

isolated_cities_df

Unnamed: 0,city,lat,lng,country,state,population,greenspacearea_km2
3,Miami,25.7840,-80.2101,United States,Florida,6080145.0,35.57
5,Houston,29.7860,-95.3885,United States,Texas,5970127.0,52.57
6,Dallas,32.7935,-96.7667,United States,Texas,5830932.0,79.91
11,Atlanta,33.7628,-84.4220,United States,Georgia,5180179.0,147.34
12,Washington,38.9047,-77.0163,United States,District of Columbia,5116378.0,73.15
...,...,...,...,...,...,...,...
8537,Severance,40.5265,-104.8650,United States,Colorado,8526.0,44.14
8542,Sallisaw,35.4606,-94.8072,United States,Oklahoma,8524.0,221.03
8543,Lewistown,40.5964,-77.5730,United States,Pennsylvania,8522.0,2.91
8545,Evergreen,48.2308,-114.2700,United States,Montana,8514.0,0.96


In [58]:
# Adding the 5 datasets to the isolated_cities_df

isolated_cities_df = isolated_cities_df.merge(obesity_data, on='state', how='left')
isolated_cities_df = isolated_cities_df.merge(smoking_data, on='state', how='left')
isolated_cities_df = isolated_cities_df.merge(exercising_data, on='state', how='left')
isolated_cities_df = isolated_cities_df.merge(chronic_data, on='state', how='left')
isolated_cities_df = isolated_cities_df.merge(life_expectancy_data, on='state', how='left')

isolated_cities_df

Unnamed: 0,city,lat,lng,country,state,population,greenspacearea_km2,obesity_rate,smoking_rate,exercising_rate,chronic_rate,life_expectancy
0,Miami,25.7840,-80.2101,United States,Florida,6080145.0,35.57,31.6,12.0,54.3,16.5,77.5
1,Houston,29.7860,-95.3885,United States,Texas,5970127.0,52.57,35.5,12.0,49.4,16.6,76.5
2,Dallas,32.7935,-96.7667,United States,Texas,5830932.0,79.91,35.5,12.0,49.4,16.6,76.5
3,Atlanta,33.7628,-84.4220,United States,Georgia,5180179.0,147.34,37.0,12.8,49.1,17.4,75.6
4,Washington,38.9047,-77.0163,United States,District of Columbia,5116378.0,73.15,24.3,11.0,56.5,13.7,75.3
...,...,...,...,...,...,...,...,...,...,...,...,...
1869,Severance,40.5265,-104.8650,United States,Colorado,8526.0,44.14,25.0,10.9,59.2,14.8,78.3
1870,Sallisaw,35.4606,-94.8072,United States,Oklahoma,8524.0,221.03,40.0,16.0,38.5,19.9,74.1
1871,Lewistown,40.5964,-77.5730,United States,Pennsylvania,8522.0,2.91,33.4,15.6,50.8,18.5,76.8
1872,Evergreen,48.2308,-114.2700,United States,Montana,8514.0,0.96,30.5,16.0,63.2,16.4,76.8


In [74]:
# replace exercising rate missing values with the mean

isolated_cities_df['exercising_rate'] = isolated_cities_df['exercising_rate'].fillna(isolated_cities_df['exercising_rate'].mean())

# export to csv

isolated_cities_df.to_csv('isolated_cities_df.csv', index=False)