# The Battle of Neighborhoods

In [13]:
from io import StringIO # Wraps HTML text as a file-like object for pd.read_html

import folium # Plotting
import numpy as np # Handles Data in a Vectorized Manner
import pandas as pd # Data Analysis
import requests #handles requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim #address to Latitude and Longitude


# Reaching this line means every import in this cell succeeded.
print('Modules Imported!')

Modules Imported!


In [14]:
# Pages whose first HTML table lists neighborhoods together with their ZIP codes.
# Each URL is fetched once by the matching scraping cell further down.

# New York City (health.ny.gov)
NYC_url = "https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm"

# Los Angeles (laalmanac.com)
LA_url = "http://www.laalmanac.com/communications/cm02_communities.php"

# Chicago (seechicagorealestate.com)
CHI_url = "https://www.seechicagorealestate.com/chicago-zip-codes-by-neighborhood.php"

## Web Scraping and setting up DataFrames

New York City

In [15]:
# Download the New York City page and parse its first <table> into NYC_df.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops the cell on an HTTP error instead of parsing the
# error page as if it were data.
response1 = requests.get(NYC_url, timeout=30)
response1.raise_for_status()
source1 = response1.text
soup1 = BeautifulSoup(source1, 'lxml')
table1 = soup1.find('table')
if table1 is None:
    # Without this guard str(None) would be handed to read_html below.
    raise ValueError('No <table> element found at ' + NYC_url)

# read_html returns a list of frames, one per <table> it finds; only one was
# passed in. StringIO is used because newer pandas deprecates literal HTML text.
readme_html1 = pd.read_html(StringIO(str(table1)))
NYC_df = pd.DataFrame(readme_html1[0])
print(NYC_df.shape, '\n', NYC_df.head())

(42, 3) 
   Borough                Neighborhood                   ZIP Codes
0   Bronx               Central Bronx         10453, 10457, 10460
1   Bronx      Bronx Park and Fordham         10458, 10467, 10468
2   Bronx  High Bridge and Morrisania         10451, 10452, 10456
3   Bronx  Hunts Point and Mott Haven  10454, 10455, 10459, 10474
4   Bronx   Kingsbridge and Riverdale                10463, 10471


Los Angeles 

In [16]:
# Download the Los Angeles page and parse its first <table> into LA_df.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops the cell on an HTTP error instead of parsing the
# error page as if it were data.
response2 = requests.get(LA_url, timeout=30)
response2.raise_for_status()
source2 = response2.text
soup2 = BeautifulSoup(source2, 'lxml')
table2 = soup2.find('table')
if table2 is None:
    # Without this guard str(None) would be handed to read_html below.
    raise ValueError('No <table> element found at ' + LA_url)

# read_html returns a list of frames, one per <table> it finds; only one was
# passed in. StringIO is used because newer pandas deprecates literal HTML text.
readme_html2 = pd.read_html(StringIO(str(table2)))
LA_df = pd.DataFrame(readme_html2[0])
print(LA_df.shape, '\n', LA_df.head())

(643, 2) 
             City/Community   Zip Code(s)
0                    Acton         93510
1             Agoura Hills         91301
2  Agoura Hills (PO Boxes)         91376
3               Agua Dulce         91390
4                 Alhambra  91801, 91803


Chicago

In [17]:
# Download the Chicago page and parse its first <table> into CHI_df.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops the cell on an HTTP error instead of parsing the
# error page as if it were data.
response3 = requests.get(CHI_url, timeout=30)
response3.raise_for_status()
source3 = response3.text
soup3 = BeautifulSoup(source3, 'lxml')
table3 = soup3.find('table')
if table3 is None:
    # Without this guard str(None) would be handed to read_html below.
    raise ValueError('No <table> element found at ' + CHI_url)

# read_html returns a list of frames, one per <table> it finds; only one was
# passed in. StringIO is used because newer pandas deprecates literal HTML text.
# No header row is detected for this table, so the columns come back as 0 and 1
# and the page's own header text ends up in the first data row.
readme_html3 = pd.read_html(StringIO(str(table3)))
CHI_df = pd.DataFrame(readme_html3[0])
print(CHI_df.shape, '\n', CHI_df.head())

(199, 2) 
                     0             1
0            Downtown      Zip Code
1  Cathedral District         60611
2     Central Station         60605
3       Dearborn Park         60605
4          Gold Coast  60610, 60611


Renaming columns so that they are consistent across all three DataFrames

In [18]:
# Give the three city frames the same labels: 'Neighborhood' and 'zip_codes'.
# New York already has a 'Neighborhood' column, so only its ZIP column is renamed.
nyc_renames = {'ZIP Codes': 'zip_codes'}
la_renames = {'City/Community': 'Neighborhood', 'Zip Code(s)': 'zip_codes'}
for frame, renames in ((NYC_df, nyc_renames), (LA_df, la_renames)):
    frame.rename(columns=renames, inplace=True)

# The Chicago frame has the positional labels 0 and 1, so both names are
# assigned at once instead of being mapped from old names.
CHI_df.columns = ['Neighborhood', 'zip_codes']

# Show the resulting labels, one frame per line.
for frame in (NYC_df, LA_df, CHI_df):
    print(frame.columns.tolist())

['Borough', 'Neighborhood', 'zip_codes']
['Neighborhood', 'zip_codes']
['Neighborhood', 'zip_codes']


## Cleaning DataFrames

In [19]:
# Per-column count of missing values for each city frame.
# Both isna() and isnull() are reported for every frame, in that order.
for position, city_frame in enumerate((NYC_df, LA_df, CHI_df)):
    if position > 0:
        print('\n')  # separator between one city's report and the next
    for count_missing in (city_frame.isna, city_frame.isnull):
        print(count_missing().sum())

Borough         0
Neighborhood    0
zip_codes       0
dtype: int64
Borough         0
Neighborhood    0
zip_codes       0
dtype: int64


Neighborhood    0
zip_codes       0
dtype: int64
Neighborhood    0
zip_codes       0
dtype: int64


Neighborhood    7
zip_codes       7
dtype: int64
Neighborhood    7
zip_codes       7
dtype: int64


In [20]:
#Investigating the CHI DataFrame

#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_columns', None)
#pd.set_option('display.width', None)
#pd.set_option('display.max_colwidth', None)
#CHI_df

In [21]:
# Remove, in place, every Chicago row that has a missing value in any column,
# then print the missing-value counts again to confirm none remain.
CHI_df.dropna(axis=0, how='any', inplace=True)
for null_counter in (CHI_df.isna, CHI_df.isnull):
    print(null_counter().sum())

Neighborhood    0
zip_codes       0
dtype: int64
Neighborhood    0
zip_codes       0
dtype: int64


In [22]:
# Expand each comma-separated ZIP list into one row per (Neighborhood, ZIP) pair.
# The scraped strings look like '10453, 10457, 10460': splitting on ',' alone
# leaves a leading space on every ZIP after the first, so each piece is stripped.
new_df1 = (
    NYC_df.set_index('Neighborhood')['zip_codes']
    .str.split(',', expand=True)       # one column per ZIP, padded for short lists
    .stack()                           # long format: (Neighborhood, piece number) -> ZIP
    .dropna()                          # discard the padding if stack() kept it
    .str.strip()
    .reset_index(level=1, drop=True)   # the piece number is not needed
    .reset_index(name='zip_codes')     # Neighborhood back to a column, fresh 0..n-1 index
)
NYC_df2 = new_df1
NYC_df2

Unnamed: 0,Neighborhood,zip_codes
0,Central Bronx,10453
1,Central Bronx,10457
2,Central Bronx,10460
3,Bronx Park and Fordham,10458
4,Bronx Park and Fordham,10467
...,...,...
173,South Shore,10312
174,Stapleton and St. George,10301
175,Stapleton and St. George,10304
176,Stapleton and St. George,10305


In [23]:
# Expand each comma-separated ZIP list into one row per (Neighborhood, ZIP) pair.
# The scraped strings look like '91801, 91803': splitting on ',' alone leaves a
# leading space on every ZIP after the first, so each piece is stripped.
new_df2 = (
    LA_df.set_index('Neighborhood')['zip_codes']
    .str.split(',', expand=True)       # one column per ZIP, padded for short lists
    .stack()                           # long format: (Neighborhood, piece number) -> ZIP
    .dropna()                          # discard the padding if stack() kept it
    .str.strip()
    .reset_index(level=1, drop=True)   # the piece number is not needed
    .reset_index(name='zip_codes')     # Neighborhood back to a column, fresh 0..n-1 index
)
LA_df2 = new_df2
LA_df2

Unnamed: 0,Neighborhood,zip_codes
0,Acton,93510
1,Agoura Hills,91301
2,Agoura Hills (PO Boxes),91376
3,Agua Dulce,91390
4,Alhambra,91801
...,...,...
975,Woodland Hills (Los Angeles),91364
976,Woodland Hills (Los Angeles),91367
977,Woodland Hills (PO Boxes) (Los Angeles),91365
978,Woodland Hills (PO Boxes) (Los Angeles),91372


In [24]:
# Expand each comma-separated ZIP list into one row per (Neighborhood, ZIP) pair.
# The scraped strings look like '60610, 60611': splitting on ',' alone leaves a
# leading space on every ZIP after the first, so each piece is stripped.
new_df3 = (
    CHI_df.set_index('Neighborhood')['zip_codes']
    .str.split(',', expand=True)       # one column per ZIP, padded for short lists
    .stack()                           # long format: (Neighborhood, piece number) -> ZIP
    .dropna()                          # discard the padding if stack() kept it
    .str.strip()
    .reset_index(level=1, drop=True)   # the piece number is not needed
    .reset_index(name='zip_codes')     # Neighborhood back to a column, fresh 0..n-1 index
)

# The first scraped row is the page's own header ('Downtown' / 'Zip Code'), not data.
# It is removed by value rather than by position, so the cell is safe to re-run
# and any further header rows are dropped as well.
# NOTE(review): the page appears to be divided into sections; confirm whether
# each section repeats the 'Zip Code' header.
is_header_row = new_df3['zip_codes'].str.lower() == 'zip code'
new_df3 = new_df3[~is_header_row].reset_index(drop=True)

CHI_df2 = new_df3
CHI_df2

Unnamed: 0,Neighborhood,zip_codes
0,Cathedral District,60611
1,Central Station,60605
2,Dearborn Park,60605
3,Gold Coast,60610
4,Gold Coast,60611
...,...,...
355,Washington Park,60609
356,Washington Park,60615
357,Washington Park,60621
358,Washington Park,60637
