# The Battle of the Neighborhoods - Week 2

## Step 1 Explore NYC geographical coordinates dataset

NYC has a total of 5 boroughs and 306 neighbourhoods. To segment the blocks and explore them, we will necessarily need a dataset that contains the 5 districts and the communities that exist in each borough as well as the latitude and longitude coordinates of each neighbourhood.

For solving that task we will use data set from sourse [here](https://geo.nyu.edu/catalog/nyu_2451_34572)

In [23]:
#Step 1.1 - import libraries and install modules
import numpy as np
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import requests 
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from bs4 import BeautifulSoup
import csv
print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


In [24]:
#Step 1.2 - download data
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


In [25]:
#Step 1.3 - explore the data
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
neighborhoods_data = newyork_data['features']
neighborhoods_data[0]

#Tranform the data into a pandas dataframe
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 
neighborhoods = pd.DataFrame(columns=column_names)

#Fill the dataframe
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
#Checking resolts
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [26]:
#Step 1.4 - using geopy library to get the latitude and longitude values
neighborhoods.to_csv('BON1_NYC_GEO.csv',index=False)
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Jupyter")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [27]:
#Step 1.5 - create a map of New York with neighborhoods superimposed on top
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

# Part 2 Web scrapping of population and demographics data from Wikipedia
Web scrapping of population and demographics data from Wikipedia pages using by BeautifulSoup

In [160]:
#Step 2.1 - web scrapping of population and demographics data from Wikipedia pages using by BeautifulSoup
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('BON2_POPULATION1.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [161]:
#Step 2.2 - load data from CSV
Pop_data=pd.read_csv('BON2_POPULATION1.csv')
Pop_data

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2017)[12],billions(US$)[13],per capita(US$),square miles,squarekm,persons / sq. mi,persons /sq. km
0,The Bronx\n,\n Bronx\n,"1,471,160\n",28.787\n,"19,570\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,648,771\n",63.303\n,"23,900\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,,,,
2,Manhattan\n,\n New York\n,"1,664,727\n",629.682\n,"378,250\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,,,,
3,Queens\n,\n Queens\n,"2,358,582\n",73.842\n,"31,310\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,,,,
4,Staten Island\n,\n Richmond\n,"479,458\n",11.249\n,"23,460\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,,,,
5,City of New York,8622698,806.863,93574,302.64,783.83,28188,"10,947\n",,,,,,,
6,State of New York,19849399,1547.116,78354,47214,122284,416.4,159\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [162]:
#Step 2.3 - remove whitespaces and rename columns
Pop_data.columns = Pop_data.columns.str.replace(' ', '')
Pop_data.columns = Pop_data.columns.str.replace('\'','')
Pop_data

Unnamed: 0,NewYorkCitysfiveboroughsvte,Jurisdiction,Population,GrossDomesticProduct,Landarea,Density,Borough,County,Estimate(2017)[12],billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx\n,\n Bronx\n,"1,471,160\n",28.787\n,"19,570\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,648,771\n",63.303\n,"23,900\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,,,,
2,Manhattan\n,\n New York\n,"1,664,727\n",629.682\n,"378,250\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,,,,
3,Queens\n,\n Queens\n,"2,358,582\n",73.842\n,"31,310\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,,,,
4,Staten Island\n,\n Richmond\n,"479,458\n",11.249\n,"23,460\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,,,,
5,City of New York,8622698,806.863,93574,302.64,783.83,28188,"10,947\n",,,,,,,
6,State of New York,19849399,1547.116,78354,47214,122284,416.4,159\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [163]:
Pop_data.rename(columns = {'GrossDomesticProduct\n':'GBP_billions',
                           'Borough':'square_km',
                    'NewYorkCitysfiveboroughsvte\n':'Borough',
                   'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2017',
                    'GrossDomesticProduct\n':'GBP_billions',
                    'Landarea\n':'GBP_per_capita',
                     'Density\n':'square_miles',
                    'County':'persons_sq_mi',
                    'Estimate(2017)[12]':'persons_sq_km'}, inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km,billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx\n,\n Bronx\n,"1,471,160\n",28.787\n,"19,570\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,648,771\n",63.303\n,"23,900\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,,,,
2,Manhattan\n,\n New York\n,"1,664,727\n",629.682\n,"378,250\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,,,,
3,Queens\n,\n Queens\n,"2,358,582\n",73.842\n,"31,310\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,,,,
4,Staten Island\n,\n Richmond\n,"479,458\n",11.249\n,"23,460\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,,,,
5,City of New York,8622698,806.863,93574,302.64,783.83,28188,"10,947\n",,,,,,,
6,State of New York,19849399,1547.116,78354,47214,122284,416.4,159\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [164]:
#Step 2.4 - replace newline('\n') from each string from left and right sides
Pop_data['Borough']=Pop_data['Borough'].replace(to_replace='\n', value='', regex=True)
Pop_data['County']=Pop_data['County'].replace(to_replace='\n', value='', regex=True)
Pop_data['Estimate_2017']=Pop_data['Estimate_2017'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_miles']=Pop_data['square_miles'].replace(to_replace='\n', value='', regex=True)
Pop_data['square_km']=Pop_data['square_km'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_mi']=Pop_data['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
Pop_data['persons_sq_km']=Pop_data['persons_sq_km'].replace(to_replace='\n', value='', regex=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km,billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx,Bronx,1471160.0,28.787\n,"19,570\n",42.1,109.04,34653.0,13231.0,,,,,,
1,Brooklyn,Kings,2648771.0,63.303\n,"23,900\n",70.82,183.42,37137.0,14649.0,,,,,,
2,Manhattan,New York,1664727.0,629.682\n,"378,250\n",22.83,59.13,72033.0,27826.0,,,,,,
3,Queens,Queens,2358582.0,73.842\n,"31,310\n",108.53,281.09,21460.0,8354.0,,,,,,
4,Staten Island,Richmond,479458.0,11.249\n,"23,460\n",58.37,151.18,8112.0,3132.0,,,,,,
5,City of New York,8622698,806.863,93574,302.64,783.83,28188.0,10947.0,,,,,,,
6,State of New York,19849399,1547.116,78354,47214,122284.0,416.4,159.0,,,,,,,
7,Sources:[14] and see individual borough articles,,,,,,,,,,,,,,


In [165]:
#Step 2.5 - shift data in the last two rows and remove 'NAN'
Pop_data.loc[5:,['persons_sq_mi','persons_sq_km']] = Pop_data.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
Pop_data.loc[5:,['square_km','persons_sq_mi']] = Pop_data.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
Pop_data.loc[5:,['square_miles','square_km']] = Pop_data.loc[2:,['square_miles','square_km']].shift(1,axis=1)
Pop_data.loc[5:,['Estimate_2017','square_miles']] = Pop_data.loc[2:,['Estimate_2017','square_miles']].shift(1,axis=1)
Pop_data.loc[5:,['County','Estimate_2017']] = Pop_data.loc[2:,['County','Estimate_2017']].shift(1,axis=1)
Pop_data.loc[5:,['Borough','County']] = Pop_data.loc[2:,['Borough','County']].shift(1,axis=1)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km,billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx,Bronx,1471160.0,28.787\n,"19,570\n",42.1,109.04,34653.0,13231.0,,,,,,
1,Brooklyn,Kings,2648771.0,63.303\n,"23,900\n",70.82,183.42,37137.0,14649.0,,,,,,
2,Manhattan,New York,1664727.0,629.682\n,"378,250\n",22.83,59.13,72033.0,27826.0,,,,,,
3,Queens,Queens,2358582.0,73.842\n,"31,310\n",108.53,281.09,21460.0,8354.0,,,,,,
4,Staten Island,Richmond,479458.0,11.249\n,"23,460\n",58.37,151.18,8112.0,3132.0,,,,,,
5,,City of New York,8622698.0,93574,302.64,806.863,783.83,28188.0,10947.0,,,,,,
6,,State of New York,19849399.0,78354,47214,1547.116,122284.0,416.4,159.0,,,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,,,,,


In [166]:
#Remove 'NAN
Pop_data = Pop_data.fillna('')
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km,billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx,Bronx,1471160.0,28.787\n,"19,570\n",42.1,109.04,34653.0,13231.0,,,,,,
1,Brooklyn,Kings,2648771.0,63.303\n,"23,900\n",70.82,183.42,37137.0,14649.0,,,,,,
2,Manhattan,New York,1664727.0,629.682\n,"378,250\n",22.83,59.13,72033.0,27826.0,,,,,,
3,Queens,Queens,2358582.0,73.842\n,"31,310\n",108.53,281.09,21460.0,8354.0,,,,,,
4,Staten Island,Richmond,479458.0,11.249\n,"23,460\n",58.37,151.18,8112.0,3132.0,,,,,,
5,,City of New York,8622698.0,93574,302.64,806.863,783.83,28188.0,10947.0,,,,,,
6,,State of New York,19849399.0,78354,47214,1547.116,122284.0,416.4,159.0,,,,,,
7,,Sources:[14] and see individual borough articles,,,,,,,,,,,,,


In [167]:
#Step 2.6 - drop the last row
i = Pop_data[((Pop_data.County == 'Sources:[14] and see individual borough articles'))].index
Pop_data=Pop_data.drop(i)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km,billions(US$)[13],percapita(US$),squaremiles,squarekm,persons/sq.mi,persons/sq.km
0,The Bronx,Bronx,1471160,28.787\n,"19,570\n",42.1,109.04,34653.0,13231,,,,,,
1,Brooklyn,Kings,2648771,63.303\n,"23,900\n",70.82,183.42,37137.0,14649,,,,,,
2,Manhattan,New York,1664727,629.682\n,"378,250\n",22.83,59.13,72033.0,27826,,,,,,
3,Queens,Queens,2358582,73.842\n,"31,310\n",108.53,281.09,21460.0,8354,,,,,,
4,Staten Island,Richmond,479458,11.249\n,"23,460\n",58.37,151.18,8112.0,3132,,,,,,
5,,City of New York,8622698,93574,302.64,806.863,783.83,28188.0,10947,,,,,,
6,,State of New York,19849399,78354,47214,1547.116,122284.0,416.4,159,,,,,,


In [168]:
Pop_data.drop(Pop_data.columns[[9,10,11,12,13,14]], axis=1,inplace=True)
Pop_data

Unnamed: 0,Borough,County,Estimate_2017,GBP_billions,GBP_per_capita,square_miles,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1471160,28.787\n,"19,570\n",42.1,109.04,34653.0,13231
1,Brooklyn,Kings,2648771,63.303\n,"23,900\n",70.82,183.42,37137.0,14649
2,Manhattan,New York,1664727,629.682\n,"378,250\n",22.83,59.13,72033.0,27826
3,Queens,Queens,2358582,73.842\n,"31,310\n",108.53,281.09,21460.0,8354
4,Staten Island,Richmond,479458,11.249\n,"23,460\n",58.37,151.18,8112.0,3132
5,,City of New York,8622698,93574,302.64,806.863,783.83,28188.0,10947
6,,State of New York,19849399,78354,47214,1547.116,122284.0,416.4,159


In [169]:
#Step 2.7 - save dataframe as csv file
Pop_data.to_csv('BON2_POPULATION.csv',index=False)

# Part 3 Explore NYC and its Boroughs Cuisine dataset

In [172]:
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.style.use('ggplot') 
# install wordcloud
!conda install -c conda-forge wordcloud==1.4.1 --yes
# import package and its set of stopwords
from wordcloud import WordCloud, STOPWORDS
print ('Wordcloud is installed and imported!')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - wordcloud==1.4.1


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    wordcloud-1.4.1            |           py36_0         324 KB  conda-forge

The following NEW packages will be INSTALLED:

    wordcloud: 1.4.1-py36_0 conda-forge


Downloading and Extracting Packages
wordcloud-1.4.1      | 324 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Wordcloud is installed and imported!


In [174]:
# Fetch the file
my_file = project.get_file("BON3_NYC_CUISINE.csv")
# Read the CSV data file from the object storage into a pandas DataFrame
my_file.seek(0)
import pandas as pd
NYC_CUISINE=pd.read_csv(my_file)
NYC_CUISINE.drop(NYC_CUISINE.columns[[3,4,5,6,7]], axis=1,inplace=True) 
NYC_CUISINE.head()

NameError: name 'project' is not defined

In [None]:
print(NYC_CUISINE.Borough.unique())

In [None]:
NYC_CUISINE['Borough'].value_counts().to_frame()

# #1. NYC CUISINE

In [None]:
CUISINE_WC = NYC_CUISINE[['Cuisine']]
CUISINE_WC

In [None]:
CUISINE_WC.to_csv('CUISINE_WC.txt', sep=',', index=False)
CUISINE_WC1 = open('CUISINE_WC.txt', 'r').read()
stopwords = set(STOPWORDS)

In [175]:
# instantiate and geberate word cloud object
NYC_CUISINE_WC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)
NYC_CUISINE_WC.generate(CUISINE_WC1)

NameError: name 'stopwords' is not defined

# #2. BROOKLYN CUISINE

In [None]:
Brooklyn_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Brooklyn'].reset_index(drop=True)
Brooklyn_data.head()

In [None]:
BR_CUISINE_WC = Brooklyn_data[['Cuisine']]
BR_CUISINE_WC

In [None]:
BR_CUISINE_WC.to_csv('BR_CUISINE.txt', sep=',', index=False)

In [None]:
BR_CUISINE_WC = open('BR_CUISINE.txt', 'r').read()

In [None]:
stopwords = set(STOPWORDS)

In [None]:
# instantiate a word cloud object
BR_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
BR_CUISINE_NYC.generate(BR_CUISINE_WC)

In [None]:
# display the word cloud
plt.imshow(BR_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

# #3. QUEENS CUISINE

In [None]:
Queens_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Queens'].reset_index(drop=True)
Q_CUISINE_WC = Queens_data[['Cuisine']]
Q_CUISINE_WC

In [None]:
Q_CUISINE_WC.to_csv('Q_CUISINE.txt', sep=',', index=False)
Q_CUISINE_WC = open('Q_CUISINE.txt', 'r').read()
stopwords = set(STOPWORDS)
# instantiate a word cloud object
Q_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
Q_CUISINE_NYC.generate(Q_CUISINE_WC)

# display the word cloud
plt.imshow(Q_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

# #4. MANHATTAN CUISINE

In [None]:
Manhattan_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'Manhattan'].reset_index(drop=True)
Manhattan_data.head()

In [None]:
MN_CUISINE_WC = Manhattan_data[['Cuisine']]
MN_CUISINE_WC

In [None]:
MN_CUISINE_WC.to_csv('MN_CUISINE.txt', sep=',', index=False)

In [None]:
MN_CUISINE_WC = open('MN_CUISINE.txt', 'r').read()

In [None]:
stopwords = set(STOPWORDS)

In [None]:
# instantiate a word cloud object
MN_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
MN_CUISINE_NYC.generate(MN_CUISINE_WC)

In [None]:
# display the word cloud
plt.imshow(MN_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()

# #4. THE BRONX CUISINE

In [None]:
Bronx_data = NYC_CUISINE[NYC_CUISINE['Borough'] == 'The Bronx'].reset_index(drop=True)
Bronx_data.head()

In [None]:
BX_CUISINE_WC = Bronx_data[['Cuisine']]
BX_CUISINE_WC

In [None]:
BX_CUISINE_WC.to_csv('BX_CUISINE.txt', sep=',', index=False)
BX_CUISINE_WC = open('BX_CUISINE.txt', 'r').read()
stopwords = set(STOPWORDS)

In [None]:
# instantiate a word cloud object
BX_CUISINE_NYC = WordCloud(
    background_color='white',
    max_words=2000,
    stopwords=stopwords
)

# generate the word cloud
BX_CUISINE_NYC.generate(BX_CUISINE_WC)

In [None]:
# display the word cloud
plt.imshow(BX_CUISINE_NYC, interpolation='bilinear')
plt.axis('off')

fig = plt.figure()
fig.set_figwidth(30)
fig.set_figheight(45)

plt.show()