### Scrape Wikipedia Data into Pandas Dataframe with Beautifulsoup

#### Get data From Wikipedia and load into a dictionary

Here we get data from the specified Wikipedia page and use a combination of BeautifulSoup and Regex/String operations to extract the 3 data fields. They are then initially stored in a dictionary. 

In [1]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd

website_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(website_url,'html')
my_table = soup.find('table',{'class':'wikitable sortable'})
links=my_table.findAll('td')
groups={}
#Iterating over the links list in Groups of 3, as each Postal Code, Borough and Neighborhood belong to the same group.
for i in range(0,len(links),3):
    # Extract Postal Code using String slice. All postcodes have this same length and format.
    postcode=str(links[i])[4:-5]
    # Extract Borough using Regex. All boroughs have title field present.
    try:
        borough = re.search(r'title="(.+?)"',str(links[i+1])).group(1)
    except AttributeError:
        borough = '' 
    # Extract those Neighborhoods which have a title attribute/tag present
    try:
        neighborhood = re.search(r'title="(.+?)"',str(links[i+2])).group(1)
    except AttributeError:
        # Extract those Neighborhoods which do not have a title attribute/tag present, by using a regex for the <td> tag
        try:
            neighborhood = re.search(r'<td>(.+?)\n</td>',str(links[i+2])).group(1)
        except AttributeError:
            neighborhood = ''
    if(borough!=''):
        #For multiple neighborhoods associated with the same Postal codeappend the Neighborhood to the one already present.
        if(neighborhood==''or neighborhood=='Not assigned'):
            neighborhood=borough
        if(postcode in groups.keys()):
            groups[postcode][1]=groups[postcode][1]+','+(neighborhood)
        else:
            groups[postcode]=[borough,neighborhood]

#### Convert dictionary into Dataframe with required format

Here the dictionary is converted into a dataframe of the required format.

In [2]:
df=pd.DataFrame.from_dict(groups,orient='index',columns=['Borough','Neighborhood'])
df.reset_index(inplace=True)
df.rename(columns={'index':'PostalCode'},inplace=True)
df=df.astype({'PostalCode': 'string','Borough': 'string','Neighborhood':'string'})
print('Sample:',df.head(10))
print('Dataframe shape:',df.shape)

Sample:   PostalCode                 Borough                     Neighborhood
0        M3A              North York                        Parkwoods
1        M4A              North York                 Victoria Village
2        M5A        Downtown Toronto                      Regent Park
3        M6A              North York  Lawrence Heights,Lawrence Manor
4        M7A        Downtown Toronto           Queen's Park (Toronto)
5        M9A  Queen's Park (Toronto)           Queen's Park (Toronto)
6        M1B    Scarborough, Toronto  Rouge, Toronto,Malvern, Toronto
7        M3B              North York                  Don Mills North
8        M4B               East York   Woodbine Gardens,Parkview Hill
9        M5B        Downtown Toronto          Ryerson,Garden District
Dataframe shape: (100, 3)


### Import coordinate data from the .csv file URL

In [3]:
import csv
import requests

CSV_URL = 'https://cocl.us/Geospatial_data'


with requests.Session() as s:
    download = s.get(CSV_URL)

    decoded_content = download.content.decode('utf-8')

    cr = csv.reader(decoded_content.splitlines(), delimiter=',')
    my_list = list(cr)

coord_df = pd.DataFrame(my_list) 

In [4]:
coord_df=coord_df.rename(columns=coord_df.iloc[0])
coord_df=coord_df.drop(coord_df.index[0])
coord_df=coord_df.astype({'Postal Code': 'string','Latitude': 'float','Longitude':'float'})
df_combined = pd.merge(coord_df, df, left_on='Postal Code', right_on='PostalCode')
df_combined.drop(['Postal Code'], axis=1,inplace=True)

#### Combined Dataframe to have both latitude, longitude along with Postal code details.

In [5]:
df_combined.head()

Unnamed: 0,Latitude,Longitude,PostalCode,Borough,Neighborhood
0,43.806686,-79.194353,M1B,"Scarborough, Toronto","Rouge, Toronto,Malvern, Toronto"
1,43.784535,-79.160497,M1C,"Scarborough, Toronto","Highland Creek (Toronto),Rouge Hill,Port Union..."
2,43.763573,-79.188711,M1E,"Scarborough, Toronto","Guildwood,Morningside, Toronto,West Hill, Toronto"
3,43.770992,-79.216917,M1G,"Scarborough, Toronto","Woburn, Toronto"
4,43.773136,-79.239476,M1H,"Scarborough, Toronto",Cedarbrae


In [6]:
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import folium

In [7]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="tn_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [8]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_combined['Latitude'], df_combined['Longitude'], df_combined['Borough'], df_combined['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [9]:
CLIENT_ID = '3OAZTISX0JOG5GLYGDZ2GVCAJH333YM2P14EJHF33WGZHW0X' # your Foursquare ID
CLIENT_SECRET = 'UEX5CVS4V1GTNNDWB2PPBG4XLGG2IZQOAXK3J4LOROM02JG1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
toronto_venues = getNearbyVenues(names=df_combined['Neighborhood'],
                                   latitudes=df_combined['Latitude'],
                                   longitudes=df_combined['Longitude']
                                  )

print(toronto_venues.shape)
toronto_venues.head()

(2208, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Toronto,Malvern, Toronto",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek (Toronto),Rouge Hill,Port Union...",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood,Morningside, Toronto,West Hill, Toronto",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood,Morningside, Toronto,West Hill, Toronto",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood,Morningside, Toronto,West Hill, Toronto",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


In [12]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
"Agincourt North,L'Amoreaux East,Milliken, Ontario,Steeles East",2,2,2,2,2,2
"Agincourt, Toronto",4,4,4,4,4,4
"Albion Gardens,Beaumond Heights,Humbergate,Mount Olive-Silverstone-Jamestown,Mount Olive-Silverstone-Jamestown,Silverstone, Toronto,South Steeles,Thistletown",9,9,9,9,9,9
"Alderwood, Toronto,Long Branch, Toronto",9,9,9,9,9,9
...,...,...,...,...,...,...
Willowdale West,5,5,5,5,5,5
"Woburn, Toronto",3,3,3,3,3,3
"Woodbine Gardens,Parkview Hill",12,12,12,12,12,12
Woodbine Heights,12,12,12,12,12,12


In [13]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

There are 271 uniques categories.


Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
toronto_onehot.shape

(2208, 271)

In [15]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.shape

(96, 271)

In [16]:
import numpy as np

num_top_venues = 5


for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Adelaide,King,Richmond----
              venue  freq
0       Coffee Shop  0.06
1               Bar  0.04
2              Café  0.04
3   Thai Restaurant  0.04
4  Sushi Restaurant  0.03


----Agincourt North,L'Amoreaux East,Milliken, Ontario,Steeles East----
                        venue  freq
0                  Playground   0.5
1                        Park   0.5
2                 Yoga Studio   0.0
3                 Men's Store   0.0
4  Modern European Restaurant   0.0


----Agincourt, Toronto----
                       venue  freq
0                     Lounge  0.25
1             Breakfast Spot  0.25
2  Latin American Restaurant  0.25
3               Skating Rink  0.25
4              Metro Station  0.00


----Albion Gardens,Beaumond Heights,Humbergate,Mount Olive-Silverstone-Jamestown,Mount Olive-Silverstone-Jamestown,Silverstone, Toronto,South Steeles,Thistletown----
                 venue  freq
0        Grocery Store  0.22
1             Pharmacy  0.11
2           Beer Store  0.11
3

                 venue  freq
0  Empanada Restaurant   1.0
1          Yoga Studio   0.0
2  Monument / Landmark   0.0
3   Mac & Cheese Joint   0.0
4               Market   0.0


----Kingsview Village,Martin Grove Gardens,Richview Gardens,St. Phillips----
               venue  freq
0        Pizza Place  0.25
1  Mobile Phone Shop  0.25
2               Park  0.25
3     Sandwich Place  0.25
4        Men's Store  0.00


----Kingsway Park South West,Mimico,The Queensway,Royal York South West,South of Bloor----
            venue  freq
0     Social Club  0.07
1   Grocery Store  0.07
2    Burger Joint  0.07
3   Burrito Place  0.07
4  Sandwich Place  0.07


----L'Amoreaux West----
                  venue  freq
0  Fast Food Restaurant  0.15
1    Chinese Restaurant  0.15
2           Coffee Shop  0.08
3       Bubble Tea Shop  0.08
4        Sandwich Place  0.08


----Lawrence Heights,Lawrence Manor----
                    venue  freq
0  Furniture / Home Store  0.25
1                Boutique  0.08
2   

In [17]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [18]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,King,Richmond",Coffee Shop,Café,Bar,Thai Restaurant,Restaurant,Sushi Restaurant,Steakhouse,Burger Joint,Bakery,Cosmetics Shop
1,"Agincourt North,L'Amoreaux East,Milliken, Onta...",Park,Playground,Doner Restaurant,Department Store,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop
2,"Agincourt, Toronto",Lounge,Latin American Restaurant,Skating Rink,Breakfast Spot,Women's Store,Doner Restaurant,Dim Sum Restaurant,Diner,Discount Store,Dog Run
3,"Albion Gardens,Beaumond Heights,Humbergate,Mou...",Grocery Store,Fried Chicken Joint,Pharmacy,Pizza Place,Sandwich Place,Beer Store,Fast Food Restaurant,Coffee Shop,General Entertainment,Cuban Restaurant
4,"Alderwood, Toronto,Long Branch, Toronto",Pizza Place,Gym,Skating Rink,Coffee Shop,Pharmacy,Pub,Sandwich Place,Pool,Dim Sum Restaurant,Deli / Bodega


In [19]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 2, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_combined

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Latitude,Longitude,PostalCode,Borough,Neighborhood,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,43.806686,-79.194353,M1B,"Scarborough, Toronto","Rouge, Toronto,Malvern, Toronto",3.0,Fast Food Restaurant,Women's Store,Donut Shop,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Drugstore
1,43.784535,-79.160497,M1C,"Scarborough, Toronto","Highland Creek (Toronto),Rouge Hill,Port Union...",0.0,Bar,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Donut Shop,Falafel Restaurant
2,43.763573,-79.188711,M1E,"Scarborough, Toronto","Guildwood,Morningside, Toronto,West Hill, Toronto",0.0,Rental Car Location,Moving Target,Breakfast Spot,Spa,Medical Center,Intersection,Mexican Restaurant,Electronics Store,Pizza Place,Concert Hall
3,43.770992,-79.216917,M1G,"Scarborough, Toronto","Woburn, Toronto",0.0,Coffee Shop,Korean Restaurant,Women's Store,Doner Restaurant,Dessert Shop,Dim Sum Restaurant,Diner,Discount Store,Dog Run,Drugstore
4,43.773136,-79.239476,M1H,"Scarborough, Toronto",Cedarbrae,0.0,Hakka Restaurant,Bakery,Bank,Athletics & Sports,Thai Restaurant,Caribbean Restaurant,Gas Station,Fried Chicken Joint,Diner,Dessert Shop


In [27]:
toronto_merged.dropna(inplace=True)

In [28]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters