## IBM Data Science Capstone - Battle of neighbourhoods

First things first: import all the libraries that are used in this notebook.

In [None]:
# library to handle data in a vectorized manner
import numpy as np 

# library to handle data frames
import pandas as pd 
import requests
from bs4 import BeautifulSoup

# library for data analysis
import pandas as pd 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle JSON files
import json 

# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim 

# library to handle requests
import requests

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize 

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# import folium map rendering library
import folium 
print('Libraries imported.')

Storing the Wikipedia link in a variable called URL

In [None]:
# Wikipedia template page that lists the neighbourhoods of Chennai.
URL = 'https://en.wikipedia.org/wiki/Template:Neighbourhoods_of_Chennai'

Checking response

In [None]:
# Fetch the page. The timeout stops a stalled connection from hanging the
# kernel, and raise_for_status() fails here instead of letting an HTTP error
# page be parsed further down.
response = requests.get(URL, timeout=30)
response.raise_for_status()
print(response)

Using beautiful soup to scrape data from wikipedia page

In [None]:
soup = BeautifulSoup(response.text,'lxml')
# Preview only the start of the document; printing the whole page floods the
# notebook output and bloats the saved file.
print(soup.prettify()[:2000])

Finding the table in which all the neighbourhood names are stored and extracting them.

In [None]:
# The neighbourhood list sits in a <div> identified only by its inline style.
td = soup.find('div',{"style":"padding:0em 0.25em"})
if td is None:
    # find() returns None when the page markup has changed; fail with a clear
    # message instead of an AttributeError in the next cell.
    raise ValueError('Could not locate the neighbourhood list container in the page')
print(td)

In [None]:
# First <ul> inside the container (attribute access is shorthand for find()).
ul = td.ul
print(ul)

In [None]:
# Every list item below the <ul>; each one is expected to hold one neighbourhood.
all_li = ul.find_all(name='li')
print(all_li)

In [None]:
# Print the link text of each list item.
for li in all_li:
    anchor = li.a
    print(anchor.string)

Storing the neighbourhoods name list in to a pandas data frame

In [None]:
# Collect the link text of every list item.
# - items without an <a> tag are skipped (li.a would be None);
# - get_text() is used instead of .string, which is None when the link
#   contains nested markup.
Neigh = [li.a.get_text(strip=True) for li in all_li if li.a is not None]
Neigh

In [None]:
# Single-column frame holding the scraped neighbourhood names.
df = pd.DataFrame(Neigh, columns=['Neighbourhood'])
df

Looking for geographical coordinates of chennai

In [None]:
address = 'Chennai'

geolocator = Nominatim(user_agent = 'Capstone_Project')
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match.
    raise ValueError('Could not geocode {!r}'.format(address))

# Reference point used to centre every map in this notebook.
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinate of Chennai are {}, {}.'.format(latitude, longitude))

Gathering all the location latitudes and longitudes from OpenStreetMap API.

In [None]:
def get_coords_local(neighbourhood, output_as='center'):
    """
    Look up a Chennai neighbourhood on Nominatim (OpenStreetMap).

    The query sent is ``"<neighbourhood>, Chennai, Tamilnadu"`` and only the
    first match is used.

    Parameters
    ----------
    neighbourhood : str
        name of the neighbourhood
    output_as : str
        choose from 'boundingbox' or 'center'.
         - 'boundingbox' for the four bounding-box values of the match
         - 'center' for [lat, lon]

    Returns
    -------
    output : list of float
        the requested coordinates, or ``[0, 0]`` when the service returns no
        match (callers filter on latitude == 0).

    Raises
    ------
    ValueError
        if ``output_as`` is not one of the supported values.
    """
    if output_as not in ('center', 'boundingbox'):
        # The original fell through both branches and hit an unbound variable.
        raise ValueError("output_as must be 'center' or 'boundingbox', got {!r}".format(output_as))

    # Let requests build the query string so spaces and commas in the name
    # are percent-encoded correctly.
    params = {
        'q': neighbourhood + ', Chennai, Tamilnadu',
        'format': 'json',
        'polygon': 0,
    }
    # One request per lookup (the original issued the same request twice), sent
    # with an explicit User-Agent matching the geopy client used in this notebook.
    reply = requests.get('https://nominatim.openstreetmap.org/search.php',
                         params=params,
                         headers={'User-Agent': 'Capstone_Project'},
                         timeout=30)
    reply.raise_for_status()
    matches = reply.json()

    if not matches:
        return [0, 0]

    best = matches[0]
    if output_as == 'boundingbox':
        output = [float(i) for i in best['boundingbox']]
    else:
        output = [float(best[key]) for key in ('lat', 'lon')]
    print(output)
    return output

In [None]:
df2 = df.copy()

# One [lat, lon] pair per neighbourhood, in row order. get_coords_local returns
# [0, 0] when nothing is found, so the lists always line up with the rows of
# df2 (the old `response != False` test was always true, and would have
# misaligned the columns had it ever skipped a row).
coords = []
for name in df2['Neighbourhood']:
    print(name)
    coords.append(get_coords_local(neighbourhood=name, output_as='center'))

latitudeCln = [lat for lat, _ in coords]
longitudeCln = [lon for _, lon in coords]

df2['Latitude'] = latitudeCln
df2['Longitude'] = longitudeCln

df2.shape

Creating a dataframe with neighbourhood names and location data. Let's drop the neighbourhoods whose data is not available.

In [None]:
# Keep only rows whose latitude is not the 0 placeholder, then renumber.
has_coordinates = df2['Latitude'].ne(0)
df3 = df2.loc[has_coordinates].reset_index(drop=True)
df3

In [None]:
# Dimensions of df3 as (rows, columns).
df3.shape

Saving the above dataframe into a csv file locally

In [None]:
# Written relative to the working directory; the index is saved as the first
# (unnamed) column because index=False is not passed.
df3.to_csv('Neighbourhoods.csv')

Let's now plot all the neighbourhoods on a map using folium

In [None]:
# Base map centred on the coordinates geocoded for Chennai.
map_chennai = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one marker per neighbourhood
for lat, lng, label in zip(df3['Latitude'], df3['Longitude'], df3['Neighbourhood']):
    # parse_html belongs to the Popup; the stray parse_html=False that used to
    # be passed to CircleMarker is not a marker option and has been removed.
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_chennai)

map_chennai

In [None]:
import os
from getpass import getpass

# Credentials must never live in the notebook: read them from the environment,
# falling back to an interactive prompt. Keys that were previously committed
# here should be treated as compromised and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20190201' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Mask the secret so it is not stored in the saved cell output.
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

Checking the venues of one neighbourhood, 'Adyar', the first one in the list.

In [None]:
# Name stored in the first row of df3.
df3.at[0, 'Neighbourhood']

In [None]:
# Pull name, latitude and longitude of the first row in one pass.
neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude = (
    df3.at[0, column] for column in ('Neighbourhood', 'Latitude', 'Longitude')
)

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude))

In [None]:
LIMIT = 100   # maximum number of venues requested per call
radius = 500  # search radius passed to the API

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
# Show the request without writing the client secret into the saved output.
url.replace(CLIENT_SECRET, '<CLIENT_SECRET>')

Fetching results in json form

In [None]:
# Bound the wait and surface HTTP errors before trying to decode the body.
reply = requests.get(url, timeout=30)
reply.raise_for_status()
results = reply.json()
results

Fetching the category type of venues

In [None]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must expose the category list under ``'categories'`` or, failing
        that, under ``'venue.categories'``.

    Returns
    -------
    str or None
        ``name`` of the first category, or None when the list is empty or
        missing (None).

    Raises
    ------
    KeyError
        if neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; other errors propagate
        # instead of being hidden by a bare except.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
venues = results['response']['groups'][0]['items']

# flatten JSON; pd.json_normalize is the supported entry point, since the
# pandas.io.json.json_normalize alias is deprecated.
nearby_venues = pd.json_normalize(venues)

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# replace each category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

In [None]:
# Row count of the flattened venue table.
venue_total = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_total))

Making a function that fetches the same data for all the neighbourhoods and collects it into one dataframe.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint around each location.

    Uses the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` values.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and coordinates; iterated in parallel.
    radius : int, default 500
        Value of the ``radius`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with columns Neighbourhood, Neighbourhood Latitude,
        Neighbourhood Longitude, Venue, Venue Latitude, Venue Longitude and
        Venue Category. Empty, with the same columns, when nothing is found.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # let requests assemble and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        payload = requests.get('https://api.foursquare.com/v2/venues/explore',
                               params=params, timeout=30).json()

        # Error payloads carry no 'groups'; report and move on rather than
        # aborting the whole run with a KeyError.
        groups = payload.get('response', {}).get('groups') or []
        if not groups:
            print('  no venues returned (meta: {})'.format(payload.get('meta')))
            continue

        for item in groups[0]['items']:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # A venue without a category cannot be used downstream; the
                # original raised IndexError on it.
                continue
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing columns= keeps the schema even when no rows were collected.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

In [None]:
# Venues around every geocoded neighbourhood (default radius).
chennai_venues = getNearbyVenues(df3['Neighbourhood'], df3['Latitude'], df3['Longitude'])

In [None]:
print(chennai_venues.shape)

# Save first: only the last expression of a cell is displayed, so the preview
# was silently discarded while it sat above to_csv().
chennai_venues.to_csv('Chennai_Venues.csv')

chennai_venues.head()

Grouping all the venues by neighbourhood and getting the count

In [None]:
# Non-null count of every column, per neighbourhood.
chennai_venues.groupby('Neighbourhood').count()

In [None]:
# Number of distinct venue categories across all neighbourhoods.
distinct_categories = chennai_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(distinct_categories)))

One-hot encoding

In [None]:
# One indicator column per venue category, named after the category itself.
chennai_onehot = pd.get_dummies(chennai_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood of each venue ...
chennai_onehot['Neighbourhood'] = chennai_venues['Neighbourhood'] 

# ... and rotate the last column to the front.
*leading_columns, last_column = chennai_onehot.columns
chennai_onehot = chennai_onehot[[last_column, *leading_columns]]

chennai_onehot.head()

In [None]:
# (rows, columns) of the one-hot frame.
chennai_onehot.shape

In [None]:
# Mean of each indicator column per neighbourhood, i.e. the share of that
# neighbourhood's venues falling in each category.
chennai_grouped = chennai_onehot.groupby('Neighbourhood', as_index=False).mean()
chennai_grouped.head()

In [None]:
# (rows, columns) of the per-neighbourhood frequency table.
chennai_grouped.shape

Frequency of venue categories in each neighbourhood

In [None]:
num_top_venues = 5

# Index by neighbourhood so each row carries only the category frequencies.
for hood, frequencies in chennai_grouped.set_index('Neighbourhood').iterrows():
    print("----"+hood+"----")
    temp = (
        frequencies.astype(float)
        .round(2)
        .rename_axis('venue')
        .reset_index(name='freq')
    )
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

Top 10 venues in neighbourhood

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, ignoring its first entry.

    The first element of ``row`` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: '1st Most Common Venue', '2nd ...', '3rd ...', then 'Nth ...'.
# The suffix is chosen with an explicit bounds check rather than a bare
# `except`, which would also have hidden unrelated errors.
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe with one row per neighbourhood
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = chennai_grouped['Neighbourhood']

# fill the ranking columns row by row
for ind in range(chennai_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(chennai_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

K-Means clustering with 5 clusters to segment neighbourhood

In [None]:
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The keyword form
# is required because the positional axis argument of drop() was removed in
# pandas 2.
chennai_grouped_clustering = chennai_grouped.drop(columns='Neighbourhood')

# run k-means clustering; n_init is pinned so the result does not depend on
# the scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(chennai_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[:]

In [None]:
# insert() raises if the column already exists, so drop any labels left over
# from a previous run first; this keeps the cell safe to re-execute.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted = neighbourhoods_venues_sorted.drop(columns='Cluster Labels')
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)
neighbourhoods_venues_sorted.head()

In [None]:
# Attach the cluster label and ranked venue columns to the coordinates in df3,
# matching rows on the neighbourhood name.
venue_ranking = neighbourhoods_venues_sorted.set_index('Neighbourhood')
chennai_merged = df3.join(venue_ranking, on='Neighbourhood')

chennai_merged.head() # check the last columns!

Dropping the NaN values from data frame

In [None]:
# Discard rows with any missing value and renumber the index.
chennai_merged = chennai_merged.dropna().reset_index(drop=True)
chennai_merged.head()

In [None]:
# Convert the cluster labels to integers.
chennai_merged = chennai_merged.astype({'Cluster Labels': 'int'})
chennai_merged.head()

Mapping clusters onto map of Chennai

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster label (0 .. kclusters-1).
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(chennai_merged['Latitude'], chennai_merged['Longitude'], chennai_merged['Neighbourhood'], chennai_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index directly by label: the former `cluster-1` only worked for label 0
    # through negative-index wrap-around.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Exploring all the Clusters

In [None]:
# Cluster 0: first column plus every column from position 4 onwards.
chennai_merged[chennai_merged['Cluster Labels'].eq(0)].iloc[:, [0, *range(4, chennai_merged.shape[1])]]

In [None]:
# Cluster 1: first column plus every column from position 4 onwards.
chennai_merged[chennai_merged['Cluster Labels'].eq(1)].iloc[:, [0, *range(4, chennai_merged.shape[1])]]

In [None]:
# Cluster 2: first column plus every column from position 4 onwards.
chennai_merged[chennai_merged['Cluster Labels'].eq(2)].iloc[:, [0, *range(4, chennai_merged.shape[1])]]

In [None]:
# Cluster 3: first column plus every column from position 4 onwards.
chennai_merged[chennai_merged['Cluster Labels'].eq(3)].iloc[:, [0, *range(4, chennai_merged.shape[1])]]

In [None]:
# Cluster 4: first column plus every column from position 4 onwards.
chennai_merged[chennai_merged['Cluster Labels'].eq(4)].iloc[:, [0, *range(4, chennai_merged.shape[1])]]

In [None]:
# All columns of the rows labelled cluster 2, renumbered from zero.
in_cluster_2 = chennai_merged['Cluster Labels'] == 2
Gaming_Venues = chennai_merged.loc[in_cluster_2].reset_index(drop=True)
Gaming_Venues.head()

Potential Neighbourhoods to open gaming cafe

In [None]:
# Name and coordinates only.
location_columns = ['Neighbourhood', 'Latitude', 'Longitude']
Gaming_data = Gaming_Venues[location_columns]
Gaming_data.head()

In [None]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, categoryIds='',LIMIT = 100):
    """Query the Foursquare ``venues/search`` endpoint around each location.

    Uses the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION``.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Neighbourhood name and coordinates; iterated in parallel.
    radius : int, default 5000
        Value of the ``radius`` request parameter.
    categoryIds : str, default ''
        When non-empty, sent as the ``categoryId`` request parameter.
    LIMIT : int, default 100
        Value of the ``limit`` request parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue that has at least one named category, with columns
        Neighbourhood, Neighbourhood Latitude, Neighbourhood Longitude, Venue,
        Venue Latitude, Venue Longitude and Venue Category. Empty, with the
        same columns, when nothing is found.
    """
    columns = ['Neighbourhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    # Parameters shared by every request; requests encodes them into the URL.
    base_params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'radius': radius,
        'limit': LIMIT,
    }
    if categoryIds != '':
        base_params['categoryId'] = categoryIds

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        params = dict(base_params, ll='{},{}'.format(lat, lng))
        payload = requests.get('https://api.foursquare.com/v2/venues/search',
                               params=params, timeout=30).json()

        venues = payload.get('response', {}).get('venues')
        if venues is None:
            # Error payloads have no venue list. Report which lookup failed
            # (without echoing the credential-bearing URL) and keep going.
            print('No venue list returned for {} (meta: {})'.format(name, payload.get('meta')))
            continue

        for v in venues:
            categories = v.get('categories') or []
            category = categories[0].get('name') if categories else None
            if category is None:
                # venues without a named category are skipped
                continue
            rows.append((
                name,
                lat,
                lng,
                v['name'],
                v['location']['lat'],
                v['location']['lng'],
                category))

    # columns= keeps the schema when no rows were collected; the original
    # failed on the column assignment and then on its own error handler.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

Fetching all the school venues 

In [None]:
# Venues within a 2000 radius of each candidate neighbourhood, restricted to
# the Foursquare category ID below.
school_venues = getNearbyVenues(
    names=Gaming_data['Neighbourhood'],
    latitudes=Gaming_data['Latitude'],
    longitudes=Gaming_data['Longitude'],
    radius=2000,
    categoryIds='4bf58dd8d48988d13b941735',
)
school_venues.head()

In [None]:
# (rows, columns) of the school venue table.
school_venues.shape

Mapping all the Schools onto the chennai Map

In [None]:
def addToMap(df, color, existingMap):
    """Draw one circle marker per row of ``df`` on ``existingMap``.

    ``df`` must provide the columns 'Venue Latitude', 'Venue Longitude',
    'Neighbourhood', 'Venue' and 'Venue Category'. Each marker is drawn in
    ``color`` and carries a popup reading '<venue> (<category>) - <neighbourhood>'.
    """
    needed = ['Venue Latitude', 'Venue Longitude', 'Neighbourhood', 'Venue', 'Venue Category']
    for lat, lng, local, venue, venueCat in zip(*(df[column] for column in needed)):
        popup_text = '{} ({}) - {}'.format(venue, venueCat, local)
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(popup_text, parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7)
        marker.add_to(existingMap)

In [None]:
# Fresh map centred on Chennai with the school venues drawn in red.
map_chennai_schools = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(df=school_venues, color='red', existingMap=map_chennai_schools)
map_chennai_schools

Retrieving all the colleges in the required neighbourhoods and mapping them

In [None]:
# Same search as for schools, with a different Foursquare category ID.
college_venues = getNearbyVenues(
    names=Gaming_data['Neighbourhood'],
    latitudes=Gaming_data['Latitude'],
    longitudes=Gaming_data['Longitude'],
    radius=2000,
    categoryIds='4d4b7105d754a06372d81259',
)
college_venues

In [None]:
# Fresh map centred on Chennai with the college venues drawn in green.
map_college_venues = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(df=college_venues, color='green', existingMap=map_college_venues)
map_college_venues

Plotting Existing Gaming Cafes in Chennai

In [None]:
# Same search again, with the Foursquare category ID used here for gaming cafes.
Gaming_cafe_venues = getNearbyVenues(
    names=Gaming_data['Neighbourhood'],
    latitudes=Gaming_data['Latitude'],
    longitudes=Gaming_data['Longitude'],
    radius=2000,
    categoryIds='4bf58dd8d48988d18d941735',
)
Gaming_cafe_venues

In [None]:
# Fresh map centred on Chennai with the existing gaming cafes drawn in green.
map_Gaming_cafe_venues = folium.Map(location=[latitude, longitude], zoom_start=12)
addToMap(df=Gaming_cafe_venues, color='green', existingMap=map_Gaming_cafe_venues)
map_Gaming_cafe_venues

In [None]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighbourhood venue count to ``startDf`` in place.

    Parameters
    ----------
    startDf : pandas.DataFrame
        Must contain a 'Neighbourhood' column; receives the new column.
    columnTitle : str
        Name of the column to create (or overwrite).
    dataDf : pandas.DataFrame
        Venue table with 'Neighbourhood' and 'Venue' columns. A frame missing
        either column (e.g. an empty result) counts as zero venues everywhere.

    Returns
    -------
    None
        ``startDf`` is modified in place.
    """
    if 'Neighbourhood' not in dataDf.columns or 'Venue' not in dataDf.columns:
        startDf[columnTitle] = 0
        return

    # Count once, then look each neighbourhood up; names absent from dataDf
    # map to NaN and are filled with 0. This replaces the per-row loop whose
    # bare `except` turned any error into a zero.
    counts = dataDf.groupby('Neighbourhood')['Venue'].count()
    startDf[columnTitle] = startDf['Neighbourhood'].map(counts).fillna(0).astype(int)

In [None]:
# Work on a copy, then append one count column per venue type.
df_data = Gaming_data.copy()
for _title, _venue_frame in (
    ('Gaming_Cafe', Gaming_cafe_venues),
    ('Schools', school_venues),
    ('Universities', college_venues),
):
    addColumn(df_data, _title, _venue_frame)
df_data

Normalizing some of the Data in data frame (Schools and Universities Count)

In [None]:
import pandas as pd
from sklearn import preprocessing

# Standardise the school and university counts to zero mean and unit standard
# deviation. The columns are selected by name rather than by position
# (previously columns[4:6]) so the cell still targets the right data if the
# frame's column order changes.
for i in ['Schools', 'Universities']:
    df_data[i] = preprocessing.scale(df_data[i].astype('float64'))

Data frame containing neighbourhoods, gaming cafes, schools and universities

In [None]:
# Display the frame after scaling.
df_data

In [None]:
# Weights applied to each attribute when computing the final score.

# Multiplies the gaming-cafe count; negative, so existing cafes lower the score.
weight_Gaming_cafe = -1

# Multiplies the (scaled) school count.
weight_schools = 1

# Multiplies the (scaled) university count; twice the school weight.
weight_universities = 2

In [None]:
# Start the result table from an independent copy of the name column.
df_weighted = df_data[['Neighbourhood']].copy()

In [None]:
# Weighted sum of the three attributes, one value per neighbourhood.
weighted_sum = (
    df_data['Gaming_Cafe'] * weight_Gaming_cafe
    + df_data['Schools'] * weight_schools
    + df_data['Universities'] * weight_universities
)
df_weighted['Score'] = weighted_sum

# Best candidates first.
df_weighted = df_weighted.sort_values(by='Score', ascending=False)
df_weighted

Mylapore is the best place to open a gaming cafe.