In [None]:
import io
import json
import os

import folium
import geopy
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

#### Let's scrape a Wikipedia page (which needs serious donations, please donate) to get the list of towns in Quebec

![Wikipedia logo](https://upload.wikimedia.org/wikipedia/commons/7/77/Wikipedia_svg_logo.svg)

In [None]:
# Parse every HTML table found on the Wikipedia page into a list of DataFrames.
WIKI_TOWNS_URL = "https://en.wikipedia.org/wiki/List_of_towns_in_Quebec"
tables = pd.read_html(WIKI_TOWNS_URL)


In [None]:
# The town list is the second table parsed from the page (position 1).
TOWN_TABLE_POSITION = 1
tables1 = tables[TOWN_TABLE_POSITION]
tables1

Unnamed: 0,Name,Regional county municipality,Region,CMA/CA,Population(2016)[2],Population(2011)[2],Change(%)[2],Area(km²)[2],Populationdensity[2]
0,Acton Vale,Acton,Montérégie,,7656,7664,−0.1,91.10,84.0
1,Alma,Lac-Saint-Jean-Est,Saguenay–Lac-Saint-Jean,Alma,30776,30904,−0.4,196.54,156.6
2,Amos,Abitibi,Abitibi-Témiscamingue,Amos,12823,12671,1.2,430.29,29.8
3,Amqui,La Matapédia,Bas-Saint-Laurent,,6178,6322,−2.3,121.17,51.0
4,Baie-Comeau,Manicouagan,Côte-Nord,Baie-Comeau,21536,22113,−2.6,336.59,64.0
...,...,...,...,...,...,...,...,...,...
220,Waterloo,La Haute-Yamaska,Montérégie,,4410,4330,1.8,12.24,360.2
221,Waterville,Coaticook,Estrie,Sherbrooke,2121,2028,4.6,44.10,48.1
222,Westmount,,Montréal,Montréal,20312,19931,1.9,4.04,5024.9
223,Windsor,Le Val-Saint-François,Estrie,,5419,5330,1.7,14.56,372.2


In [None]:
# Keep only the town name and its administrative region.
wanted_columns = ['Name', 'Region']
df = tables1[wanted_columns]
df

Unnamed: 0,Name,Region
0,Acton Vale,Montérégie
1,Alma,Saguenay–Lac-Saint-Jean
2,Amos,Abitibi-Témiscamingue
3,Amqui,Bas-Saint-Laurent
4,Baie-Comeau,Côte-Nord
...,...,...
220,Waterloo,Montérégie
221,Waterville,Estrie
222,Westmount,Montréal
223,Windsor,Estrie


In [None]:
# True if at least one cell anywhere in df is missing
df.isnull().any(axis=None)

False

In [None]:
# Total number of missing cells across the whole frame
df.isna().to_numpy().sum()

0

In [None]:
# Percentage of missing values in each column
df.isna().sum().div(len(df)).mul(100)

Name      0.0
Region    0.0
dtype: float64

In [None]:
# Names of the columns that contain at least one missing value
df.columns[df.isnull().any()]

Index([], dtype='object')

In [None]:
# Percentage (not count) of missing values per column of df
(df.isnull().sum() / df.shape[0]) * 100

Name      0.0
Region    0.0
dtype: float64

In [None]:
# Rows whose Name carries a footnote marker such as "[QC 1]"
has_footnote = df.Name.str.endswith("]")
print(df[has_footnote])
# Tail of the table, which includes the "Total villes (cities/towns)" summary row
print(df.loc[215:224 , :])
# Remove those rows (rows, not columns). They are selected by content instead
# of by hard-coded index labels, so the cell does not raise KeyError when it is
# re-run and keeps working if the row positions on the page shift.
is_total_row = df.Name.str.startswith("Total")
df = df[~(has_footnote | is_total_row)].copy()

                   Name              Region
81       La Tuque[QC 1]            Mauricie
96   L'Île-Dorval[QC 2]            Montréal
116      Montreal[QC 3]            Montréal
141        Québec[QC 4]  Capitale-Nationale
                            Name                 Region
215                     Varennes             Montérégie
216             Vaudreuil-Dorion             Montérégie
217                Victoriaville       Centre-du-Québec
218                  Ville-Marie  Abitibi-Témiscamingue
219                      Warwick       Centre-du-Québec
220                     Waterloo             Montérégie
221                   Waterville                 Estrie
222                    Westmount               Montréal
223                      Windsor                 Estrie
224  Total villes (cities/towns)                      —


In [None]:
# Renumber the index 0..n-1 after the rows were removed.
# reset_index returns a new frame, so df_town is no longer an alias of df and
# df itself is left untouched.
df_town = df.reset_index(drop=True)
df_town.loc[78:220 , :]

Unnamed: 0,Name,Region
78,La Pocatière,Bas-Saint-Laurent
79,La Prairie,Montérégie
80,La Sarre,Abitibi-Témiscamingue
81,Lac-Delage,Capitale-Nationale
82,Lachute,Laurentides
...,...,...
215,Warwick,Centre-du-Québec
216,Waterloo,Montérégie
217,Waterville,Estrie
218,Westmount,Montréal


#### Let's get the geographical location of the towns


In [None]:
# Row labels of df_town; printing them shows how many towns remain
count = df_town.axes[0]
print(count)

RangeIndex(start=0, stop=220, step=1)


In [None]:
# Geocode every row of df_town (one Nominatim lookup per town).
# A single geocoder client is created once and wrapped in a RateLimiter so
# that consecutive requests are spaced at least one second apart.
locator = Nominatim(user_agent="quebec_explore")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

town_latitude = []
town_longitude = []
for town_name, town_region in zip(df_town["Name"], df_town["Region"]):
    address = town_name + ", " + town_region + ", Quebec"
    location = geocode(address)
    if location is None:
        # Fail with the offending address instead of an opaque AttributeError
        raise ValueError("Nominatim returned no result for address: " + address)
    town_latitude.append(location.latitude)
    town_longitude.append(location.longitude)

print(town_latitude)
print(town_longitude)

[45.6471564, 48.548887, 48.5718519, 48.4656706, 49.217597, 45.4174258, 47.444343, 46.0005197, 45.4289766, 46.210725, 45.3139778, 47.043926, 46.339343, 45.120537, 47.3905166, 45.5643183, 46.0828098, 45.67932, 45.6130352, 45.6713243, 48.0444351, 45.6050197, 45.219192, 45.318116, 45.4554829, 45.676516, 45.3867947, 49.0959673, 46.6701717, 45.437955, 48.1014497, 48.3542375, 45.4473898, 48.3485676, 49.783855, 45.725519649999995, 45.3619066, 46.971117, 49.9137407, 47.694512, 45.1334169, 45.8573781, 45.4124593, 45.4772716, 45.2963957, 45.2076125, 45.785278, 46.20431, 47.547799, 45.3749509, 48.417809, 45.5372581, 45.904822, 48.883231, 45.48423, 46.676532, 45.4453082, 45.8864303, 45.130469, 48.5026134, 45.4828558, 46.0423647, 45.2831, 52.7931065, 48.739437, 46.871928, 48.6580556, 45.4841214, 46.0933721, 45.3990409, 48.5347749, 45.4811545, 45.464465, 45.087339, 46.0239798, 45.857978, 45.4529194, 47.657525, 47.369524, 45.4175472, 48.8000956, 46.9706256, 45.6560478, 45.5905023, 46.9361368, 46.85854

In [None]:
# Attach the geocoded latitudes, aligned row-for-row with df_town
df_town["Latitude"] = pd.Series(town_latitude, index=df_town.index)
df_town

Unnamed: 0,Name,Region,Latitude,Longitude
0,Acton Vale,Montérégie,45.647156,-72.565411
1,Alma,Saguenay–Lac-Saint-Jean,48.548887,-71.651459
2,Amos,Abitibi-Témiscamingue,48.571852,-78.116086
3,Amqui,Bas-Saint-Laurent,48.465671,-67.431517
4,Baie-Comeau,Côte-Nord,49.217597,-68.152313
...,...,...,...,...
215,Warwick,Centre-du-Québec,45.945659,-71.990662
216,Waterloo,Montérégie,45.344157,-72.516217
217,Waterville,Estrie,45.277061,-71.890755
218,Westmount,Montréal,45.485733,-73.596395


In [None]:
# Attach the geocoded longitudes, aligned row-for-row with df_town
df_town["Longitude"] = pd.Series(town_longitude, index=df_town.index)
df_town

Unnamed: 0,Name,Region,Latitude,Longitude
0,Acton Vale,Montérégie,45.647156,-72.565411
1,Alma,Saguenay–Lac-Saint-Jean,48.548887,-71.651459
2,Amos,Abitibi-Témiscamingue,48.571852,-78.116086
3,Amqui,Bas-Saint-Laurent,48.465671,-67.431517
4,Baie-Comeau,Côte-Nord,49.217597,-68.152313
...,...,...,...,...
215,Warwick,Centre-du-Québec,45.945659,-71.990662
216,Waterloo,Montérégie,45.344157,-72.516217
217,Waterville,Estrie,45.277061,-71.890755
218,Westmount,Montréal,45.485733,-73.596395


In [None]:
# Raw array of every df_town row except the first one (position 0 is skipped)
BOW = df_town.iloc[1:].to_numpy()
print(BOW)

[['Alma' 'Saguenay–Lac-Saint-Jean' 48.548887 -71.651459]
 ['Amos' 'Abitibi-Témiscamingue' 48.5718519 -78.1160856]
 ['Amqui' 'Bas-Saint-Laurent' 48.4656706 -67.4315171]
 ['Baie-Comeau' 'Côte-Nord' 49.217597 -68.152313]
 ["Baie-D'Urfé" 'Montréal' 45.4174258 -73.9153643]
 ['Baie-Saint-Paul' 'Capitale-Nationale' 47.444343 -70.505447]
 ['Barkmere' 'Laurentides' 46.0005197 -74.5746325]
 ['Beaconsfield' 'Montréal' 45.4289766 -73.8654387]
 ['Beauceville' 'Chaudière-Appalaches' 46.210725 -70.774269]
 ['Beauharnois' 'Montérégie' 45.3139778 -73.875834]
 ['Beaupré' 'Capitale-Nationale' 47.043926 -70.892029]
 ['Bécancour' 'Centre-du-Québec' 46.339343 -72.433205]
 ['Bedford' 'Montérégie' 45.120537 -72.988701]
 ['Belleterre' 'Abitibi-Témiscamingue' 47.3905166 -78.7042972]
 ['Beloeil' 'Montérégie' 45.5643183 -73.2040066]
 ['Berthierville' 'Lanaudière' 46.0828098 -73.1747099]
 ['Blainville' 'Laurentides' 45.67932 -73.87619]
 ['Boisbriand' 'Laurentides' 45.6130352 -73.8386827]
 ['Bois-des-Filion' 'Laure

#### Now let's plot the towns on a map of Canada

In [None]:
# Centre of the initial map view
Quebec_lat = 48.571852
Quebec_lng = -79.201022

map_Quebec = folium.Map(location=[Quebec_lat, Quebec_lng], zoom_start=6)

# One marker per town. The "town, region" label was previously built but never
# used; it is now attached as the marker popup.
for lat, lng, town, reg in zip(df_town['Latitude'], df_town['Longitude'], df_town['Name'], df_town['Region']):
    label = '{}, {}'.format(town, reg)
    folium.Marker(
        location=[lat, lng],
        popup=folium.Popup(label, parse_html=True),
    ).add_to(map_Quebec)

map_Quebec

In [None]:
# Foursquare credentials are read from the environment so that real keys never
# have to be written into the notebook; the masked placeholder is the fallback.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '****')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '****')
VERSION = '20180325'  # sent as the `v` query parameter
LIMIT = 5000          # sent as the `limit` query parameter
RADIUS = 5000         # sent as the `radius` query parameter

In [None]:
# Explore-endpoint request for the first town in df_town (row label 0)
explore_template = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&limit={}&radius={}'
first_lat = df_town.loc[0, "Latitude"]
first_lng = df_town.loc[0, "Longitude"]
url = explore_template.format(CLIENT_ID, CLIENT_SECRET, first_lat, first_lng, VERSION, LIMIT, RADIUS)

In [None]:
# Send the request and decode the JSON body of the reply
response = requests.get(url)
result = response.json()
result

{'meta': {'code': 429,
  'errorDetail': 'Quota exceeded',
  'errorType': 'quota_exceeded',
  'requestId': '5fecb53176cf540ef12fa326'},
 'response': {}}

In [None]:
# Maximum number of venues requested per town; overrides any earlier LIMIT value
LIMIT=400
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    """Collect Foursquare venues around each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of every location to query; they are walked
        in parallel.
    radius : number, default 2000
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when nothing
        could be retrieved.

    Notes
    -----
    A location whose request fails, or whose reply lacks the expected
    structure (for example a quota error), is reported and skipped.
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        try:
            # timeout keeps one stalled request from hanging the whole run
            results = requests.get(url, timeout=30).json()['response']['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError) as err:
            print('  skipped {}: {!r}'.format(name, err))
            continue

        # This step used to sit after `continue` inside the except branch,
        # where it could never run, so no venue was ever collected.
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            if not categories:
                # The category is the feature used downstream; skip venues without one
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing columns= keeps the schema intact even when venue_rows is empty
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

#### Let's get all the venues in the towns of Quebec

In [None]:
# Query venues around every town, using its geocoded coordinates
Quebec_venues = getNearbyVenues(
    df_town['Name'],
    df_town['Latitude'],
    df_town['Longitude'],
)

In [None]:
# Display the venue table returned by getNearbyVenues
Quebec_venues

In [None]:
# Number of non-null entries per column for each neighborhood
(
    Quebec_venues
    .groupby(by="Neighborhood")
    .count()
)

In [None]:
# Number of non-null entries per column for each venue category
(
    Quebec_venues
    .groupby(by="Venue Category")
    .count()
)

In [None]:
# One-hot encode the venue category.
# Encoding the Series (rather than a one-column DataFrame) gives each dummy
# column the bare category name; the DataFrame form prefixed every column with
# "Venue Category" because only prefix_sep was overridden.
category_dummies = pd.get_dummies(Quebec_venues['Venue Category'])

# Guard in case a venue category is itself called "Neighborhood": rename it so
# it cannot collide with the label column added below.
category_dummies = category_dummies.rename(
    columns={'Neighborhood': 'Neighborhood (venue category)'})

# Put the neighborhood label first, followed by the category indicators
Quebec_onehot = category_dummies.copy()
Quebec_onehot.insert(0, 'Neighborhood', Quebec_venues['Neighborhood'])
Quebec_onehot

In [None]:
# Average the one-hot indicators per neighborhood, i.e. the share of each
# venue category among that neighborhood's venues; 'Neighborhood' stays a column
Quebec_grouped = Quebec_onehot.groupby("Neighborhood", as_index=False).mean()
Quebec_grouped

In [None]:
def top_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are ranked
    from largest to smallest value and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index[:num_top_venues].values

#### Let's find the top 10 venues in every neighborhood

In [None]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# Column headers: "1st Most Common Venue", "2nd ...", "3rd ...", then "4th" onwards.
# The suffix is chosen with an explicit bounds check instead of a bare `except`.
columns = ['Neighborhood']
for point in range(num_top_venues):
    suffix = indicators[point] if point < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(point + 1, suffix))

# One record per neighborhood: its name followed by its top categories.
# Rows are padded with NaN when fewer than num_top_venues categories exist,
# so the record always matches the column count.
records = []
for _, grouped_row in Quebec_grouped.iterrows():
    top = list(top_common_venues(grouped_row, num_top_venues))
    top += [np.nan] * (num_top_venues - len(top))
    records.append([grouped_row['Neighborhood']] + top)

neighborhoods_venues_sorted = pd.DataFrame(records, columns=columns)

neighborhoods_venues_sorted

#### Let's cluster the neighborhoods

In [None]:
kclusters = 3

# Feature matrix: the per-neighborhood category frequencies without the label
# column. The axis is named via columns=, because the positional form
# drop('Neighborhood', 1) is rejected by pandas 2.
Quebec_grouped_clustering = Quebec_grouped.drop(columns='Neighborhood')

# n_init is pinned so the result does not depend on the installed library's default
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=1234).fit(Quebec_grouped_clustering)

In [None]:
# Coordinates of the fitted cluster centres, one row per cluster
kmeans.cluster_centers_

In [None]:
# Region is no longer needed. errors='ignore' lets the cell be re-run after
# the column is already gone instead of raising KeyError.
df_town = df_town.drop(columns='Region', errors='ignore')

In [None]:
# Use the same key column name as the venue tables so the frames can be joined
df_town.rename({"Name": "Neighborhood"}, axis="columns", inplace=True)
df_town

In [None]:
# Clear any 'Cluster Labels' column left over from a previous run.
# errors='ignore' makes this a no-op on a fresh kernel, where the column does
# not exist yet, instead of raising KeyError.
neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore', inplace=True)

#### Let's merge the main dataset with the most common venues

In [None]:
# Put the k-means label of every neighborhood in front of its top-venue columns.
# Any existing 'Cluster Labels' column is removed first, so re-running this
# cell does not fail on a duplicate column.
neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_.astype('int32'))

# Left-join onto the town table; towns with no matching venue rows get NaN.
# join() returns a new frame, so df_town itself is not modified.
Quebec_merged = df_town.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
print(Quebec_merged.shape)

In [None]:
# Renumber the rows 0..n-1 and discard the old index
Quebec_merged.reset_index(drop=True, inplace=True)
Quebec_merged

In [None]:
# Preview only: rows without any missing value (the result is not stored)
Quebec_merged.dropna(axis="index", how="any")

In [None]:
# Keep only the towns that received a cluster label
has_label = Quebec_merged['Cluster Labels'].notna()
Quebec_merged = Quebec_merged[has_label]
Quebec_merged.shape

#### Now let's plot the areas in map with a colour assigned to each cluster

In [None]:
# Centre of the initial map view
latitude = 45.647156
longitude = -72.565411

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=6)

# One hex colour per cluster, evenly spaced along the rainbow colormap
col_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in col_array]

# add markers to the map
for lat, lng, ngh, cluster in zip(Quebec_merged['Latitude'], Quebec_merged['Longitude'], Quebec_merged['Neighborhood'], Quebec_merged['Cluster Labels']):
    # Labels may be floats here (later cells compare them with 0.0/1.0/2.0);
    # cast so the popup reads "Cluster 0" rather than "Cluster 0.0".
    cluster = int(cluster)
    label = folium.Popup(str(ngh) + ' Cluster ' + str(cluster), parse_html=True)
    # Colour mapping kept as before: cluster 0 uses index -1, i.e. the last colour
    marker_colour = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color=marker_colour,
        fill_color=marker_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# Towns assigned to cluster label 0, renumbered from 0
in_first_cluster = Quebec_merged["Cluster Labels"].eq(0)
cluster_1 = Quebec_merged.loc[in_first_cluster].reset_index(drop=True)
cluster_1

In [None]:
# Towns assigned to cluster label 1, renumbered from 0
in_second_cluster = Quebec_merged["Cluster Labels"].eq(1)
cluster_2 = Quebec_merged.loc[in_second_cluster].reset_index(drop=True)
cluster_2

In [None]:
# Towns assigned to cluster label 2, renumbered from 0
in_third_cluster = Quebec_merged["Cluster Labels"].eq(2)
cluster_3 = Quebec_merged.loc[in_third_cluster].reset_index(drop=True)
cluster_3

### Let's analyze the three clusters

In [None]:
# Count cluster_1 rows per distinct "1st Most Common Venue", largest group first
(
    cluster_1
    .groupby(by="1st Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

#### The most frequent "1st Most Common Venue" in the first cluster is "Grocery Store", followed by "Fast Food Restaurant" and "Liquor Store". This is similar to the findings for the 2nd most common venues.

In [None]:
# Count cluster_1 rows per distinct "2nd Most Common Venue", largest group first
(
    cluster_1
    .groupby(by="2nd Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

#### Gym and Fitness Center and shopping malls are also popular in cluster 1

In [None]:
# Count cluster_1 rows per distinct "6th Most Common Venue", largest group first
(
    cluster_1
    .groupby(by="6th Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

#### Fireworks Store and Fish & Chips Shop seem to be among the 10th most frequent venues in Quebec

### Now let's see the most frequent top venues in the second cluster

In [None]:
# Count cluster_2 rows per distinct "1st Most Common Venue", largest group first
(
    cluster_2
    .groupby(by="1st Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

###### Construction & Landscaping and ATMs are also among the available venues in cluster 2

#### Filipino restaurants are also a preferred venue in cluster 2

In [None]:
# Count cluster_2 rows per distinct "10th Most Common Venue", largest group first
(
    cluster_2
    .groupby(by="10th Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

#### Hotel Bar and Zoo are also among the most available venues in cluster 2

### Now let's see the most frequent top venues in the third cluster

In [None]:
# Count cluster_3 rows per distinct "1st Most Common Venue", largest group first
(
    cluster_3
    .groupby(by="1st Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

In [None]:
# Count cluster_3 rows per distinct "10th Most Common Venue", largest group first
(
    cluster_3
    .groupby(by="10th Most Common Venue")
    .count()
    .sort_values("Neighborhood", ascending=False)
)

###### Farms and Bakeries are among the available places in cluster 3. Airport Terminal, Museum, Night Club and Hockey Arena are also some of the most common venues in cluster 3.

### CLUSTER PLOT


## Final Suggestion: Any investment other than a Fast Food Restaurant, Grocery Store or Coffee Shop is a better choice, provided there is a market for it. Bus Stations, Shopping Malls and Beaches are lagging behind in frequency, so investing in them would make better sense: there is a market for them, but their availability is lower.