# Using Machine Learning for clustering the 10 Canadian cities by Population Origins and Most Common Types of Food Venues

In [15]:
import pandas as pd
from sklearn import preprocessing
from sklearn.cluster import KMeans

### Here we read the files needed for the K-means clustering.

In [16]:
# Load the per-city food-venue table from disk.
dfrests = pd.read_csv(filepath_or_buffer='CAN_All_Rests.csv')

In [17]:
# Load the per-city population-origin table from disk.
dfpop = pd.read_csv(filepath_or_buffer='CaMLPERCENTAGES.csv')

In [18]:
# Discard the leftover 'Unnamed: 0' column that came in with the CSV.
dfrests = dfrests.drop(columns='Unnamed: 0')

In [19]:
# Index the venue table by city so only the venue columns remain as data.
dfrests = dfrests.set_index('City')

In [20]:
# Index the population table by city so only the origin columns remain as data.
dfpop = dfpop.set_index('City')

### Now we calculate the clusters of the population origins.

In [21]:
# Number of clusters to split the cities into.
kclusters = 4

# Cluster on the origin columns only. If this cell is re-run after the label
# column has been attached further down, drop that column first so earlier
# labels never leak into the features.
pop_features = dfpop.drop(columns='Population Origins Cluster Labels', errors='ignore')

# n_init is pinned to 10 (the long-standing scikit-learn default) so the labels
# do not change on scikit-learn >= 1.4, where the default became 'auto'.
k_means = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(pop_features)

# Show the label assigned to each of the first ten rows.
k_means.labels_[0:10]

array([1, 3, 3, 2, 1, 0, 0, 2, 1, 2])

In [22]:
# Snapshot the population labels now: k_means is overwritten by the next fit.
poplabels = [label for label in k_means.labels_]

### And here we calculate the clusters for the restaurants.

In [23]:
# Number of clusters to split the cities into.
kclusters = 4

# Cluster on the venue columns only. If this cell is re-run after the label
# column has been attached in the next cell, drop that column first; otherwise
# the labels themselves would silently become a feature.
rest_features = dfrests.drop(columns='Food Venues Cluster Labels', errors='ignore')

# n_init is pinned to 10 (the long-standing scikit-learn default) so the labels
# do not change on scikit-learn >= 1.4, where the default became 'auto'.
k_means = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(rest_features)

# Show the label assigned to each of the first ten rows.
k_means.labels_[0:10]

array([3, 3, 1, 3, 0, 0, 2, 1, 1, 3])

### Putting the clusters on the tables

In [24]:
# Keep the restaurant labels as a plain list and attach them as a new column.
restslabels = [label for label in k_means.labels_]
dfrests = dfrests.assign(**{'Food Venues Cluster Labels': restslabels})

dfrests

Unnamed: 0_level_0,Afghan Restaurant,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Asian Restaurant,BBQ Joint,Bagel Shop,Belgian Restaurant,Bistro,...,Swiss Restaurant,Szechuan Restaurant,Taco Place,Tapas Restaurant,Thai Restaurant,Turkish Restaurant,Ukrainian Restaurant,Vietnamese Restaurant,Wings Joint,Food Venues Cluster Labels
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Calgary,0.0,0.003546,0.042553,0.0,0.003546,0.024823,0.010638,0.003546,0.0,0.0,...,0.003546,0.0,0.003546,0.003546,0.010638,0.0,0.0,0.060284,0.0,3
Edmonton,0.0,0.0,0.02,0.0,0.0,0.032,0.024,0.0,0.0,0.004,...,0.0,0.0,0.0,0.008,0.036,0.004,0.004,0.044,0.0,3
Halifax,0.0,0.0,0.022857,0.0,0.0,0.0,0.005714,0.005714,0.0,0.005714,...,0.0,0.0,0.005714,0.0,0.011429,0.011429,0.0,0.017143,0.005714,1
London,0.0,0.006211,0.037267,0.0,0.0,0.031056,0.018634,0.0,0.0,0.006211,...,0.0,0.0,0.0,0.0,0.031056,0.0,0.0,0.024845,0.0,3
Montreal,0.0,0.002242,0.004484,0.0,0.002242,0.015695,0.013453,0.011211,0.0,0.004484,...,0.0,0.004484,0.002242,0.004484,0.011211,0.0,0.0,0.026906,0.002242,0
Ottawa,0.0,0.0,0.015957,0.0,0.0,0.015957,0.021277,0.010638,0.005319,0.005319,...,0.0,0.0,0.005319,0.037234,0.010638,0.0,0.0,0.069149,0.0,0
Quebec City,0.004464,0.0,0.004464,0.0,0.0,0.022321,0.013393,0.004464,0.0,0.026786,...,0.004464,0.0,0.0,0.0,0.022321,0.0,0.0,0.008929,0.0,2
Toronto,0.003731,0.0,0.026119,0.003731,0.0,0.018657,0.007463,0.003731,0.0,0.011194,...,0.0,0.0,0.011194,0.011194,0.022388,0.0,0.0,0.007463,0.0,1
Vancouver,0.0,0.0,0.006079,0.0,0.0,0.012158,0.009119,0.009119,0.006079,0.00304,...,0.0,0.0,0.018237,0.009119,0.012158,0.0,0.0,0.036474,0.0,1
Winnipeg,0.0,0.0,0.042636,0.0,0.003876,0.062016,0.007752,0.0,0.0,0.007752,...,0.0,0.0,0.0,0.007752,0.015504,0.0,0.0,0.031008,0.0,3


In [25]:
# Move 'City' back from the index into an ordinary column.
dfrests = dfrests.reset_index()

In [26]:
# Index the venue table by its cluster label.
dfrests = dfrests.set_index('Food Venues Cluster Labels')

In [27]:
# Order the rows by cluster label so cities in the same cluster sit together.
dfrests = dfrests.sort_index()

In [28]:
# Attach the population-origin labels as a new column and show the table.
dfpop = dfpop.assign(**{'Population Origins Cluster Labels': poplabels})

dfpop

Unnamed: 0_level_0,English,Canadian,Scottish,German,Irish,French,Chinese,Ukrainian,East Indian,Filipino,...,Malinké,Burkinabe,Corsican,Djiboutian,Maure,Guadeloupean,Martinican,Montserratan,Hmong,Population Origins Cluster Labels
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Calgary,0.217412,0.200742,0.175154,0.146692,0.143447,0.085865,0.076107,0.06601,0.065926,0.054574,...,1.5e-05,1.5e-05,7e-06,7e-06,7e-06,0.0,0.0,0.0,0.0,1
Toronto,0.124949,0.127406,0.092747,0.046361,0.092852,0.042265,0.119516,0.024618,0.109737,0.046849,...,3.8e-05,1.7e-05,1e-05,3.6e-05,4e-05,8e-06,2.6e-05,7.4e-05,9e-06,3
Vancouver,0.193858,0.13651,0.140578,0.091512,0.113491,0.060883,0.205741,0.038908,0.100213,0.055199,...,0.0,8e-06,3.1e-05,0.0,4.3e-05,4e-06,1e-05,8e-06,4.7e-05,3
Halifax,0.306302,0.384365,0.285378,0.113435,0.239092,0.167027,0.019126,0.012964,0.01265,0.00674,...,2.5e-05,2.5e-05,0.0,0.0,0.0,0.0,2.5e-05,0.0,0.0,2
Edmonton,0.199109,0.205387,0.160686,0.164278,0.143458,0.110192,0.055466,0.123289,0.05569,0.049542,...,5.4e-05,5.4e-05,6.9e-05,0.000119,2.7e-05,0.0,0.0,1.5e-05,0.0,1
Montreal,0.034494,0.437838,0.030958,0.021454,0.059719,0.21703,0.027129,0.008742,0.012092,0.008898,...,0.000133,0.00044,0.000228,6.1e-05,0.000103,0.000197,0.000364,6.1e-05,7e-06,0
Quebec City,0.014453,0.712657,0.019484,0.013516,0.050521,0.289789,0.004704,0.001425,0.000911,0.000404,...,3.9e-05,0.000321,0.000122,3.2e-05,5.1e-05,0.000154,0.000148,0.0,0.0,0
Ottawa,0.210578,0.299698,0.184934,0.085807,0.216557,0.208148,0.047835,0.026727,0.028669,0.01371,...,7.7e-05,0.000293,5.7e-05,0.00075,4.1e-05,1e-05,3.6e-05,4.6e-05,0.0,2
Winnipeg,0.193772,0.171206,0.166426,0.162789,0.127951,0.126198,0.035212,0.152651,0.040438,0.101511,...,3.9e-05,2.6e-05,1.3e-05,5.3e-05,9.2e-05,0.0,0.0,1.3e-05,0.0,1
London,0.30345,0.273851,0.219916,0.121304,0.201293,0.096279,0.025642,0.025097,0.017811,0.008787,...,3.1e-05,0.0,2.1e-05,0.000103,0.0,2.1e-05,0.0,0.0,2.1e-05,2


In [29]:
# Move 'City' back from the index into an ordinary column.
dfpop = dfpop.reset_index()

In [30]:
# Index the population table by its cluster label.
dfpop = dfpop.set_index('Population Origins Cluster Labels')

In [31]:
# Order the rows by cluster label so cities in the same cluster sit together.
dfpop = dfpop.sort_index()

In [32]:
# Turn the sorted cluster-label index back into a regular column.
dfpop = dfpop.reset_index()

In [33]:
# Turn the sorted cluster-label index back into a regular column.
dfrests = dfrests.reset_index()

In [34]:
# Display the population table, now ordered by cluster label.
dfpop

Unnamed: 0,Population Origins Cluster Labels,City,English,Canadian,Scottish,German,Irish,French,Chinese,Ukrainian,...,Breton,Malinké,Burkinabe,Corsican,Djiboutian,Maure,Guadeloupean,Martinican,Montserratan,Hmong
0,0,Montreal,0.034494,0.437838,0.030958,0.021454,0.059719,0.21703,0.027129,0.008742,...,0.001274,0.000133,0.00044,0.000228,6.1e-05,0.000103,0.000197,0.000364,6.1e-05,7e-06
1,0,Quebec City,0.014453,0.712657,0.019484,0.013516,0.050521,0.289789,0.004704,0.001425,...,0.00172,3.9e-05,0.000321,0.000122,3.2e-05,5.1e-05,0.000154,0.000148,0.0,0.0
2,1,Calgary,0.217412,0.200742,0.175154,0.146692,0.143447,0.085865,0.076107,0.06601,...,1.8e-05,1.5e-05,1.5e-05,7e-06,7e-06,7e-06,0.0,0.0,0.0,0.0
3,1,Edmonton,0.199109,0.205387,0.160686,0.164278,0.143458,0.110192,0.055466,0.123289,...,2.7e-05,5.4e-05,5.4e-05,6.9e-05,0.000119,2.7e-05,0.0,0.0,1.5e-05,0.0
4,1,Winnipeg,0.193772,0.171206,0.166426,0.162789,0.127951,0.126198,0.035212,0.152651,...,9.2e-05,3.9e-05,2.6e-05,1.3e-05,5.3e-05,9.2e-05,0.0,0.0,1.3e-05,0.0
5,2,Halifax,0.306302,0.384365,0.285378,0.113435,0.239092,0.167027,0.019126,0.012964,...,0.0,2.5e-05,2.5e-05,0.0,0.0,0.0,0.0,2.5e-05,0.0,0.0
6,2,Ottawa,0.210578,0.299698,0.184934,0.085807,0.216557,0.208148,0.047835,0.026727,...,0.000139,7.7e-05,0.000293,5.7e-05,0.00075,4.1e-05,1e-05,3.6e-05,4.6e-05,0.0
7,2,London,0.30345,0.273851,0.219916,0.121304,0.201293,0.096279,0.025642,0.025097,...,0.0,3.1e-05,0.0,2.1e-05,0.000103,0.0,2.1e-05,0.0,0.0,2.1e-05
8,3,Toronto,0.124949,0.127406,0.092747,0.046361,0.092852,0.042265,0.119516,0.024618,...,1e-05,3.8e-05,1.7e-05,1e-05,3.6e-05,4e-05,8e-06,2.6e-05,7.4e-05,9e-06
9,3,Vancouver,0.193858,0.13651,0.140578,0.091512,0.113491,0.060883,0.205741,0.038908,...,1.9e-05,0.0,8e-06,3.1e-05,0.0,4.3e-05,4e-06,1e-05,8e-06,4.7e-05


In [35]:
# List the column names of the population table.
dfpop.columns

Index(['Population Origins Cluster Labels', 'City', '      English',
       '    Canadian', '      Scottish', '      German', '      Irish',
       '      French', '      Chinese', '      Ukrainian',
       ...
       '      Breton', '      Malinké', '      Burkinabe', '      Corsican',
       '      Djiboutian', '      Maure', '    Guadeloupean', '    Martinican',
       '    Montserratan', '      Hmong'],
      dtype='object', length=226)

In [36]:
# Keep only the cluster label and the city name from each table.
df1 = dfpop.loc[:, ['Population Origins Cluster Labels', 'City']]
df2 = dfrests.loc[:, ['Food Venues Cluster Labels', 'City']]

### Here we make a table with only the clusters

In [37]:
# Outer-join the two label tables on the city name and index the result by city.
df_clusters = df1.merge(df2, how='outer', on='City').set_index('City')

In [38]:
# Display both cluster labels side by side for every city.
df_clusters

Unnamed: 0_level_0,Population Origins Cluster Labels,Food Venues Cluster Labels
City,Unnamed: 1_level_1,Unnamed: 2_level_1
Montreal,0,0
Quebec City,0,2
Calgary,1,3
Edmonton,1,3
Winnipeg,1,3
Halifax,2,1
Ottawa,2,0
London,2,3
Toronto,3,1
Vancouver,3,1


In [39]:
# Persist the combined cluster table (the City index is written as the first column).
df_clusters.to_csv(path_or_buf='Df_can_clusters.csv')

### After saving it, we make tables with the clusters and the most frequent venues and origins.

In [40]:
# Load the table of the most common food venues per city.
most_c_venues = pd.read_csv(filepath_or_buffer='ML10CanadaMostFoodVenues.csv')

In [41]:
# Attach each city's food-venue cluster by matching on the city name rather
# than by row position: this frame is loaded from its own CSV, so its row
# order is not guaranteed to match the order the labels were produced in.
venue_label_by_city = df_clusters['Food Venues Cluster Labels']
most_c_venues['Food Venues Cluster Labels'] = most_c_venues['City'].map(venue_label_by_city)

# Fail loudly if a city is missing from the cluster table instead of
# silently carrying NaN labels forward.
assert most_c_venues['Food Venues Cluster Labels'].notna().all(), 'city without a food-venue cluster label'

In [None]:
# Index the venues table by its cluster label.
most_c_venues = most_c_venues.set_index(keys='Food Venues Cluster Labels')

In [973]:
# Order the rows by cluster label (ascending) so each cluster reads as a group.
most_c_venues = most_c_venues.sort_index(ascending=True)

In [974]:
# Display the most common venues per city, grouped by food-venue cluster.
most_c_venues

Unnamed: 0_level_0,City,1st Most Common Food Venue,2nd Most Common Food Venue,3rd Most Common Food Venue,4th Most Common Food Venue,5th Most Common Food Venue,6th Most Common Food Venue,7th Most Common Food Venue,8th Most Common Food Venue,9th Most Common Food Venue,10th Most Common Food Venue
Food Venues Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Montreal,French Restaurant,Pizza Place,Japanese Restaurant,Vietnamese Restaurant,Indian Restaurant,Middle Eastern Restaurant,Fast Food Restaurant,Italian Restaurant,Sushi Restaurant,Chinese Restaurant
0,Ottawa,Vietnamese Restaurant,Pizza Place,Indian Restaurant,Tapas Restaurant,Italian Restaurant,Sushi Restaurant,New American Restaurant,Middle Eastern Restaurant,Mexican Restaurant,Chinese Restaurant
1,Halifax,Pizza Place,Seafood Restaurant,Italian Restaurant,Sushi Restaurant,Chinese Restaurant,Japanese Restaurant,Burger Joint,Steakhouse,Gastropub,Diner
1,Toronto,Italian Restaurant,Japanese Restaurant,Gastropub,Sushi Restaurant,French Restaurant,Pizza Place,American Restaurant,Mexican Restaurant,Steakhouse,Diner
1,Vancouver,Japanese Restaurant,Sushi Restaurant,Pizza Place,Seafood Restaurant,Chinese Restaurant,Vietnamese Restaurant,Indian Restaurant,Italian Restaurant,French Restaurant,Gastropub
2,Quebec City,French Restaurant,Pizza Place,Italian Restaurant,Gastropub,Fast Food Restaurant,Bistro,Asian Restaurant,Thai Restaurant,Sushi Restaurant,Steakhouse
3,Calgary,Vietnamese Restaurant,Pizza Place,Italian Restaurant,American Restaurant,Sushi Restaurant,Chinese Restaurant,Mexican Restaurant,Steakhouse,Diner,Burger Joint
3,Edmonton,Chinese Restaurant,Italian Restaurant,Vietnamese Restaurant,Pizza Place,Mexican Restaurant,Thai Restaurant,Gastropub,Asian Restaurant,BBQ Joint,Korean Restaurant
3,London,Pizza Place,Sushi Restaurant,Middle Eastern Restaurant,Italian Restaurant,Mexican Restaurant,American Restaurant,Indian Restaurant,Chinese Restaurant,Thai Restaurant,Asian Restaurant
3,Winnipeg,Sushi Restaurant,Asian Restaurant,Pizza Place,Burger Joint,Chinese Restaurant,American Restaurant,Italian Restaurant,Vietnamese Restaurant,French Restaurant,Greek Restaurant


In [975]:
# Persist the clustered venues table (the cluster label index is written as the first column).
most_c_venues.to_csv(path_or_buf='VenuesClustered.csv')

In [976]:
# Load the table of the most common origins per city and show it.
origins = pd.read_csv(filepath_or_buffer='10MostOrigins.csv')
origins

Unnamed: 0,City,1st Most Common Origin,2nd Most Common Origin,3rd Most Common Origin,4th Most Common Origin,5th Most Common Origin,6th Most Common Origin,7th Most Common Origin,8th Most Common Origin,9th Most Common Origin,10th Most Common Origin
0,Calgary,English,Canadian,Scottish,German,Irish,French,Chinese,Ukrainian,East Indian,Filipino
1,Toronto,Canadian,English,Chinese,East Indian,Irish,Scottish,Italian,Filipino,German,French
2,Vancouver,Chinese,English,Scottish,Canadian,Irish,East Indian,German,French,Filipino,Ukrainian
3,Halifax,Canadian,English,Scottish,Irish,French,German,Dutch,Acadian,Welsh,Italian
4,Edmonton,Canadian,English,German,Scottish,Irish,Ukrainian,French,Polish,East Indian,Chinese
5,Montreal,Canadian,French,Italian,Irish,English,Haitian,Scottish,Chinese,Québécois,German
6,Quebec City,Canadian,French,Irish,Québécois,Scottish,English,German,Italian,Spanish,Belgian
7,Ottawa,Canadian,Irish,English,French,Scottish,German,Italian,Chinese,Polish,Dutch
8,Winnipeg,English,Canadian,Scottish,German,Ukrainian,Irish,French,Filipino,Polish,East Indian
9,London,English,Canadian,Scottish,Irish,German,French,Dutch,Italian,Polish,Portuguese


In [977]:
# Attach each city's population-origin cluster by matching on the city name
# rather than by row position: this frame is loaded from its own CSV, so its
# row order is not guaranteed to match the order the labels were produced in.
# The singular 'Origin' spelling of the new column is kept because the
# following cell indexes on that exact name.
origin_label_by_city = df_clusters['Population Origins Cluster Labels']
origins['Population Origin Cluster Labels'] = origins['City'].map(origin_label_by_city)

# Fail loudly if a city is missing from the cluster table instead of
# silently carrying NaN labels forward.
assert origins['Population Origin Cluster Labels'].notna().all(), 'city without a population-origin cluster label'

In [978]:
# Index the origins table by its cluster label.
origins = origins.set_index(keys='Population Origin Cluster Labels')

In [979]:
# Order the rows by cluster label (ascending) so each cluster reads as a group.
origins = origins.sort_index(ascending=True)

In [980]:
# Display the most common origins per city, grouped by population-origin cluster.
origins

Unnamed: 0_level_0,City,1st Most Common Origin,2nd Most Common Origin,3rd Most Common Origin,4th Most Common Origin,5th Most Common Origin,6th Most Common Origin,7th Most Common Origin,8th Most Common Origin,9th Most Common Origin,10th Most Common Origin
Population Origin Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Montreal,Canadian,French,Italian,Irish,English,Haitian,Scottish,Chinese,Québécois,German
0,Quebec City,Canadian,French,Irish,Québécois,Scottish,English,German,Italian,Spanish,Belgian
1,Calgary,English,Canadian,Scottish,German,Irish,French,Chinese,Ukrainian,East Indian,Filipino
1,Edmonton,Canadian,English,German,Scottish,Irish,Ukrainian,French,Polish,East Indian,Chinese
1,Winnipeg,English,Canadian,Scottish,German,Ukrainian,Irish,French,Filipino,Polish,East Indian
2,Halifax,Canadian,English,Scottish,Irish,French,German,Dutch,Acadian,Welsh,Italian
2,Ottawa,Canadian,Irish,English,French,Scottish,German,Italian,Chinese,Polish,Dutch
2,London,English,Canadian,Scottish,Irish,German,French,Dutch,Italian,Polish,Portuguese
3,Toronto,Canadian,English,Chinese,East Indian,Irish,Scottish,Italian,Filipino,German,French
3,Vancouver,Chinese,English,Scottish,Canadian,Irish,East Indian,German,French,Filipino,Ukrainian


In [981]:
# Persist the clustered origins table (the cluster label index is written as the first column).
origins.to_csv(path_or_buf='originsclusters.csv')

### Now we have the tables with the K-means clusters. Through Machine Learning, we can see the similarities between cities.

Notebook by Francisco Tosetto da Silva, made for the Capstone Project of the IBM Data Science Professional Certificate.