In [1]:
pip install geocoder

Note: you may need to restart the kernel to use updated packages.


In [2]:
import geocoder

In [3]:
from sklearn.cluster import KMeans

In [4]:
import pandas as pd
!conda install -c conda-forge folium=0.5.0 --yes
import folium


Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [54]:
import matplotlib.cm as cm
import matplotlib.colors as colors

In [5]:
import numpy as np

In [6]:
from IPython.display import display

In [7]:
import requests
from pandas.io.json import json_normalize
import urllib
import json

## Introduction/Business Problem

#### Background 

As climate change is reducing the amount of snow every year, ski resorts are trying to change the experience in the resort. To put it in a nutshell, they are trying to replace ski with other activities. Their ultimate goal is to make sure they will attract as many tourists as before, or, in the best case scenario, more tourists than before.

#### Problem to solve

In this context, ski resorts representative might be looking for some new ideas to enhance the user experience. But a question remain : where to get these ideas ? which ski resorts have the same characteristics so they can take ideas and easily implement them in their resort ? That is the problem we will solve in this project. 


## Data 

To get the list of all ski resorts in France, and some infos about them, we will use the **Open Ski Map Data** (https://openskimap.org/#2/46.88/20.83). For venues data, we will use the **Foursquare API**

We will describe the data below, in the Exploratory Data Analysis section and what we intend to do with it. 


## Exploratory Data Analysis 

First, I load the geojson file from the URL.

In [8]:
url = 'http://tiles.skimap.org/geojson/ski_areas.geojson'
jsonurl = urllib.request.urlopen(url)
ski_area = json.loads(jsonurl.read())

Then, I locate the name of every resorts and their related region

In [9]:
ski_area = ski_area['features']

Then, I create a dataframe out of the json file. To do so, I create an empty dataframe with two columns namely 'region' and 'name'.

In [11]:
column_names  = ['country','region','locality','name']

In [12]:
ski_data =  pd.DataFrame(columns=column_names)
ski_data

Unnamed: 0,country,region,locality,name


Then I loop through the json file and append the name and region to the empty dataframe 

In [13]:
for data in ski_area:
    try:
        type = data['properties']['activities']
        country = data['properties']['location']['localized']['en']['country']
        region = data['properties']['location']['localized']['en']['region']
        locality = data['properties']['location']['localized']['en']['locality']
        name = data['properties']['name']
    except:
        pass
    
    ski_data = ski_data.append({'type': type,
                                'country': country,
                                'region': region, 
                                'locality': locality,
                                'name': name}, ignore_index=True)

In [14]:
ski_data

Unnamed: 0,country,region,locality,name,type
0,Slovenia,,Radlje ob Dravi,,[downhill]
1,Germany,Rhineland-Palatinate,Arft,,[downhill]
2,Spain,Castile and León,Espinosa de los Monteros,Estación de Esquí Lunada,[downhill]
3,Norway,Møre og Romsdal,Nordre Vartdal,,[downhill]
4,Norway,Møre og Romsdal,Norddal,,[downhill]
...,...,...,...,...,...
8769,Russia,Krasnodar Krai,,Гранд Отель Поляна | Gazprom Mountain Resort (...,"[downhill, nordic]"
8770,Norway,Hedmark,Mesnalia,Sjusjøen,"[downhill, nordic]"
8771,Norway,Hordaland,Håra,Røldal Skisenter,"[downhill, nordic]"
8772,Austria,Vorarlberg,Gemeinde Gaschurn,Silvretta-Bielerhöhe,"[downhill, nordic]"


Then I need to filter the dataset to get french ski stations 

In [15]:
ski_data = ski_data[ski_data['country'] == 'France']
ski_data.reset_index(inplace=True)


In [16]:
ski_data

Unnamed: 0,index,country,region,locality,name,type
0,126,France,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill]
1,127,France,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]"
2,131,France,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic]
3,222,France,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill]
4,229,France,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]"
...,...,...,...,...,...,...
391,8753,France,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]"
392,8754,France,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]"
393,8759,France,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]"
394,8762,France,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]"


In [17]:
ski_data.drop(['index'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [18]:
ski_data

Unnamed: 0,country,region,locality,name,type
0,France,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill]
1,France,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]"
2,France,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic]
3,France,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill]
4,France,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]"
...,...,...,...,...,...
391,France,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]"
392,France,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]"
393,France,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]"
394,France,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]"


In [19]:
ski_data.drop(['country'], axis=1, inplace=True)


In [20]:
ski_data.reset_index(inplace=True)

In [21]:
ski_data.drop(['index'], axis=1, inplace=True)

In [22]:
ski_data

Unnamed: 0,region,locality,name,type
0,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill]
1,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]"
2,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic]
3,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill]
4,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]"
...,...,...,...,...
391,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]"
392,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]"
393,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]"
394,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]"


I need to clean the dataset and especially, get ride of double entry

In [23]:
ski_data.drop_duplicates(subset = "name",
                        keep='last',
                        inplace = True)
ski_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,region,locality,name,type
0,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill]
1,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]"
2,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic]
3,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill]
4,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]"
...,...,...,...,...
391,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]"
392,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]"
393,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]"
394,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]"


Then, I geocode the dataset...

In [24]:
Latitude = []
Longitude = []

lat_lng_coords = None


for i in ski_data['locality']:
    g = geocoder.arcgis('{}, France'.format(i))
    lat_lng_coords = g.latlng
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    Latitude.append(latitude)
    Longitude.append(longitude)
    

In [25]:
ski_data['Latitude'] = Latitude
ski_data['Longitude'] = Longitude 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
ski_data

Unnamed: 0,region,locality,name,type,Latitude,Longitude
0,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill],46.79099,0.53843
1,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]",45.30269,5.87738
2,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic],48.07268,-0.77307
3,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill],45.45184,5.88389
4,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]",45.62079,6.00958
...,...,...,...,...,...,...
391,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]",45.99699,5.69182
392,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]",45.12820,5.58886
393,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]",44.54816,6.60715
394,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]",44.79418,6.92574


In [27]:
ski_data.rename(columns={'name':'Ski resorts'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [28]:
ski_data

Unnamed: 0,region,locality,Ski resorts,type,Latitude,Longitude
0,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill],46.79099,0.53843
1,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]",45.30269,5.87738
2,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic],48.07268,-0.77307
3,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill],45.45184,5.88389
4,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]",45.62079,6.00958
...,...,...,...,...,...,...
391,Auvergne-Rhône-Alpes,Haut Valromey,Plateau de Retord (Les Plans d'Hotonnes),"[downhill, nordic]",45.99699,5.69182
392,Auvergne-Rhône-Alpes,Lans-en-Vercors,Lans-en-Vercors,"[downhill, nordic]",45.12820,5.58886
393,Provence-Alpes-Côte d'Azur,Crévoux,Crévoux,"[downhill, nordic]",44.54816,6.60715
394,Provence-Alpes-Côte d'Azur,Abriès-Ristolas,Abriès-Ristolas,"[downhill, nordic]",44.79418,6.92574


## Mapping 

In [29]:
ski_map = folium.Map(location=[45.188529, 5.724524], zoom_start=6)

for lat, lng, type, locality in zip(ski_data['Latitude'], ski_data['Longitude'], ski_data['type'] , ski_data['locality']):
    label = '{}, {}'.format(locality, type)
    label = folium.Popup(label, parse_html=True)
    
    folium.CircleMarker(
        [lat, lng],
        radius=5, # define how big you want the circle markers to be
        popup=label,
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        parse_html=False).add_to(ski_map)
    

ski_map

**N.B** : there are some wrong coordinates due to the fact that some ski resorts' locality name are the same that actual town names situated in France. For example, the "Saint-Prix" ski resort is not obviously not situated near Paris. 

## Retrieving venues data from the Foursquare API

In [30]:
CLIENT_ID = 'ELFZ42THMXXN1LLKT5YPUU05Q412OGFCBBW4IEMPGPCGFWMO'
CLIENT_SECRET = 'JXTMRDTUGS2HL4XNBWDGIIRX004OR5SXBRMZSCHXQUID2ZVM'
VERSION = '20180605'

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: ELFZ42THMXXN1LLKT5YPUU05Q412OGFCBBW4IEMPGPCGFWMO
CLIENT_SECRET:JXTMRDTUGS2HL4XNBWDGIIRX004OR5SXBRMZSCHXQUID2ZVM


### Test - Identify venues in ski resorts

1- I used the Foursquare API to get the venues data. Especially, I requested every venues within 2000 meters of the coordinates of every neighborhood

2- I created a function to retrieve the categories of the venues

3- I created a dataframe with the retrieved data

In [31]:
#1
ski_latitude = ski_data.loc[391,'Latitude']
ski_longitude = ski_data.loc[391,'Longitude']

In [32]:
LIMIT = 100
radius = 5000
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, ski_latitude, ski_longitude, VERSION, radius, LIMIT)


In [33]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ecffe65fb34b5001b786452'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Hotonnes',
  'headerFullLocation': 'Hotonnes',
  'headerLocationGranularity': 'city',
  'totalResults': 5,
  'suggestedBounds': {'ne': {'lat': 46.041990045000084,
    'lng': 5.756475639946563},
   'sw': {'lat': 45.951989954999995, 'lng': 5.627164360053564}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '5bf5c1ada35f46002548abf3',
       'name': 'Le Relais Saint Didier',
       'location': {'address': '260 Grande Rue',
        'lat': 45.994409,
        'lng': 5.663571,
        'labeledLatLngs': [{'label': 'display',
          'lat': 45.994409,
          'lng': 5.663571}

In [34]:
#2
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

get_category_type

<function __main__.get_category_type(row)>

In [35]:
#3
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head(10)

Unnamed: 0,name,categories,lat,lng
0,Le Relais Saint Didier,Business Service,45.994409,5.663571
1,Martinod Marcel,Hotel,45.9733,5.701797
2,3D Étanchéité,Miscellaneous Shop,45.972005,5.701535
3,Plan d'Hotonnes,Ski Area,46.035769,5.701357
4,Auberge Gîte de la Praille,Restaurant,45.982488,5.634698


### Real Stuff - Get every venues in every resorts

I followed the same logic as above and created a function to get every venues in ski resorts

In [36]:
#1
def getNearbyVenus(names, latitudes, longitudes, radius=5000):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Ski resorts', 
                  'Resort Latitude', 
                  'Resort Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)
        
                  

In [37]:
ski_venues = getNearbyVenus(names=ski_data['Ski resorts'],
                                latitudes=ski_data['Latitude'], 
                                longitudes=ski_data['Longitude'])



La Poya (Chamonix)
Col de Marcieu
Domaine de Beldina
Le Granier
La Féclaz
Les Portes du Mont-Blanc
Col de Plainpalais
Aillon-Margériaz
Thollon les Mémises
Le Tourchet (Chamonix)
Les Chosalets (Chamonix)
Aiguille du Midi (Chamonix)
Les Planards (Chamonix)
Vormaine
Nistos
Valtin
Pierrefontaine Les Varans
None
Estenc-Entraunes
Château-Ville-Vieille
Allemont
Peïra-Cava
Grand Echaillon
Chichilianne
Tréminis
Les Entremonts
Sardières (Val Cenis)
Valdrôme
La Ruchère-St-Christophe
Val Pelens
Venosc-Vénéon
Les Rafforts - Hery-sur-Ugine
Col de Romeyère
Turini-Camp-d'Argent
Montmin - Col de la Forclaz
Le Val d'Ambin (Val Cenis)
Saint-Jean-de-Sixt
Romme-sur-Cluses
Laye
La Jarjatte
Méaudre
Val d'Allos - Le Seignus
Saint-Léger-les-Mélèzes
Mont-Saxonnex
Pelvoux-Vallouise
Espace Villard de Lans / Corrençon en Vercors
Auron Saint-Etienne-de-Tinée
Orcières
Bessans
Saint-François-Longchamp
Herbouilly
Valmorel
Station du Haut-Folin
Thollon-les-Memises
Sommand
Cornimont
Girmont-Val-d'Ajol
Payolle
Donon
Le H

In [38]:
ski_venues

Unnamed: 0,Ski resorts,Resort Latitude,Resort Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,La Poya (Chamonix),46.79099,0.53843,Loft Cinéma,46.820714,0.544282,Multiplex
1,La Poya (Chamonix),46.79099,0.53843,Auchan,46.797998,0.528085,Supermarket
2,La Poya (Chamonix),46.79099,0.53843,Gare SNCF de Châtellerault,46.819223,0.549120,Train Station
3,La Poya (Chamonix),46.79099,0.53843,Hotel ibis Chatellerault,46.797599,0.528032,Hotel
4,La Poya (Chamonix),46.79099,0.53843,Leclerc Chatellerault,46.823967,0.560818,Grocery Store
...,...,...,...,...,...,...,...
3783,Lac Blanc,48.12643,7.16141,Au Bois Le Sire***,48.126837,7.164606,Restaurant
3784,Lac Blanc,48.12643,7.16141,Gîte les 4 Saisons,48.125653,7.144396,Bed & Breakfast
3785,Lac Blanc,48.12643,7.16141,Les Alisiers,48.144528,7.167377,Hotel
3786,Lac Blanc,48.12643,7.16141,Le Domaine De Basil,48.101619,7.159670,Hotel


In [39]:
ski_venues.groupby('Ski resorts').count()

Unnamed: 0_level_0,Resort Latitude,Resort Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Ski resorts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abondance,4,4,4,4,4,4
Abriès-Ristolas,7,7,7,7,7,7
Aiguille du Midi (Chamonix),89,89,89,89,89,89
Aiguilles,7,7,7,7,7,7
Aillon-Margériaz,6,6,6,6,6,6
...,...,...,...,...,...,...
Verthemex (Vacheresse),5,5,5,5,5,5
Vormaine,89,89,89,89,89,89
Wangenbourg-Engenthal,4,4,4,4,4,4
Xonrupt-Longemer,26,26,26,26,26,26


In [40]:
ski_venues['Ski resorts']

0       La Poya (Chamonix)
1       La Poya (Chamonix)
2       La Poya (Chamonix)
3       La Poya (Chamonix)
4       La Poya (Chamonix)
               ...        
3783             Lac Blanc
3784             Lac Blanc
3785             Lac Blanc
3786             Lac Blanc
3787             Lac Blanc
Name: Ski resorts, Length: 3788, dtype: object

## Get the 5 most common venues in every ski resorts 

The 5 most common venues will determine the user experience profile of every ski resort. To get them, I created the following workflow :

1- I started by creating dummy variables to tell me if there is or not this category of venues in the resort

2- I replace the ski resort column back the first column of the dataframe

3- I grouped the dataframe by ski resort and computed the mean of appearance of every venue category. This gives me an idea of the density of every venue category in the ski resort. 

4- I decided to compute the top 5 of venue category, that is to say the most frequent venue category in every ski resort. The top 5 most frequent venue category was chosen as the data to be k-clustered

In [41]:
#1
# one hot encoding
ski_resort_onehot = pd.get_dummies(ski_venues[['Venue Category']], prefix="", prefix_sep="")

#2
# add ski resorts column back to dataframe
ski_resort_onehot['Ski resorts'] = ski_venues['Ski resorts'] 

# move ski resorts column to the first column
fixed_columns = [ski_resort_onehot.columns[-1]] + list(ski_resort_onehot.columns[:-1])
ski_resort_onehot = ski_resort_onehot[fixed_columns]

ski_resort_onehot.head()

Unnamed: 0,Ski resorts,Airport,Alternative Healer,American Restaurant,Apres Ski Bar,Arcade,Art Museum,Arts & Crafts Store,Arts & Entertainment,Athletics & Sports,...,Turkish Restaurant,Village,Vineyard,Volcano,Water Park,Waterfall,Wine Bar,Wine Shop,Zoo,Zoo Exhibit
0,La Poya (Chamonix),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,La Poya (Chamonix),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,La Poya (Chamonix),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,La Poya (Chamonix),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,La Poya (Chamonix),0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
ski_resort_onehot.shape

(3788, 246)

In [43]:
ski_resort_onehot.set_index('Ski resorts', inplace=True)

In [44]:
ski_resort_onehot.head()

Unnamed: 0_level_0,Airport,Alternative Healer,American Restaurant,Apres Ski Bar,Arcade,Art Museum,Arts & Crafts Store,Arts & Entertainment,Athletics & Sports,Auto Dealership,...,Turkish Restaurant,Village,Vineyard,Volcano,Water Park,Waterfall,Wine Bar,Wine Shop,Zoo,Zoo Exhibit
Ski resorts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
La Poya (Chamonix),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
La Poya (Chamonix),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
La Poya (Chamonix),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
La Poya (Chamonix),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
La Poya (Chamonix),0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
#3
ski_resort_grouped = ski_resort_onehot.groupby('Ski resorts').mean().reset_index()
ski_resort_grouped.head()

Unnamed: 0,Ski resorts,Airport,Alternative Healer,American Restaurant,Apres Ski Bar,Arcade,Art Museum,Arts & Crafts Store,Arts & Entertainment,Athletics & Sports,...,Turkish Restaurant,Village,Vineyard,Volcano,Water Park,Waterfall,Wine Bar,Wine Shop,Zoo,Zoo Exhibit
0,Abondance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Abriès-Ristolas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aiguille du Midi (Chamonix),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011236,0.0,0.0
3,Aiguilles,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aillon-Margériaz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
ski_resort_grouped.shape

(297, 246)

In [47]:
#4
num_top_venues = 10

for resort in ski_resort_grouped['Ski resorts']:
    print("----"+resort+"----")
    temp = ski_resort_grouped[ski_resort_grouped['Ski resorts'] == resort].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Abondance----
         venue  freq
0        Hotel  0.50
1       Hostel  0.25
2     Ski Area  0.25
3   Playground  0.00
4          Pub  0.00
5  Post Office  0.00
6         Pool  0.00
7        Plaza  0.00
8      Airport  0.00
9   Racecourse  0.00


----Abriès-Ristolas----
               venue  freq
0              Hotel  0.29
1           Mountain  0.14
2     Science Museum  0.14
3   Business Service  0.14
4  French Restaurant  0.14
5           Ski Area  0.14
6         Playground  0.00
7                Pub  0.00
8        Post Office  0.00
9               Pool  0.00


----Aiguille du Midi (Chamonix)----
                 venue  freq
0                Hotel  0.20
1    French Restaurant  0.13
2             Ski Area  0.08
3           Restaurant  0.06
4                  Bar  0.06
5        Train Station  0.03
6  Japanese Restaurant  0.03
7                 Café  0.02
8               Lounge  0.02
9            Cable Car  0.02


----Aiguilles----
         venue  freq
0        Hotel  0.43
1     Ski

In [120]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [121]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Ski resorts']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
ski_resort_venues_sorted = pd.DataFrame(columns=columns)
ski_resort_venues_sorted['Ski resorts'] = ski_resort_grouped['Ski resorts']

for ind in np.arange(ski_resort_grouped.shape[0]):
    ski_resort_venues_sorted.iloc[ind, 1:] = return_most_common_venues(ski_resort_grouped.iloc[ind, :], num_top_venues)

ski_resort_venues_sorted.head()

Unnamed: 0,Ski resorts,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Abondance,Hotel,Hostel,Ski Area,Zoo Exhibit,Fast Food Restaurant,Fishing Spot,Flower Shop,Fondue Restaurant,Food,Food & Drink Shop
1,Abriès-Ristolas,Hotel,Business Service,Ski Area,Science Museum,Mountain,French Restaurant,Food Truck,Forest,Food Service,Farmers Market
2,Aiguille du Midi (Chamonix),Hotel,French Restaurant,Ski Area,Bar,Restaurant,Japanese Restaurant,Train Station,Campground,Mountain,Café
3,Aiguilles,Hotel,Bank,Mountain,Ski Area,Ski Shop,Forest,Food Truck,Food Service,Food & Drink Shop,Farm
4,Aillon-Margériaz,Ski Area,Rental Car Location,Museum,Mountain,Snack Place,Food Service,Food & Drink Shop,Food,Farmers Market,Food Truck


## Cluster Neighborhood

1- I decide to create five clusters based on the most 10 frequent venue category in every ski resort

2- I added the coordinates to the clustered resorts

3- I mapped the data using Folium

In [122]:
#1
# set number of clusters
kclusters = 5

ski_resort_grouped_clustering = ski_resort_grouped.drop('Ski resorts', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(ski_resort_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 1, 2, 1, 0, 4, 2, 2, 0, 2], dtype=int32)

In [123]:
#2
# add clustering labels
ski_resort_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

ski_merged = ski_data



In [124]:
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
ski_merged = ski_merged.join(ski_resort_venues_sorted.set_index('Ski resorts'), on='Ski resorts')

ski_merged.head() # check the last columns!

Unnamed: 0,region,locality,Ski resorts,type,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Auvergne-Rhône-Alpes,,La Poya (Chamonix),[downhill],46.79099,0.53843,1.0,Hotel,Supermarket,Grocery Store,Train Station,Multiplex,Food,Fishing Spot,Flower Shop,Fondue Restaurant,Food & Drink Shop
1,Auvergne-Rhône-Alpes,Plateau-des-Petites-Roches,Col de Marcieu,"[nordic, downhill]",45.30269,5.87738,3.0,Train Station,Skate Park,Scenic Lookout,Zoo Exhibit,Fast Food Restaurant,Fishing Spot,Flower Shop,Fondue Restaurant,Food,Food & Drink Shop
2,Auvergne-Rhône-Alpes,Laval,Domaine de Beldina,[nordic],48.07268,-0.77307,2.0,Hotel,Supermarket,Fast Food Restaurant,French Restaurant,Airport,Castle,Other Repair Shop,Shopping Mall,Motorcycle Shop,Garden
3,Auvergne-Rhône-Alpes,Entremont-le-Vieux,Le Granier,[downhill],45.45184,5.88389,0.0,Mountain,Scenic Lookout,Ski Area,Zoo Exhibit,Food,Fast Food Restaurant,Fishing Spot,Flower Shop,Fondue Restaurant,Food & Drink Shop
4,Auvergne-Rhône-Alpes,Les Déserts,La Féclaz,"[downhill, nordic]",45.62079,6.00958,0.0,Ski Area,French Restaurant,Miscellaneous Shop,Diner,Pizza Place,Zoo Exhibit,Fishing Spot,Flower Shop,Fondue Restaurant,Food & Drink Shop


In [169]:
#Just need to convert the Cluster Labels data type to integers
ski_merged['Cluster Labels'] = ski_merged['Cluster Labels'].fillna(0)
ski_merged['Cluster Labels'] = ski_merged['Cluster Labels'].astype(int)

In [170]:
ski_merged['Cluster Labels'].dtype

dtype('int64')

In [168]:
#3
# create map
map_clusters = folium.Map(location=[45.188529, 5.724524], zoom_start=6)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ski_merged['Latitude'], ski_merged['Longitude'], ski_merged['Ski resorts'], ski_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters