# Notebook for Capstone Final Project.

### Importing libs and datasets.

In [1]:
import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim 
from geopy.point import Point
import requests
import folium
from random import randint

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Russia_by_population'
df_u = pd.read_html(url, header = 0)[-1].rename({'Rank (2017)' : 'Number', 'City/town' : 'City', 'Population(2017 estimate)[1][2]' : 'Population'}, axis = 1).set_index('Number')
df = df_u.loc[:, ['City', 'Population', 'Russian']]
res_loc = pd.read_csv('ru.csv') # reserve location dataset, downloaded from https://simplemaps.com/data/ru-cities
df.head()

Unnamed: 0_level_0,City,Population,Russian
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Moscow,12380664,Москва
2,Saint Petersburg,5281579,Санкт-Петербург
3,Novosibirsk,1602915,Новосибирск
4,Yekaterinburg,1455514,Екатеринбург
5,Nizhny Novgorod,1261666,Нижний Новгород


***
*I chose dataset of russian cities because I'm from Russia (logical, right?). My mission is to make obvious which type of food establishment is the most popular in every Russian city (in truth, I will use only top 300 cities by population). That knowledge can give potential investors an understanding of which type of restaurant they should open, for example, or which city offers the best food of some cuisine (the more restaurants of the same type, the better this cuisine).*
***

***
*Getting location data for every city. As a reserve dataset I am using .csv-file, because sometimes geolocator can't find right city.*
***

In [3]:
lat = []
long = []
for i in range(301):
    address = df.City[i+1]
    try:
        geolocator = Nominatim(timeout = 2)
        location = geolocator.geocode(address, country_codes = "ru")
        lat.append(location.latitude)
        long.append(location.longitude)
    except:
        lat.append(res_loc.loc[res_loc['city'] == address].lat[res_loc.loc[res_loc['city'] == address].lat.index[0]])
        long.append(res_loc.loc[res_loc['city'] == address].lng[res_loc.loc[res_loc['city'] == address].lng.index[0]])      

  


***
*I'm dropping Salavat and Kurgan, because FourSquare API have problems with food category in this towns (and I can't find out the reason).*
***

In [4]:
df = df.iloc[0:301]
df['Latitude'] = lat
df['Longitude'] = long
df.drop(index = df[df['City'] == 'Kurgan'].index.values[0], inplace = True)
df.drop(index = df[df['City'] == 'Salavat'].index.values[0], inplace = True)
df.reset_index(drop = True, inplace = True)
df.head(10)

Unnamed: 0,City,Population,Russian,Latitude,Longitude
0,Moscow,12380664,Москва,55.750446,37.617494
1,Saint Petersburg,5281579,Санкт-Петербург,59.938732,30.316229
2,Novosibirsk,1602915,Новосибирск,55.028217,82.923451
3,Yekaterinburg,1455514,Екатеринбург,56.839104,60.60825
4,Nizhny Novgorod,1261666,Нижний Новгород,56.328571,44.003506
5,Kazan,1231878,Казань,55.782355,49.124227
6,Chelyabinsk,1198858,Челябинск,55.159841,61.402555
7,Omsk,1178391,Омск,54.991375,73.371529
8,Samara,1169719,Самара,53.198627,50.113987
9,Rostov-on-Don,1125299,Ростов-на-Дону,47.221386,39.71142


In [5]:
CLIENT_ID = 'KSSTY3BOSMGYCAXZUT5VBR1I2OYL51FZTEIYRFMU5ILFEZ2H' # your Foursquare ID
CLIENT_SECRET = '5GPSTEMZHJLABL4FTNX15L2SUXAZZPXHEOQAQEMRIDTPOXG4' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [6]:
print('Method to get type from the given row.')
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
from pandas.io.json import json_normalize

Method to get type from the given row.


***
*Code for FourSquare Data extracting. Checked lower cases of city names, because sometimes they have one upper case, some upper cases or all lower cases. That's just a precaution.  
Dataset with city name as index and top 3 types of food establishments is a result.*
***

In [7]:
food = {}
for city in zip(df['Latitude'],df['Longitude'],df['City']):
    #print(str(one_city))
    city_low = str(city[2]).lower()
    latitude = city[0]
    longitude = city[1]
    search_query = 'Food'
    radius = 14500
    limit = 200
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&query={}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,latitude,longitude,search_query,radius,limit)
    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    
    nearby_venues = json_normalize(venues) # flatten JSON

    # filter columns
    filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    nearby_venues = nearby_venues.loc[:, filtered_columns]

    # filter the category for each row
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

    # clean columns
    list_ = []
    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
    ab = pd.DataFrame((nearby_venues.categories.value_counts()))
    if nearby_venues.categories.value_counts().shape[0] == 0:
        list_.extend(['None', 0, 'None', 0, 'None', 0])
    elif nearby_venues.categories.value_counts().shape[0] == 1:
        list_.extend([nearby_venues.categories.value_counts().index[0], nearby_venues.categories.value_counts()[0], 'None', 0, 'None', 0])
    elif nearby_venues.categories.value_counts().shape[0] == 2:
        list_.extend([nearby_venues.categories.value_counts().index[0], nearby_venues.categories.value_counts()[0], nearby_venues.categories.value_counts().index[1], nearby_venues.categories.value_counts()[1], 'None', 0])
    else:
        list_.extend([nearby_venues.categories.value_counts().index[0], nearby_venues.categories.value_counts()[0], nearby_venues.categories.value_counts().index[1], nearby_venues.categories.value_counts()[1], nearby_venues.categories.value_counts().index[2], nearby_venues.categories.value_counts()[2]])
    food.update({city_low.title():list_})

KeyError: 'groups'

In [None]:
final = pd.DataFrame.from_dict(food, orient = 'index', columns = ['1st place','№','2nd place','№','3rd place','№'])
final.index.name = 'Сity'
print(final['1st place'].unique())
final.head()

***
*Code for map with colored markers on cities. Every color means the most popular type of food establishment in this city. Map is fully interactive, all markers of every color can be visible or hidden.  
Note: a large void about north-east of Russia is not a code error, it's just that there aren't many cities there. This is Siberia.*
***

In [None]:
latitude = 60
longitude = 100
map_russia = folium.Map(location=[latitude, longitude], tiles='Stamen Terrain', zoom_start=3)
colormap = []

# add markers to map
for i in range(len(final['1st place'].unique())):
    lgd_txt = '<span style="color: {col};">{txt}</span>'
    clr = '#%06X' % randint(0, 0xFFFFFF)
    feature[i] = folium.FeatureGroup(name = lgd_txt.format(txt = final['1st place'].unique()[i], col = clr))
    colormap.append(clr)
    for j in range(len(final['1st place'])):
        if final['1st place'][j] == final['1st place'].unique()[i]:
            for lat, lng, label in zip(df.Latitude[df.City == final.index[j]], df.Longitude[df.City == final.index[j]], df.City[df.City == final.index[j]]):
                label = folium.Popup(label, parse_html=True)
                folium.CircleMarker(
                    [lat, lng],
                    radius=5,
                    popup=label,
                    color=clr,
                    fill=True,
                    fill_color=clr,
                    fill_opacity=0.7,
                    parse_html=False).add_to(feature[i])
                map_russia.add_child(feature[i])
    feature.append(0)
                
map_russia.add_child(folium.map.LayerControl(collapsed= False))
map_russia