In [1]:
# ---------------- #
# Common Libraries #
# ---------------- #
      
# Standard Imports
import os
import re
import nltk
import requests
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

## setting basic style parameters for matplotlib
plt.rc('figure', figsize=(13, 7))
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-<name>' and
# newer releases no longer ship the old names. Prefer the new name when this
# install provides it; otherwise fall back to the original name.
_darkgrid_style = next(
    (style for style in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if style in plt.style.available),
    'seaborn-darkgrid',
)
plt.style.use(_darkgrid_style)

# ------------- #
# LOCAL IMPORTS #
# ------------- #

# importing sys
import sys
# adding 00_helper_files to the system path
sys.path.insert(0, '/Users/qmcbt/codeup-data-science/00_helper_files')
# env containing sensitive access credentials
import env
from env import user, password, host
from env import get_db_url

# Import Local Helper Modules
import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acq
import QMCBT_02_prepare as prep
import QMCBT_03_explore as exp
import QMCBT_04_visualize as viz
import QMCBT_05_model as mod
import QMCBT_wrangle as w


In [2]:
# Read in the DataFrame stored in the local file michelin_df.pickle.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle('michelin_df.pickle')

In [4]:
df.T.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6770,6771,6772,6773,6774,6775,6776,6777,6778,6779
name,King's Joy,Xin Rong Ji (Xinyuan South Road),Taian Table,Ultraviolet by Paul Pairet,Quince,Atelier Crenn,Addison,Manresa,Benu,SingleThread,...,Mimo,A Casa do Porco,Più Iguatemi,Bistrot de Paris,AE! Cozinha,Szóstka,Fiorentina,Zazie,Butchery & Wine,alewino
address,"2 Wudaoying Hutong, Beijing, China Mainland","1F, East Tower, Genesis Beijing, 8 Xinyuan Sou...","101-102, Building No. 1, Garden Office, No.161...",'somewhere in Shanghai' - meet at Mr & Mrs Bun...,"470 Pacific Ave., San Francisco, 94133, USA","3127 Fillmore St., San Francisco, 94123, USA","5200 Grand Del Mar Way, San Diego, 92130, USA","320 Village Ln., Los Gatos, 95030, USA","22 Hawthorne St., San Francisco, 94105, USA","131 North St., Healdsburg, 95448, USA",...,"Rua Caconde 118, São Paulo, 01425-010, Brazil","Rua Araujo 124, São Paulo, 01220-020, Brazil","Avenida Brigadeiro Faria Lima 2232, São Paulo,...","Rua Augusta 2542, São Paulo, 01412-100, Brazil","Rua Áurea 285, São Paulo, 04015-070, Brazil","Plac Powstańców Warszawy 9, Warsaw, 00 039, Po...","ul. Grodzka 63, Cracow, 31 044, Poland","ul. Józefa 34, Cracow, 32 056, Poland","ul. Żurawia 22, Warsaw, 00 515, Poland","ul. Mokotowska 48, Warsaw, 00 543, Poland"
location,"Beijing, China Mainland","Beijing, China Mainland","Shanghai, China Mainland","Shanghai, China Mainland","San Francisco, USA","San Francisco, USA","San Diego, USA","Los Gatos, USA","San Francisco, USA","Healdsburg, USA",...,"São Paulo, Brazil","São Paulo, Brazil","São Paulo, Brazil","São Paulo, Brazil","São Paulo, Brazil","Warsaw, Poland","Cracow, Poland","Cracow, Poland","Warsaw, Poland","Warsaw, Poland"
price,¥¥¥,¥¥¥¥,¥¥¥¥,¥¥¥¥,$$$$,$$$$,$$$$,$$$$,$$$$,$$$$,...,,,,,,€€,€€,€,€€,€€
cuisine,Vegetarian,Taizhou,Innovative,Innovative,"Contemporary, Californian","Contemporary, French","Contemporary, Californian","Contemporary, Californian","Asian, Contemporary","Contemporary, Californian",...,Modern Cuisine,Brazilian,"Italian, Creative",French,"Creative, Market Cuisine","Polish, Modern Cuisine","Creative, Polish",French,"Meats and Grills, Traditional Cuisine","Polish, Traditional Cuisine"
longitude,116.410004,116.450148,121.474049,121.48509,-122.403261,-122.43586,-117.198891,-121.98071,-122.39906,-122.869723,...,-46.658201,-46.644742,-46.688315,-46.665519,-46.645587,21.012698,19.938179,19.946949,21.015495,21.022057
latitude,39.946681,39.94638,31.221807,31.240358,37.797505,37.79835,32.941297,37.22761,37.785376,38.61226,...,-23.574572,-23.544867,-23.577106,-23.563072,-23.586913,52.235803,50.055898,50.05124,52.228581,52.225201
phone_number,861084049191.0,861065015501.0,8617301605350.0,,14157758500.0,14154400460.0,18583141900.0,14083544330.0,14156854860.0,17077234646.0,...,551130522517.0,551132582578.0,551131987649.0,551130631675.0,551134768521.0,48224700342.0,48124264608.0,48500410829.0,48225023118.0,48226283830.0
url,https://guide.michelin.com/en/beijing-municipa...,https://guide.michelin.com/en/beijing-municipa...,https://guide.michelin.com/en/shanghai-municip...,https://guide.michelin.com/en/shanghai-municip...,https://guide.michelin.com/en/california/san-f...,https://guide.michelin.com/en/california/san-f...,https://guide.michelin.com/en/california/us-sa...,https://guide.michelin.com/en/california/los-g...,https://guide.michelin.com/en/california/san-f...,https://guide.michelin.com/en/california/heald...,...,https://guide.michelin.com/en/sao-paulo-region...,https://guide.michelin.com/en/sao-paulo-region...,https://guide.michelin.com/en/sao-paulo-region...,https://guide.michelin.com/en/sao-paulo-region...,https://guide.michelin.com/en/sao-paulo-region...,https://guide.michelin.com/en/masovia/warsaw/r...,https://guide.michelin.com/en/lesser-poland/cr...,https://guide.michelin.com/en/lesser-poland/cr...,https://guide.michelin.com/en/masovia/warsaw/r...,https://guide.michelin.com/en/masovia/warsaw/r...
website_url,,,https://taian-table.cn/,https://uvbypp.cc/,http://www.quincerestaurant.com,https://www.ateliercrenn.com/,https://www.addisondelmar.com/,https://www.manresarestaurant.com/,https://www.benusf.com/,https://www.singlethreadfarms.com/,...,https://www.mimorestaurante.com.br/,https://acasadoporco.com.br/,https://www.piurestaurante.com.br/,https://bistrotdeparis.com.br/,,https://warszawa.hotel.com.pl/hotel-warszawa,https://fiorentina.com.pl/,http://www.zaziebistro.pl/,https://www.butcheryandwine.pl/,http://www.alewino.pl/


In [5]:
# CLEAN

In [6]:
# basic cleaning function:
# add_stopwords = ['r', 'u', '2', 'ltgt']

def clean(text, stem_or_lem=None, add_stopwords=None):
    """
    Description:
    Simplified text cleaning function. Lowercases the text, strips accents,
    removes every character that is not a-z, 0-9 or whitespace, splits the
    result on whitespace, removes stopwords and then optionally stems or
    lemmatizes the remaining words.

    Required Imports:
    import re
    import nltk
    import unicodedata

    Arguments:
             text = The text (str) you want to clean
      stem_or_lem = Default is None; "stem" will perform stemming on your text; "lem" will lemmatize it.
                    Any other value leaves the words as they are.
    add_stopwords = None by default (no extra stopwords); pass a list of words (or a single word)
                    that you want to treat as stopwords in addition to NLTK's English list.

    Returns:
    returns list of cleaned words.

    Notes:
    Prints one status line per call saying whether stemming or lemmatizing ran.
    """
    # 1. Lowercase everything
    text = text.lower()
    # 2. Strip accents: NFKD splits accented letters into base letter + mark,
    #    and the ASCII round-trip then drops every non-ASCII character
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # 3. Remove special characters
    text = re.sub(r"[^a-z0-9\s]", '', text)
    # 4. Tokenize: only a-z, 0-9 and whitespace are left, so a whitespace split
    #    is all that is needed
    words = text.split()
    # 5. Remove StopWords before stemming/lemmatizing: the stopword list holds
    #    unstemmed words, and stemming can alter them (e.g. Porter stems "this"
    #    to "thi"), which would let them slip through the filter
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    if add_stopwords:
        if isinstance(add_stopwords, str):
            # a bare string would otherwise be added one character at a time
            add_stopwords = [add_stopwords]
        stopword_set.update(add_stopwords)
    words = [word for word in words if word not in stopword_set]
    # 6. Stemming or Lemmatizing (applied to the words that are returned)
    if stem_or_lem == "stem":
        ps = nltk.stem.PorterStemmer()
        words = [ps.stem(word) for word in words]
        print('Stemming Performed')
    elif stem_or_lem == "lem":
        wnl = nltk.stem.WordNetLemmatizer()
        words = [wnl.lemmatize(word) for word in words]
        print('Lemmatizing Performed')
    else:
        print('No Stemming or Lemmatizing Performed')

    return words


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6780 entries, 0 to 6779
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     6780 non-null   object 
 1   address                  6780 non-null   object 
 2   location                 6780 non-null   object 
 3   price                    6726 non-null   object 
 4   cuisine                  6780 non-null   object 
 5   longitude                6780 non-null   float64
 6   latitude                 6780 non-null   float64
 7   phone_number             6649 non-null   float64
 8   url                      6780 non-null   object 
 9   website_url              5542 non-null   object 
 10  award                    6780 non-null   object 
 11  facilities_and_services  6732 non-null   object 
 12  data                     6780 non-null   object 
dtypes: float64(3), object(10)
memory usage: 688.7+ KB


In [8]:
# Change DTYPE
def change_dtype_str(df):
    '''
    ## Description:
    Custom function that converts this project's text columns to the pandas
    'string' dtype, filling missing values first.
    Missing values become '' in every converted column except
    facilities_and_services, where they become 'NONE'.
    The converted columns are assigned back onto the DataFrame that was passed
    in, so the caller's DataFrame is modified; the same object is returned.
    ## Arguments:
    df = DataFrame containing the columns name, address, location, cuisine,
         facilities_and_services, award and data
    ## Returns:
    df - the same DataFrame with the listed columns converted
    ## Raises:
    KeyError if one of the listed columns is missing from df
    '''
    # column name -> placeholder written in place of missing values
    fill_values = {
        'name': '',
        'address': '',
        'location': '',
        'cuisine': '',
        'facilities_and_services': 'NONE',
        'award': '',
        'data': '',
    }
    for column, fill_value in fill_values.items():
        # bracket indexing always targets the column, unlike attribute access,
        # which can collide with DataFrame attributes and methods
        df[column] = df[column].fillna(fill_value).astype('string')
    return df


In [9]:
# change_dtype_str assigns the converted columns onto `df` itself, so `df` is
# updated even though the return value is not rebound here; the returned frame
# is what this cell displays.
change_dtype_str(df)

Unnamed: 0,name,address,location,price,cuisine,longitude,latitude,phone_number,url,website_url,award,facilities_and_services,data
0,King's Joy,"2 Wudaoying Hutong, Beijing, China Mainland","Beijing, China Mainland",¥¥¥,Vegetarian,116.410004,39.946681,8.610840e+11,https://guide.michelin.com/en/beijing-municipa...,,3 MICHELIN Stars,"Air conditioning,American Express credit card,...","Just a stone’s throw from Yonghe Temple, this ..."
1,Xin Rong Ji (Xinyuan South Road),"1F, East Tower, Genesis Beijing, 8 Xinyuan Sou...","Beijing, China Mainland",¥¥¥¥,Taizhou,116.450148,39.946380,8.610650e+11,https://guide.michelin.com/en/beijing-municipa...,,3 MICHELIN Stars,"Air conditioning,Car park,China UnionPay,Count...",This branch of the chain restaurant opened in ...
2,Taian Table,"101-102, Building No. 1, Garden Office, No.161...","Shanghai, China Mainland",¥¥¥¥,Innovative,121.474049,31.221807,8.617302e+12,https://guide.michelin.com/en/shanghai-municip...,https://taian-table.cn/,3 MICHELIN Stars,"Air conditioning,American Express credit card,...",A fixture on the city’s dining scene since 201...
3,Ultraviolet by Paul Pairet,'somewhere in Shanghai' - meet at Mr & Mrs Bun...,"Shanghai, China Mainland",¥¥¥¥,Innovative,121.485090,31.240358,,https://guide.michelin.com/en/shanghai-municip...,https://uvbypp.cc/,3 MICHELIN Stars,"Air conditioning,American Express credit card,...",This multi-sensory experience at a secret loca...
4,Quince,"470 Pacific Ave., San Francisco, 94133, USA","San Francisco, USA",$$$$,"Contemporary, Californian",-122.403261,37.797505,1.415776e+10,https://guide.michelin.com/en/california/san-f...,http://www.quincerestaurant.com,3 MICHELIN Stars,"Air conditioning,American Express credit card,...","An air of refinement infuses this dining room,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6775,Szóstka,"Plac Powstańców Warszawy 9, Warsaw, 00 039, Po...","Warsaw, Poland",€€,"Polish, Modern Cuisine",21.012698,52.235803,4.822470e+10,https://guide.michelin.com/en/masovia/warsaw/r...,https://warszawa.hotel.com.pl/hotel-warszawa,Bib Gourmand,"Air conditioning,American Express credit card,...",Take in rooftop views from this appealing glas...
6776,Fiorentina,"ul. Grodzka 63, Cracow, 31 044, Poland","Cracow, Poland",€€,"Creative, Polish",19.938179,50.055898,4.812426e+10,https://guide.michelin.com/en/lesser-poland/cr...,https://fiorentina.com.pl/,Bib Gourmand,"Air conditioning,American Express credit card,...","Sit under vaulted, red-brick ceilings to enjoy..."
6777,Zazie,"ul. Józefa 34, Cracow, 32 056, Poland","Cracow, Poland",€,French,19.946949,50.051240,4.850041e+10,https://guide.michelin.com/en/lesser-poland/cr...,http://www.zaziebistro.pl/,Bib Gourmand,"Air conditioning,Booking essential - dinner,Cr...",You’ll find this lively bistro in a corner spo...
6778,Butchery & Wine,"ul. Żurawia 22, Warsaw, 00 515, Poland","Warsaw, Poland",€€,"Meats and Grills, Traditional Cuisine",21.015495,52.228581,4.822502e+10,https://guide.michelin.com/en/masovia/warsaw/r...,https://www.butcheryandwine.pl/,Bib Gourmand,"American Express credit card,Booking essential...",The name of this modern bistro says it all: st...


In [10]:
# Rows whose facilities_and_services value is the 'NONE' placeholder
df.loc[df['facilities_and_services'] == 'NONE', 'facilities_and_services']

1994    NONE
4692    NONE
4698    NONE
4706    NONE
4707    NONE
4710    NONE
4723    NONE
4726    NONE
4727    NONE
4730    NONE
4735    NONE
4736    NONE
4740    NONE
4743    NONE
4744    NONE
4745    NONE
5283    NONE
5284    NONE
5285    NONE
5290    NONE
5291    NONE
5293    NONE
5294    NONE
5298    NONE
5299    NONE
5301    NONE
5305    NONE
5306    NONE
5307    NONE
5308    NONE
5309    NONE
5310    NONE
5311    NONE
5313    NONE
5316    NONE
5330    NONE
5331    NONE
5332    NONE
5333    NONE
5334    NONE
5335    NONE
5336    NONE
5338    NONE
5339    NONE
5340    NONE
5341    NONE
5343    NONE
5344    NONE
Name: facilities_and_services, dtype: string

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6780 entries, 0 to 6779
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   name                     6780 non-null   string 
 1   address                  6780 non-null   string 
 2   location                 6780 non-null   string 
 3   price                    6726 non-null   object 
 4   cuisine                  6780 non-null   string 
 5   longitude                6780 non-null   float64
 6   latitude                 6780 non-null   float64
 7   phone_number             6649 non-null   float64
 8   url                      6780 non-null   object 
 9   website_url              5542 non-null   object 
 10  award                    6780 non-null   string 
 11  facilities_and_services  6780 non-null   string 
 12  data                     6780 non-null   string 
dtypes: float64(3), object(3), string(7)
memory usage: 688.7+ KB
