In [1]:
# ---------------- #
# Common Libraries #
# ---------------- #
      
# Standard Imports
import os
import re
import nltk
import requests
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

## setting basic style parameters for matplotlib
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever name this installation provides.
_darkgrid_style = 'seaborn-v0_8-darkgrid'
plt.style.use(_darkgrid_style if _darkgrid_style in plt.style.available else 'seaborn-darkgrid')

# ------------- #
# LOCAL IMPORTS #
# ------------- #

# importing sys
import sys
# adding 00_helper_files to the system path
sys.path.insert(0, '/Users/qmcbt/codeup-data-science/00_helper_files')
# env containing sensitive access credentials
import env
from env import user, password, host
from env import get_db_url

# Import Local Helper Modules
import QMCBT_00_quicktips as qt
import QMCBT_01_acquire as acq
import QMCBT_02_prepare as prep
import QMCBT_03_explore as exp
import QMCBT_04_visualize as viz
import QMCBT_05_model as mod
import QMCBT_wrangle as w

# Import py modules
import prepare as p

In [2]:
# Read-in the Michelin DataFrame from a local pickle file
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source
df = pd.read_pickle('michelin_df.pickle')

In [3]:
# Display DataFrame
df.T.head(13)[0]

name                                                              King's Joy
address                          2 Wudaoying Hutong, Beijing, China Mainland
location                                             Beijing, China Mainland
price                                                                    ¥¥¥
cuisine                                                           Vegetarian
longitude                                                         116.410004
latitude                                                           39.946681
phone_number                                                  861084049191.0
url                        https://guide.michelin.com/en/beijing-municipa...
website_url                                                              NaN
award                                                       3 MICHELIN Stars
facilities_and_services    Air conditioning,American Express credit card,...
data                       Just a stone’s throw from Yonghe Temple, this ...

In [4]:
df = p.create_features(df)

In [5]:
df.T.head(20)[0]

name                                                              king's joy
address                          2 wudaoying hutong, beijing, china mainland
location                                             beijing, china mainland
price                                                                    ¥¥¥
cuisine                                                           vegetarian
longitude                                                         116.410004
latitude                                                           39.946681
url                        https://guide.michelin.com/en/beijing-municipa...
award                                                       3 michelin stars
facilities_and_services    air conditioning,american express credit card,...
data                       just a stone’s throw from yonghe temple, this ...
price_level                                                                3
city                                                                 beijing

In [None]:
# Display DataFrame information
df.info()

In [None]:
df = p.change_dtype_str(df)

In [None]:
df.T.head(16)[0]

In [None]:
# Display DataFrame information
df.info()

In [None]:
# NOTE(review): create_features was already applied to df in an earlier cell, and the
# displayed output suggests it drops and adds columns — confirm it is safe to run twice
df = p.create_features(df)

In [None]:
df.T.head()[0]

<div class="alert alert-info">

# Observations
* Dropping all nulls would remove 1336 documents (~135 documents have >1 Null)  

                          price:   54 Nulls
                   phone_number:  131 Nulls
                    website_url: 1238 Nulls
        facilities_and_services:   48 Nulls

* facilities_and_services nunique is 1571 

        * there are only 120 documents that share values with 10 or more other documents
        * there are 540 documents that share values with 2-9 other documents
        * there are 910 documents that are entirely unique

* There are only 16 locations that have more than 50 documents (awardees) and only half of those contain more than 100 documents  

        * Tokyo, Japan                    422
        * Kyoto, Japan                    196
        * Osaka, Japan                    193
        * New York, USA                   177
        * Paris, France                   164
        * Hong Kong                       135
        * Singapore                       119
        * London, United Kingdom          104
        * Bangkok, Thailand               103

* There are 863 unique cuisine values; 26 cuisine types that have more than 50 documents and only 9 that have more than 100 documents; 445 unique cuisine type combinations have only one document.  
        
        * Modern Cuisine              994
        * Creative                    416
        * Japanese                    287
        * Traditional Cuisine         218
        * French                      184
        * Street Food                 157
        * Italian                     130
        * Contemporary                114
        * Cantonese                   112

* There are four different award types  

        * Bib Gourmand        3411
        * 1 MICHELIN Star     2744
        * 2 MICHELIN Stars     483
        * 3 MICHELIN Stars     142

# Clean
### DTYPE & IMPUTE
* Cast facilities_and_services astype('string') and fillna('NONE')
* Cast data astype('string')

# Feature Engineer
* NON-MVP Option - Create buckets and encode locations and cuisine
* Encode award 
* Use facilities_and_services as a feature
* IMPUTE price with len and encode


In [None]:
#TODO Replace price with len count and IMPUTE NaN with MODE
#TODO Separate City and Country for wordcloud clustering and exploration

In [None]:
# show DataFrame without Nulls
df.shape, df.dropna().shape

In [None]:
# show records where facilities_and_services is null
df[df.facilities_and_services.isnull()]

In [None]:
# Display count of unique entries
df.facilities_and_services.nunique()

In [None]:
# Show aggregated value counts
df.facilities_and_services.value_counts().head(662)

In [None]:
df.award.value_counts()

In [None]:
# Unique of locations
df.location.nunique()

In [None]:
# location counts > 50
df.location.value_counts().head(17)

In [None]:
df.cuisine.nunique()

In [None]:
# least frequent cuisine values: the last 445 rows of the frequency table
# (value_counts sorts descending, so tail() shows the rarest entries, not counts > 50)
df.cuisine.value_counts().tail(445)

# CLEAN

In [None]:
# basic cleaning function:
# add_stopwords = ['r', 'u', '2', 'ltgt']

def clean(text, stem_or_lem=None, add_stopwords=None):
    """
    Description:
    Simplified text cleaning function. Lowercases the text, drops non-ASCII
    characters, strips everything except a-z, 0-9 and whitespace, optionally
    stems or lemmatizes each word, and finally removes stopwords.

    Required Imports:
    import re
    import nltk
    import unicodedata

    Arguments:
             text = The text (str) you want to clean
      stem_or_lem = Default is None; "stem" will perform stemming on each word; "lem" will lemmatize it.
                    Any other value leaves the words unchanged.
    add_stopwords = None by default; pass a list (or any iterable) of extra words to treat as stopwords.

    Returns:
    returns list of cleaned words.
    """
    # 1. lowercase everything
    text = text.lower()
    # 2. Decompose accented characters (NFKD) and drop whatever is not ASCII
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # 3. Remove special characters, then split on whitespace to get the word list
    words = re.sub(r"[^a-z0-9\s]", '', text).split()
    # 4. Stemming or Lemmatizing -- applied to the word list that is returned,
    #    so the requested transformation is actually reflected in the result
    if stem_or_lem == "stem":
        ps = nltk.stem.PorterStemmer()
        words = [ps.stem(word) for word in words]
        print('Stemming Performed')
    elif stem_or_lem == "lem":
        wnl = nltk.stem.WordNetLemmatizer()
        words = [wnl.lemmatize(word) for word in words]
        print('Lemmatizing Performed')
    else:
        print('No Stemming or Lemmatizing Performed')
    # 5. Remove StopWords (a set keeps the membership test cheap on large corpora)
    stop_set = set(nltk.corpus.stopwords.words('english'))
    stop_set.update(add_stopwords or [])

    return [word for word in words if word not in stop_set]

In [None]:
df.info()

In [None]:
# Change DTYPE
def change_dtype_str(df):
    '''
    ## Description:
    Custom function that fills missing values in this project's text columns and
    casts those columns to the pandas 'string' dtype. The DataFrame passed in is
    modified directly.
    ## Arguments:
    df = DataFrame
    ## Returns:
    df - DataFrame (the same object that was passed in)
    '''
    # column -> placeholder written over missing values before the cast
    placeholders = {
        'name': '',
        'address': '',
        'location': '',
        'cuisine': '',
        'facilities_and_services': 'NONE',
        'award': '',
        'data': '',
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder).astype('string')
    return df

In [None]:
# Capture the result (the function returns the same, already-updated frame) so the
# cell does not render the whole DataFrame as its output
df = change_dtype_str(df)

In [None]:
df.facilities_and_services[df.facilities_and_services == 'NONE']

In [None]:
df.info()

### Reviews

In [None]:
# Assign all, 1_star, 2_star, 3_star and bib_gourmand lists by passing the clean function with a join
# The award column is lowercased once create_features has run, so the award labels are
# matched case-insensitively; matching the original casing would leave every subset empty.
review_award_key = df['award'].str.lower()
all_reviews = clean(' '.join(df['data']), 'lem')
one_star_reviews = clean(' '.join(df.loc[review_award_key == '1 michelin star', 'data']), 'lem')
two_star_reviews = clean(' '.join(df.loc[review_award_key == '2 michelin stars', 'data']), 'lem')
three_star_reviews = clean(' '.join(df.loc[review_award_key == '3 michelin stars', 'data']), 'lem')
bib_gourmand_reviews = clean(' '.join(df.loc[review_award_key == 'bib gourmand', 'data']), 'lem')


In [None]:
#TODO Create Mean Word Count

In [None]:
# show word count
# one "<label>: <number of tokens>" line per review token list
print('Reviews Word Counts')
for label, tokens in (
    ('all_reviews', all_reviews),
    ('one_star_reviews', one_star_reviews),
    ('two_star_reviews', two_star_reviews),
    ('three_star_reviews', three_star_reviews),
    ('bib_gourmand_reviews', bib_gourmand_reviews),
):
    print(f'{label}: {len(tokens)}')

In [None]:
# Assign word counts to Frequency Variables
# Each token list becomes a Series; value_counts() then maps word -> number of occurrences
(freq_one_star_reviews,
 freq_two_star_reviews,
 freq_three_star_reviews,
 freq_bib_gourmand_reviews,
 freq_all_reviews) = (
    pd.Series(tokens).value_counts()
    for tokens in (one_star_reviews,
                   two_star_reviews,
                   three_star_reviews,
                   bib_gourmand_reviews,
                   all_reviews)
)

In [None]:
# Test it
freq_all_reviews

### Facilities

In [None]:
# Assign all, 1_star, 2_star, 3_star and bib_gourmand lists by passing the clean function with a join
# Facilities are stored as one comma-separated string with no space after each comma;
# clean() deletes punctuation, so commas are turned into spaces first to keep the last
# word of one facility from fusing with the first word of the next.
facility_text = df['facilities_and_services'].str.replace(',', ' ', regex=False)
# The award column is lowercased once create_features has run, so match case-insensitively.
facility_award_key = df['award'].str.lower()
all_facilities = clean(' '.join(facility_text), 'lem')
one_star_facilities = clean(' '.join(facility_text[facility_award_key == '1 michelin star']), 'lem')
two_star_facilities = clean(' '.join(facility_text[facility_award_key == '2 michelin stars']), 'lem')
three_star_facilities = clean(' '.join(facility_text[facility_award_key == '3 michelin stars']), 'lem')
bib_gourmand_facilities = clean(' '.join(facility_text[facility_award_key == 'bib gourmand']), 'lem')


In [None]:
# show word count
# one "<label>: <number of tokens>" line per facility token list
print('Facility Word Counts')
for label, tokens in (
    ('all_facilities', all_facilities),
    ('one_star_facilities', one_star_facilities),
    ('two_star_facilities', two_star_facilities),
    ('three_star_facilities', three_star_facilities),
    ('bib_gourmand_facilities', bib_gourmand_facilities),
):
    print(f'{label}: {len(tokens)}')

In [None]:
# Assign word counts to Frequency Variables
# Each token list becomes a Series; value_counts() then maps word -> number of occurrences
(freq_one_star_facilities,
 freq_two_star_facilities,
 freq_three_star_facilities,
 freq_bib_gourmand_facilities,
 freq_all_facilities) = (
    pd.Series(tokens).value_counts()
    for tokens in (one_star_facilities,
                   two_star_facilities,
                   three_star_facilities,
                   bib_gourmand_facilities,
                   all_facilities)
)

In [None]:
# Test it
freq_all_facilities

In [None]:
# Create Frequency DataFrame
# Column label -> per-group word frequency Series; dict order fixes the column order.
frequency_by_group = {
    'all_facilities': freq_all_facilities,
    '1_star_facilities': freq_one_star_facilities,
    '2_star_facilities': freq_two_star_facilities,
    '3_star_facilities': freq_three_star_facilities,
    'bib_gourmand_facilities': freq_bib_gourmand_facilities,
    'all_reviews': freq_all_reviews,
    '1_star_reviews': freq_one_star_reviews,
    '2_star_reviews': freq_two_star_reviews,
    '3_star_reviews': freq_three_star_reviews,
    'bib_gourmand_reviews': freq_bib_gourmand_reviews,
}
# Align every Series on the union of words; a word absent from a group counts as 0
word_counts = pd.concat(list(frequency_by_group.values()), axis=1).fillna(0).astype(int)
word_counts.columns = list(frequency_by_group.keys())
word_counts