In [1]:
# Import data and main configurations

import logging

logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s.%(msecs)03d - %(levelname)s : %(message)s',
        datefmt='%H:%M:%S')
logger = logging.getLogger(__name__)
logger.info('Initializing logger.. complete!')


from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import requests
import json
import re
from geopy.geocoders import Nominatim


import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.util import bigrams, trigrams, ngrams
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob

from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score  
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import classification_report

import lightgbm as lgb
import gc
from sklearn import preprocessing

import lightgbm as lgb


12:35:12.116 - INFO : Initializing logger.. complete!


In [2]:
def load_data(sample = False, n = 1000):
    """
    Loads the wine dataset
    It can be the full dataset or just a sample
    """
    
    path = 'data/winemag-data-130k-v2.csv'

    df = pd.read_csv(path).drop(['Unnamed: 0'], axis=1)
    
    if sample:
        df = df[:n]

    logger.info(f'File uploaded with {df.shape[0]} rows and {df.shape[1]} columns')

    numerical_columns = df.select_dtypes(include = np.number).columns
    categorical_columns = df.columns.drop(numerical_columns)
    target = 'variety'

    logger.info(f'Numerical columns: {numerical_columns}')
    logger.info(f'Categorical columns: {categorical_columns}')
    
    return df


In [3]:
def clean_text_columns(df, cols, option = 'alphanumeric'):
    """
    Cleans text columns including lowering and regex
    with the following options:
    
    alphanumeric - keeps letters, numbers and white spaces.
    
    """
    
    if option == 'alphanumeric':
        regex = r'([^A-Za-z0-9 ]+)'

    
    logger.info('Cleaning {cols}...')
    for col in cols:
        df[col] = df[col].str.replace(regex, '').str.lower()
        
    return df

In [4]:
def drop_cols(df, cols):
    
    init = df.shape[0]  
    logger.info(f'Droping columns {cols}')
    
    df = df.drop(cols, axis=1)
    
    end = df.shape[0]
    logger.info(f'Dataframe columns reduced: {init} -> {end}')

    return df

In [5]:
# Select the number of classes to consider

def select_top_classes(df, target_col = 'target', n=40):
    top_varieties = n
    initial_rows = df.shape[0]

    top_classes = (df.target.value_counts().sort_values(ascending=False).head(top_varieties)).index
    df = df[df['target'].isin(top_classes)]

    logger.info(f'Reducing the dataset to highest {top_varieties} classes.')
    logger.info(f'Dataframe reduced from {initial_rows} rows to {df.shape[0]}.')
    
    return df

In [6]:
def propagate_missing_values(df, null_col, info_col):
    
    null_values_init = df[df[null_col].isnull()].shape[0]

    logger.info(f'Creating a map of {null_col} -> {info_col} and apply to the null values...')

    prop_map = df[[null_col, info_col]].sort_values(info_col, ascending=True, na_position='last').groupby(info_col)[null_col].first()
    df[null_col] = df[info_col].map(prop_map)


    null_values_final = df[df[null_col].isnull()].shape[0]
    logger.info(f'Map succeccefully applied! Went from {null_values_init} missing {null_col} to {null_values_final}.')
    
    return df

In [7]:
def copy_nulls_from_col(df, null_col, info_col):
   
    logger.info(f'Copy vaalues from {info_col} to {null_col}')

    nulls = df[df[null_col].isnull()].shape[0]
    df.loc[df[null_col].isnull(), null_col] = df[info_col]
    
    logger.info(f'Finished! {nulls} values were copied.')
    
    return df

In [8]:
def convert_nulls_to_unknown(df, col, value = 'unknown'):
    
    logger.info(f'Converting null values to {value}..')

    df.loc[df.col.isnull(), col] = value
    return df

In [9]:
def generate_buckets(df, bin_col = 'points', new_col = 'bucket', n_bins = 8):
    
    logger.info(f'Bucketing column {bin_col} in {n_bins} buckets to column {new_col}')
    
    df[new_col] = pd.qcut(df[bin_col], n_bins, labels=range(n_bins))
    
#     df[new_col] = df[new_col].astype('Int64')
    
    return df

In [10]:
def find_null_location_in_google(df, option = 'country', null_col = 'country', info_col = 'winery'):
    """
    This function utilizes the scaleserp API to query Google Knowledge Graphs
    
    Takes as input:
    
    null_col: the column with the null values
    info_col: the col to query google KG
    option: the location option we want to retrieve
    
    
    Obs: uncomment following 'complete' line to get the complete 
    list of missing values and comment the 'example' one

    """

    init = df[null_col].isnull().sum()
    logger.info(f'Finding {option} by query Google Knowledge Graphs for {info_col}. There are {init} values missing.')
        
    # complete
    # info_values = df[df[null_col].isnull()][info_col].unique()  
    
    # example
    info_values = ['Gotsa Family Wines', 'Barton & Guestier','Kakhetia Traditional Winemaking', 'Tsililis']           
    
    loc_map = {}

    for value in info_values:

        params = {
          'api_key': '802701F0BC0E4726914B6982E70A541A',
          'q': f'{value} {info_col}'
        }

        api_result = requests.get('https://api.scaleserp.com/search', params).json()

        try:

            local_map = api_result['knowledge_graph']['local_map']
            lat = local_map['gps_coordinates']['latitude']
            long = local_map['gps_coordinates']['longitude']

            geolocator = Nominatim(user_agent="app")
            location = geolocator.reverse(f"{lat},{long}", language = 'en')
            result = location.raw['address'][option]

            loc_map[value] = result

            print(f'{value}: {result}')


        except:
            print(f'{value}: {option} not found')
            pass

    logger.info(f'Applying map of fetched `{null_col}`...')   

    df.loc[df[null_col].isnull(), null_col] = df[info_col].map(loc_map)
    
    
    end = df[null_col].isnull().sum()
    logger.info(f'Map aplied. Null values went from {init} to {end}')
    
    return df

In [11]:
def get_number_seq_from_text_column(df, new_col, col, seq_len = 4):
    
    regex = str('\d'*seq_len)
    regex = f'({regex})'
    
    logger.info(f'Finding a sequence of {seq_len}  numbers {regex}in column {col}')    
    df[new_col] = df[col].str.extract(regex, expand=False) 
    
#     df[new_col] = df[col].astype('Int64')
  
    return df

In [12]:
def remove_outliers_with_treshold(df, col, threshold = 10):
    """
    1 Standard Deviation from the Mean: 68%
    2 Standard Deviations from the Mean: 95%
    3 Standard Deviations from the Mean: 99.7%
    
    """
    logger.info(f'Removing outliers from column {col} out of {threshold} std dev.')
   
    init = df.shape[0]
    
    data_mean = np.mean(df[col])
    data_std = np.std(df[col])
    cut_off = data_std * threshold
    upper = data_mean + cut_off
    lower = data_mean - cut_off
    lower = lower if lower>0 else 0
    
    df = df[(df[col] > lower) & (df[col] < upper)]
    
    end = df.shape[0]
    
    logger.info(f'Dataframe rows reduced {init} -> {end}')
    return df

    

In [13]:
def null_values_stat_impute(df, null_col, info_col, mode = np.median):
    """
    Impute the null values with the 
    median/mean of a column
    """

    logger.info(f'Imputing the {mode} of column {info_col} into null values of {null_col}')
    mapper = df.groupby(info_col).median()[null_col]
    df.loc[df[null_col].isnull(),null_col] = df[info_col].map(mapper)
    df[null_col] = df[null_col].astype('float64')
    
    return df

In [14]:
def fill_nulls_with_median(df, null_col, info_col):
    
    
    logger.info(f'Imputing the median of column {info_col} into null values of {null_col}')

    mapper = df.groupby(info_col).median()[null_col]
    df.loc[df[null_col].isnull(),null_col] = df[info_col].map(mapper)
    df[null_col] = df[null_col].astype('float64')
    
    return df

In [15]:
def correct_grape_names(df, col):
    
    logger.info(f'Correcting grapes names and matching foreign names...')
    def _correct_name(row):
        regexp = [r'shiraz', r'ugni blanc', r'cinsaut', r'carinyena', r'^ribolla$', r'palomino', r'turbiana', r'verdelho', r'viura', r'pinot bianco|weissburgunder', r'garganega|grecanico', r'moscatel', r'moscato', r'melon de bourgogne', r'trajadura|trincadeira', r'cannonau|garnacha', r'grauburgunder|pinot grigio', r'pinot noir|pinot nero', r'colorino', r'mataro|monastrell', r'mourv(\w+)']
        grapename = ['syrah', 'trebbiano', 'cinsault', 'carignan', 'ribolla gialla', 'palomino','verdicchio', 'verdejo','macabeo', 'pinot blanc', 'garganega', 'muscatel', 'muscat', 'muscadet', 'treixadura', 'grenache', 'pinot gris', 'pinot noir', 'lambrusco', 'mourvedre', 'mourvedre']
        f = row
        for exsearch, gname in zip(regexp, grapename):
            f = re.sub(exsearch, gname, f)
        return f


    name_pairs = [('spatburgunder', 'pinot noir'), ('garnacha', 'grenache'), ('pinot nero', 'pinot noir'),
                  ('alvarinho', 'albarino'), ('assyrtico', 'assyrtiko'), ('black muscat', 'muscat hamburg'),
                  ('kekfrankos', 'blaufrankisch'), ('garnacha blanca', 'grenache blanc'),
                  ('garnacha tintorera', 'alicante bouschet'), ('sangiovese grosso', 'sangiovese')
                 ]
    
    df[col] = df[col].apply(lambda row: _correct_name(row))
    
    for wrong, right in name_pairs:
        df[col] = df[col].replace(wrong, right) 
        
    return df


In [16]:
def create_ratio(df, col1, col2, col_name):
    """
    Creates a new column with the ratio of 2
    given numerical columns.
    """
        
    df.loc[(~df[col1].isnull()) & (~df[col2].isnull()), col_name] = round(df[col1]/df[col2],2)
    
    logger.info(f'Created ratio feature {col_name} ({col1} / {col2})')

    
    return df

In [17]:
def get_stopwords_list(df, cols, language = 'english', extra_words = []):
    """
    Creates a list with the stopwords of input language
    and additionally the categories of given columns.
    
    """
    
    logger.info('Creating a list for stopwords..')
    
    extra_words = ['.', ',', '`', '"', "'", '!', ';', 'wine', 'fruit', '%', 'flavour', 'aromas', 'palate']
    stop_words = stopwords.words(language)
    
    for col in cols:
        stop_words.append(df[col].unique())
    
    
    stop_words = stop_words + extra_words
    
    return stop_words


In [18]:
def process_small_text_column(df, col, stop_words, regex = '[^a-zA-Z]', wanted_tags = []):
    """
    Function to process small text columns, like descriptions.
    Processing includes:
    
    1. Parse with the input regex and lowercase
    2. Remove stopwords from input language and from input categorical columns
    3. Keep only words with input (grammar) tags
    4. Convert each word to its lemma
    5. Convert list of tokens back to string
    
    """
    
    logger.info(f'Processing text column {col}...')
    
    wanted_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR', 'VBN', 'VBP']
    lemmatizer=WordNetLemmatizer()


    extra_words = ['.', ',', '`', '"', "'", '!', ';', 'wine', 'fruit', '%', 'flavour', 'aromas', 'palate']
    stop_words = stopwords.words('english')
    stop_words = stop_words + extra_words + list(df.columns)

    def _process_text(text):
        tokens = word_tokenize(re.sub(regex, ' ',text).lower())
        tokens = [token for token in tokens if not token in stop_words]
        text = nltk.pos_tag(tokens)
        text = [lemmatizer.lemmatize(token[0]) for token in text if token[1] in wanted_tags]
        text = ' '.join(text)
        return text
    
    df[col] = df[col].apply(lambda x: _process_text(x))
    
    logger.info('Finished processing!')

    return df

In [19]:
def generate_word_cloud(df, col, top_n_words = 30):
    
    logger.info(f'Vectorizing `{col}` feature to Word Cloud..')

    vectorizer = CountVectorizer(ngram_range=(1,3), 
                                 max_features=500)
    
    X = vectorizer.fit_transform(df[col])
    matrix = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())
    matrix.index = df.index
    matrix.head()


    logger.info(f'Generating wordcloud...')
    

    wordcloud = WordCloud(
        background_color='black',
        max_words=top_n_words,
        max_font_size=40, 
        scale=5,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(matrix))

    fig = plt.figure(1, figsize=(12, 12))
    plt.axis('off')

    plt.imshow(wordcloud)
    plt.show()

    logger.info(f'Done!')
    
    return
    

In [20]:
def analyse_sentiment_from_text_col(df, col):
    """
    
    """

    logger.info(f'Calculating sentiment polarity and subjectivity of `{col}`...')

    df['sentiment_pol']= df[col].apply(lambda x: float(round(TextBlob(x).sentiment.polarity, 2)))
    df['sentiment_sub']= df[col].apply(lambda x: float(round(TextBlob(x).sentiment.subjectivity, 2)))
    
    return df

In [21]:
def get_tfidf_matrix_of_col(df, col, max_features=500, ngrams_range=(1,3)):
    """
    Takes a text column and returns a vector matrix
    with the most frequent terms.
    """
    
    logger.info(f'Vectorizing `{col}` feature with TFIDF...')

    vectorizer = TfidfVectorizer(ngram_range=ngrams_range, 
                                 max_df = 0.25, 
                                 min_df = 0.005, 
                                 max_features=max_features,
                                 norm='l2',
                                 smooth_idf=True) 

    X = vectorizer.fit_transform(df[col])
    
    matrix = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())
    matrix.index = df.index
    
    for col in matrix.columns:
        col = '_'.join(col)
    
    logger.info(f'Finish! Created sparse matrix with shape {matrix.shape}')
    
    return matrix

In [22]:
def target_encode_multiclass(X,y): #
    """
    X,y are pandas df and series
    
    """
    
    logger.info(f'Target encoding categorical features...')
    y=y.astype(str)   #convert to string to onehot encode
    enc=ce.OneHotEncoder().fit(y)
    y_onehot=enc.transform(y)
    class_names=y_onehot.columns  #names of onehot encoded columns
    
    X_obj=X.select_dtypes('object') #separate categorical columns
    X=X.select_dtypes(exclude='object') 
    for class_ in class_names:
      
        enc=ce.TargetEncoder()
        enc.fit(X_obj,y_onehot[class_]) #convert all categorical 
        temp=enc.transform(X_obj)       #columns for class_
        temp.columns=[str(x)+'_'+str(class_) for x in temp.columns]
        X=pd.concat([X,temp],axis=1)    #add to original dataset

    return X

In [191]:
def print_stats(preds, target, labels, sep='-', sep_len=80, fig_size=(10,8)):
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(sep*sep_len)
#     print('Classification report:')
#     print(metrics.classification_report(target, preds))
    
    return


In [24]:
#Data Cleaning
df = load_data(sample = False, n = 10000)
df = df.rename(columns={'variety': 'target'})
df = select_top_classes(df, 'target', 40)
df = propagate_missing_values(df, 'country', 'winery')
df = propagate_missing_values(df, 'province', 'winery')
df = copy_nulls_from_col(df, 'region_1', 'province')
# df = find_null_location_in_google(df, option = 'country', null_col = 'country', info_col = 'winery')
# df = convert_nulls_to_unknown(df, col, value = unknown)
# df = remove_outliers_with_treshold(df, price, threshold = 10)

#Feature Engineering
df = analyse_sentiment_from_text_col(df, 'description')
df = generate_buckets(df, bin_col = 'points' , new_col = 'points_bucket', n_bins = 8)
df = generate_buckets(df, bin_col = 'price', new_col = 'price_bucket', n_bins = 8)
df = get_number_seq_from_text_column(df, 'year' , 'title', seq_len = 4)
# df = null_values_stat_impute(df, price, points_bucked, mode = np.median)
df = correct_grape_names(df, 'target')

# new features
df = create_ratio(df, col1 = 'points', col2 = 'price', col_name = 'quality_price_ratio')
df.to_csv('silver')

12:35:15.966 - INFO : File uploaded with 129971 rows and 13 columns
12:35:15.970 - INFO : Numerical columns: Index(['points', 'price'], dtype='object')
12:35:15.971 - INFO : Categorical columns: Index(['country', 'description', 'designation', 'province', 'region_1',
       'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety',
       'winery'],
      dtype='object')
12:35:16.025 - INFO : Reducing the dataset to highest 40 classes.
12:35:16.026 - INFO : Dataframe reduced from 129971 rows to 111797.
12:35:16.036 - INFO : Creating a map of country -> winery and apply to the null values...
12:35:16.170 - INFO : Map succeccefully applied! Went from 40 missing country to 23.
12:35:16.175 - INFO : Creating a map of province -> winery and apply to the null values...
12:35:16.308 - INFO : Map succeccefully applied! Went from 40 missing province to 23.
12:35:16.309 - INFO : Copy vaalues from province to region_1
12:35:16.328 - INFO : Finished! 17681 values were copied.
12:35:16.

In [25]:
# description processing
df = clean_text_columns(df, ['target', 'winery', 'taster_name', 'region_1', 'province'])
stop_words = get_stopwords_list(df, cols = ['country', 'province', 'winery'], language = 'english', extra_words = [])
df = process_small_text_column(df, 'description', stop_words = stop_words)
matrix = get_tfidf_matrix_of_col(df, 'description', max_features=500)
df = pd.concat([df, matrix], axis=1)
# generate_word_cloud(df, 'target', top_n_words = 40)

In [25]:
df.to_csv('gold.csv')

12:36:27.916 - INFO : Cleaning {cols}...
12:36:28.490 - INFO : Creating a list for stopwords..
12:36:28.526 - INFO : Processing text column description...
12:39:31.422 - INFO : Finished processing!
12:39:31.423 - INFO : Vectorizing `description` feature with TFIDF...
12:39:43.628 - INFO : Finish! Created sparse matrix with shape (111797, 500)


In [167]:
df = pd.read_csv('gold.csv')
df = df.drop('Unnamed: 0', axis=1)

# deal with nulls, encodings, norms
df = df.drop(['description', 'designation', 'region_2', 'taster_twitter_handle', 'title', 'province', 'region_1', 'winery'], axis=1)

In [168]:
le = preprocessing.LabelEncoder()
ohe = preprocessing.OneHotEncoder(handle_unknown='ignore')

# le_cols = ['target']
# ohe_cols = df[['country', 'taster_name']]

# ohe_predictors = pd.get_dummies(ohe_cols)
# df = pd.concat([df, ohe_predictors], axis=1).drop(['country', 'taster_name'], axis=1)

In [169]:
# df = df.dropna()
df

Unnamed: 0,country,points,price,taster_name,target,points_bucket,price_bucket,year,quality_price_ratio,sentiment_pol,...,wine,winemaker,wood,woody,year.1,yellow,young,zest,zesty,zinfandel
0,Italy,87,,kerin okeefe,white blend,2,,2013.0,,0.13,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,Portugal,87,15.0,roger voss,portuguese red,2,1.0,2011.0,5.80,0.22,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,US,87,14.0,paul gregutt,pinot gris,2,1.0,2013.0,6.21,0.02,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,US,87,13.0,alexander peartree,riesling,2,0.0,2013.0,6.69,0.17,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,US,87,65.0,paul gregutt,pinot noir,2,7.0,2012.0,1.34,0.31,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111792,Germany,90,28.0,anna lee c iijima,riesling,4,4.0,2013.0,3.21,0.60,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
111793,US,90,75.0,paul gregutt,pinot noir,4,7.0,2004.0,1.20,0.00,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
111794,France,90,30.0,roger voss,gewrztraminer,4,4.0,2013.0,3.00,0.09,...,0.0,0.0,0.0,0.0,0.281992,0.0,0.0,0.0,0.0,0.0
111795,France,90,32.0,roger voss,pinot gris,4,4.0,2012.0,2.81,0.12,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [170]:
# df[numerical_columns] = pd.to_numeric(df[numerical_columns])

X = df.drop(['target'], axis=1)
y = le.fit_transform(df['target'])


categorical_columns = ['country', 'taster_name']
numerical_columns = [col for col in X.columns if col not in categorical_columns]


from collections import Counter

class_weight = dict(Counter(y))


for col in numerical_columns:
    X[col] = X[col].astype('float64')

for col in categorical_columns:
    X[col] = X[col].astype('category')

# for col in numerical_columns:
#     X[col] = preprocessing.scale(X[col]) 

In [233]:
# Define train, valid and test datasets


X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, random_state=42) #, stratify=y)

# X_train, X_test, y_train, y_test = train_test_split(X, y, 
#                                                     test_size=0.2, random_state=42)

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
#                                                   test_size=0.25, random_state=42) # 0.25 x 0.8 = 0.2

print(X_train.shape)
# del df; 
# gc.collect();


(89437, 510)


In [248]:
# Define model parameters
# https://www.kaggle.com/somang1418/tuning-hyperparameters-under-10-minutes-lgbm
# Use an optimizer instead of grid search

params = {
         'n_estimators': [400],
         'num_leaves': [20,50], 
         'metric': ['multi_error'],
         'num_class': [df.target.nunique],
         'boosting_type':  ['gbdt'], #otimized to gbdt
         'learning_rate': [0.03], # otimized to o.03
         'max_depth': [20,30,40], #reduces overfitting
         'bagging_fraction': [0.6], # speeds up training and reduces overfit. otimized to 0.6
         'feature_fraction': [0.7], # speeds up training and reduces overfit, otimized to 0.7
         'lambda_l1': [0, 0.1],
         'lambda_l2': [0, 0.1]
         
        }

lgbm = lgb.LGBMClassifier(

#         nfold = 3, 
#         stratified=True,
        seed = 42,
        objective = 'multiclass',
        is_unbalance = True,
        metric = 'multi_error'

        
#         class_weight = class_weight # this decreases the accuracy a lot... weird
)


In [249]:
clf = GridSearchCV(lgbm, params, n_jobs=-1, verbose=1, cv = 2, return_train_score=True) # refit = True

In [250]:
%%time
clf.fit(X_train, y_train, verbose = True);

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 200.3min finished


CPU times: user 1h 38min 20s, sys: 1min 37s, total: 1h 39min 58s
Wall time: 3h 26min 51s


GridSearchCV(cv=2, error_score=nan,
             estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      is_unbalance=True, learning_rate=0.1,
                                      max_depth=-1, metric='multi_error',
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=100,
                                      n_jobs=-1, num_leaves=31,
                                      objective='multiclass', random_sta...
                         'n_estimators': [400],
                         'num_class': [<bound method IndexOpsMixin.nunique of 0            white blend
1         portuguese red
2             pinot gris
3               riesling
4             pinot noir
               ...     

In [251]:
pd.DataFrame(clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bagging_fraction,param_boosting_type,param_feature_fraction,param_lambda_l1,param_lambda_l2,param_learning_rate,...,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,mean_train_score,std_train_score
0,2106.717119,4.607581,460.474902,1.848634,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.700552,0.700836,0.700694,0.000142,22,0.864574,0.863615,0.864094,0.000479
1,3522.537009,13.952864,591.010019,1.072611,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.711197,0.709893,0.710545,0.000652,12,0.971242,0.970057,0.97065,0.000592
2,2111.888338,3.980512,461.092896,1.91977,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.700552,0.700836,0.700694,0.000142,22,0.864574,0.863615,0.864094,0.000479
3,3898.801051,20.186629,593.120646,2.294687,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.712538,0.710318,0.711428,0.00111,5,0.976408,0.974776,0.975592,0.000816
4,2109.210137,5.787094,459.984924,1.588703,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.700552,0.700836,0.700694,0.000142,22,0.864574,0.863615,0.864094,0.000479
5,3994.937822,18.13892,595.878026,2.035827,0.6,gbdt,0.7,0.0,0.0,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.712561,0.710072,0.711316,0.001244,7,0.976676,0.975313,0.975994,0.000682
6,2045.638161,0.53536,454.171454,1.031184,0.6,gbdt,0.7,0.0,0.1,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.701223,0.701776,0.701499,0.000276,13,0.860682,0.859568,0.860125,0.000557
7,3204.138321,26.105205,583.438805,2.107711,0.6,gbdt,0.7,0.0,0.1,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.710906,0.710653,0.71078,0.000126,11,0.968782,0.967665,0.968223,0.000559
8,2079.39531,2.611996,455.744992,2.529174,0.6,gbdt,0.7,0.0,0.1,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.701223,0.701776,0.701499,0.000276,13,0.860682,0.859568,0.860125,0.000557
9,3745.898948,10.940469,604.215117,2.587434,0.6,gbdt,0.7,0.0,0.1,0.03,...,"{'bagging_fraction': 0.6, 'boosting_type': 'gb...",0.711532,0.710944,0.711238,0.000294,8,0.97444,0.972629,0.973534,0.000905


In [252]:
bst = clf.best_estimator_
y_pred = bst.predict(X_test)

In [253]:
print('Test')
print_stats(y_pred, y_test, X_test.columns, sep='-', sep_len=40, fig_size=(10,8))  

Test
Accuracy = 0.733
----------------------------------------


In [254]:
y_pred_1 = bst.predict(X_train)
print('train')
print_stats(y_pred_1, y_train, X_train.columns, sep='-', sep_len=40, fig_size=(10,8))  

train
Accuracy = 0.918
----------------------------------------


In [209]:
bst.get_params

<bound method LGBMModel.get_params of LGBMClassifier(bagging_fraction=0.6, boosting_type='gbdt', class_weight=None,
               colsample_bytree=1.0, feature_fraction=0.7,
               importance_type='split', is_unbalance=True, lambda_l1=0,
               lambda_l2=0, learning_rate=0.03, max_depth=20,
               metric='multi_error', min_child_samples=20,
               min_child_weight=0.001, min_split_gain=0.0, n_estimators=200,
               n_jobs=-1,
               num_class=<bound method In...n.nunique of 0            white blend
1         portuguese red
2             pinot gris
3               riesling
4             pinot noir
               ...      
111792          riesling
111793        pinot noir
111794     gewrztraminer
111795        pinot gris
111796     gewrztraminer
Name: target, Length: 111797, dtype: object>,
               num_leaves=50, objective='multiclass', random_state=None,
               reg_alpha=0.0, reg_lambda=0.0, seed=42, silent=True,
          

In [None]:
feat_imp = pd.Series(lgbm.feature_importances_, index=X.columns)
feat_imp.nlargest(30).plot(kind='barh', figsize=(8,10))

In [None]:
import shap
shap.initjs()

shap_values = shap.TreeExplainer(bst.booster_).shap_values(X_train)
shap.summary_plot(shap_values, X_train)

In [None]:
# Create the visualizer and draw the vectors
from yellowbrick.text import TSNEVisualizer


X_train = pd.get_dummies(X_train)
plt.figure(figsize = [15,9])
tsne = TSNEVisualizer()
tsne.fit(X_train, y_train)
tsne.poof()

In [60]:
aux = X.drop(['country', 'taster_name'], axis=1)

tsne = TSNEVisualizer()
tsne.fit(aux, y)
tsne.show()

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').