In [4]:
#load needed libraries
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords # import stopwords list
#nltk.download('stopwords')
#retrive cleaned dataset
%store -r df
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,description_len,points_range,price_range,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,20.0,Sicily & Sardinia,Etna,,Kerin O’Keefe,#NAME?,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,172,"[85,90)","[20,40)",2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,#NAME?,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,227,"[85,90)","[0,20)",2011
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,186,"[85,90)","[0,20)",2013
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,199,"[85,90)","[0,20)",2013
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,249,"[85,90)","[60,100)",2012


In [5]:
from nltk.corpus import stopwords # import stopwords list

#Convert the stopwords to a set to fasten the process
stops = set(stopwords.words("english"))

def description_to_words (description):
    '''
    Function to convert a description to a string of words.
    The input is a single string (a raw description).
    The output is a single string (a preprocessed description).
    '''
    
    # 1. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", description)
        
    # 2. Convert to lower case & split into individual words
    words = letters_only.lower().split()
    
    # 3. Remove the stop words
    meaningful_words = [word for word in words if not word in stops]
    
    return(meaningful_words)

df["description_clean"] = df["description"].apply(description_to_words)
df["description_clean"].head()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


0    [aromas, include, tropical, fruit, broom, brim...
1    [ripe, fruity, wine, smooth, still, structured...
2    [tart, snappy, flavors, lime, flesh, rind, dom...
3    [pineapple, rind, lemon, pith, orange, blossom...
4    [much, like, regular, bottling, comes, across,...
Name: description_clean, dtype: object

In [6]:
#Stemming
from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer("english")

def stemming (text):
    '''
    Function performing stemming on the words.
    Input is a list of strings.
    The output is a list of stemmed words.
    '''
    stem_words = [ss.stem(word) for word in text]
    return(stem_words)

df["description_stem"] = df["description_clean"].apply(stemming)

In [7]:
#Lemmatizing
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

def lemmatizing (text):
    '''
    Function performing lemmatizing on the words.
    Input is a list of strings.
    The output is a list of lemmatized words.
    '''
    lemmatized = []
    
    for word, tag in pos_tag(text):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized.append(lem.lemmatize(word, pos))
        
    return(lemmatized)

df["description_lem"] = df["description_clean"].apply(lemmatizing)

In [8]:
def get_binary (points):
    '''
    Function to get binary outcome.
    Input is number of points wine has.
    Output is 1 if wine is better than average, 0 if average or worse than average
    '''
    
    if points > df.points.mean():
        return(1)
    elif points <= df.points.mean():
        return(0)
    
df["better_than_avg"] = df["points"].apply(get_binary)

In [25]:
df.country[df.country == "US"].count()/df.country.count()
df.better_than_avg[df.better_than_avg == 1].count()/df.better_than_avg.count()

0.47279200853845643

In [9]:
df.select_dtypes(include=['object']).columns

Index(['country', 'description', 'designation', 'province', 'region_1',
       'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety',
       'winery', 'points_range', 'price_range', 'description_clean',
       'description_stem', 'description_lem'],
      dtype='object')

In [10]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
   
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df

df_to_feed = df[["country", "variety", "price_range", "winery", "vintage", "description_len"]]
df_cat = create_dummy_df(df_to_feed, ["country", "variety", "price_range", "winery"], dummy_na = False)
df_cat.shape

(119928, 17493)

In [13]:
#for some reason create_dummy_df didn't create a dummy column for winery - dropping winery
df_cat.head()

Unnamed: 0,vintage,description_len,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,country_Canada,country_Chile,...,winery_Écluse,winery_Élevée Winegrowers,winery_Éric & Jöel Durand,winery_Ïl Macchione,winery_Ñandú,winery_Órale,winery_Öko,winery_Ökonomierat Rebholz,winery_àMaurice,winery_Štoka
0,2013,172,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2011,227,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2013,186,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2013,199,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2012,249,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Prepare data for CountVectoriser
description_list = df["description_lem"].tolist()
cv_input = [" ".join(sublist) for sublist in description_list]

# Initialize the "CountVectorizer" object , sklearn bag of words tool.
vectorizer = CountVectorizer(analyzer = "word", 
                            ngram_range = (1,2),
                            max_features = 2000)

description_vectorized = vectorizer.fit_transform(cv_input)

In [15]:
from sklearn.model_selection import train_test_split

y = np.array(df.better_than_avg)
X = sp.sparse.hstack([description_vectorized, df_cat.values.astype(float)],format = 'csr')
features_names = vectorizer.get_feature_names() + df_cat.columns.tolist()

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 40)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

#Instantiate a LinearRegression model with normalized data
lm_model = LinearRegression(normalize=True)
    
#Fit your model to the training data
lm_model.fit(X_train, y_train)
        
#Predict the response for the training data and the test data
y_test_preds = lm_model.predict(X_test) 
y_train_preds = lm_model.predict(X_train)
    
#Obtain an rsquared value for both the training and test data
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)
    
print(test_score, train_score) 

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 500, random_state = 40)

# Train the model on training data
rf.fit(X_train, y_train)

#Predict the response for the training data and the test data
y_test_preds = rf.predict(X_test) 
y_train_preds = rf.predict(X_train)

NameError: name 'r2_score' is not defined

In [27]:
# calculate the absolute errors
errors = abs(y_test_preds - y_test)

# print out the mean absolute error
print("Mean Absolute Error: ", round(np.mean(errors),2))

Mean Absolute Error:  0.16


In [30]:
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, confusion_matrix, roc_auc_score
f1 = f1_score(y_test, y_test_preds)
accuracy = accuracy_score(y_test, y_test_preds)
precision = precision_score(y_test, y_test_preds)
recall = recall_score(y_test, y_test_preds)
confusion = confusion_matrix(y_test, y_test_preds)
auc = roc_auc_score(y_test, y_test_preds)

print("The f1 score is " + str(f1))
print("The accuracy is " + str(accuracy))
print("The precision is " + str(precision))
print("The recall is " + str(recall))
print("Area under the curve " + str(auc))
print(confusion)

The f1 score is 0.8296836982968371
The accuracy is 0.8412380761790408
The precision is 0.8369306287446763
The recall is 0.8225611919120256
Area under the curve 0.8401847314126756
[[13628  2259]
 [ 2501 11594]]


In [36]:
# Get numerical feature importances
importances = list(rf.feature_importances_)

# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(features_names, importances)]

# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)

# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: description_len      Importance: 0.0555
Variable: price_range_[40,60)  Importance: 0.0229
Variable: price_range_[60,100) Importance: 0.0151
Variable: vintage              Importance: 0.0103
Variable: rich                 Importance: 0.007
Variable: price_range_[100,500) Importance: 0.0063
Variable: vineyard             Importance: 0.0058
Variable: year                 Importance: 0.0053
Variable: age                  Importance: 0.0052
Variable: long                 Importance: 0.0052
Variable: price_range_[20,40)  Importance: 0.005
Variable: black                Importance: 0.0049
Variable: simple               Importance: 0.0048
Variable: wine                 Importance: 0.0048
Variable: complex              Importance: 0.0046
Variable: drink                Importance: 0.0042
Variable: balance              Importance: 0.004
Variable: elegant              Importance: 0.004
Variable: flavor               Importance: 0.0039
Variable: fruit                Importance: 0.0039
Var

Variable: juiciness            Importance: 0.0001
Variable: juicy red            Importance: 0.0001
Variable: juicy wine           Importance: 0.0001
Variable: juniper              Importance: 0.0001
Variable: key                  Importance: 0.0001
Variable: kind                 Importance: 0.0001
Variable: kiss                 Importance: 0.0001
Variable: knit                 Importance: 0.0001
Variable: la                   Importance: 0.0001
Variable: laden                Importance: 0.0001
Variable: lamb                 Importance: 0.0001
Variable: lasting              Importance: 0.0001
Variable: lead way             Importance: 0.0001
Variable: leesy                Importance: 0.0001
Variable: lemon flavor         Importance: 0.0001
Variable: lemon peel           Importance: 0.0001
Variable: lemongrass           Importance: 0.0001
Variable: licorice flavor      Importance: 0.0001
Variable: lie                  Importance: 0.0001
Variable: life                 Importance: 0.0001


Variable: winery_Alzinger      Importance: 0.0
Variable: winery_Além          Importance: 0.0
Variable: winery_AmByth        Importance: 0.0
Variable: winery_Amaboko       Importance: 0.0
Variable: winery_Amador Cellars Importance: 0.0
Variable: winery_Amador Foothill Winery Importance: 0.0
Variable: winery_Amador Garcia Importance: 0.0
Variable: winery_Amadores      Importance: 0.0
Variable: winery_Amalaya       Importance: 0.0
Variable: winery_Amalia        Importance: 0.0
Variable: winery_Amalie Robert Importance: 0.0
Variable: winery_Amantis       Importance: 0.0
Variable: winery_Amapola Creek Importance: 0.0
Variable: winery_Amaral        Importance: 0.0
Variable: winery_Amarano       Importance: 0.0
Variable: winery_Amaro Vineyard Importance: 0.0
Variable: winery_Amastuola     Importance: 0.0
Variable: winery_Amathia       Importance: 0.0
Variable: winery_Amatore       Importance: 0.0
Variable: winery_Amavi         Importance: 0.0
Variable: winery_Amayna        Importance: 0.0
Va

Variable: winery_Cadaretta     Importance: 0.0
Variable: winery_Cade          Importance: 0.0
Variable: winery_Cadence       Importance: 0.0
Variable: winery_Cadus         Importance: 0.0
Variable: winery_Caernarvon Cellars Importance: 0.0
Variable: winery_Cafaggio      Importance: 0.0
Variable: winery_Café du Midi  Importance: 0.0
Variable: winery_Cagliero      Importance: 0.0
Variable: winery_Caiarossa     Importance: 0.0
Variable: winery_Cain          Importance: 0.0
Variable: winery_Cairdean Estate Importance: 0.0
Variable: winery_Cairdeas      Importance: 0.0
Variable: winery_Cairnbrae     Importance: 0.0
Variable: winery_Caitec        Importance: 0.0
Variable: winery_Cakebread     Importance: 0.0
Variable: winery_CalNaturale   Importance: 0.0
Variable: winery_Cala de' Poeti Importance: 0.0
Variable: winery_Calabretta    Importance: 0.0
Variable: winery_Calafia       Importance: 0.0
Variable: winery_Calama        Importance: 0.0
Variable: winery_Calamares     Importance: 0.0
Varia

Variable: winery_Château Haut Maurin Importance: 0.0
Variable: winery_Château Haut Mazerolles Importance: 0.0
Variable: winery_Château Haut Millet Importance: 0.0
Variable: winery_Château Haut Mondain Importance: 0.0
Variable: winery_Château Haut Mouleyre Importance: 0.0
Variable: winery_Château Haut Nadeau Importance: 0.0
Variable: winery_Château Haut Nouchet Importance: 0.0
Variable: winery_Château Haut Pasquet Importance: 0.0
Variable: winery_Château Haut Peyrefaure Importance: 0.0
Variable: winery_Château Haut Peyruguet Importance: 0.0
Variable: winery_Château Haut Pingat Importance: 0.0
Variable: winery_Château Haut Pougnan Importance: 0.0
Variable: winery_Château Haut Prieur Importance: 0.0
Variable: winery_Château Haut Seguin Importance: 0.0
Variable: winery_Château Haut Selve Importance: 0.0
Variable: winery_Château Haut Sociondo Importance: 0.0
Variable: winery_Château Haut Tour de Coutelin Importance: 0.0
Variable: winery_Château Haut Vieux Chêne Importance: 0.0
Variable: win

Variable: winery_Conte Collalto Importance: 0.0
Variable: winery_Conte Ferdinando Guicciardini Importance: 0.0
Variable: winery_Conte Leopardi Importance: 0.0
Variable: winery_Conte d'Attimis-Maniago Importance: 0.0
Variable: winery_Conterno Fantino Importance: 0.0
Variable: winery_Conti Costanti Importance: 0.0
Variable: winery_Conti Formentini Importance: 0.0
Variable: winery_Conti Leopardi di San Leopardo Importance: 0.0
Variable: winery_Conti Zecca   Importance: 0.0
Variable: winery_Conti di Buscareto Importance: 0.0
Variable: winery_Conti di Sn Bonifacio Importance: 0.0
Variable: winery_Contini       Importance: 0.0
Variable: winery_Contino       Importance: 0.0
Variable: winery_Continuum     Importance: 0.0
Variable: winery_Contour       Importance: 0.0
Variable: winery_Contrada Michele Importance: 0.0
Variable: winery_Contrada Salandra Importance: 0.0
Variable: winery_Contrada Santo Spirito di Passopisciaro Importance: 0.0
Variable: winery_Contrade di Taurasi Importance: 0.0
Var

Variable: winery_Elio Altare   Importance: 0.0
Variable: winery_Elio Filippino Importance: 0.0
Variable: winery_Elio Grasso   Importance: 0.0
Variable: winery_Elio Perrone  Importance: 0.0
Variable: winery_Elios         Importance: 0.0
Variable: winery_Elisabetta    Importance: 0.0
Variable: winery_Eliseo Silva  Importance: 0.0
Variable: winery_Elizabeth Chambers Importance: 0.0
Variable: winery_Elizabeth Rose Importance: 0.0
Variable: winery_Elk Cove      Importance: 0.0
Variable: winery_Elk Creek Vineyards Importance: 0.0
Variable: winery_Elkhorn Peak  Importance: 0.0
Variable: winery_Elkhorn Ridge Importance: 0.0
Variable: winery_Elki          Importance: 0.0
Variable: winery_Ellanelle     Importance: 0.0
Variable: winery_Ellena        Importance: 0.0
Variable: winery_Ellman Family Importance: 0.0
Variable: winery_Elqui Wines   Importance: 0.0
Variable: winery_Elsom Cellars Importance: 0.0
Variable: winery_Elvio Cogno   Importance: 0.0
Variable: winery_Elyse         Importance: 0.0


Variable: winery_Jamesport     Importance: 0.0
Variable: winery_Jamie Slone Wines Importance: 0.0
Variable: winery_Jamieson Canyon Importance: 0.0
Variable: winery_Jamieson Ranch Importance: 0.0
Variable: winery_Jana          Importance: 0.0
Variable: winery_Jankara       Importance: 0.0
Variable: winery_Jansz         Importance: 0.0
Variable: winery_Janzen        Importance: 0.0
Variable: winery_Jardin        Importance: 0.0
Variable: winery_Jardín de Lúculo Importance: 0.0
Variable: winery_Jarvis        Importance: 0.0
Variable: winery_Jasci & Marchesani Importance: 0.0
Variable: winery_Jason-Stephens Importance: 0.0
Variable: winery_Jasper        Importance: 0.0
Variable: winery_Jasper Hill   Importance: 0.0
Variable: winery_Jasper Sisco  Importance: 0.0
Variable: winery_Jaspi         Importance: 0.0
Variable: winery_Jaume Serra   Importance: 0.0
Variable: winery_Javadi        Importance: 0.0
Variable: winery_Jawbone       Importance: 0.0
Variable: winery_Jax           Importance: 0

Variable: winery_Marius        Importance: 0.0
Variable: winery_Mark David    Importance: 0.0
Variable: winery_Mark Moretti  Importance: 0.0
Variable: winery_Mark Ryan     Importance: 0.0
Variable: winery_Mark West     Importance: 0.0
Variable: winery_Market Vineyards Importance: 0.0
Variable: winery_Markham       Importance: 0.0
Variable: winery_Marko         Importance: 0.0
Variable: winery_Markowitsch   Importance: 0.0
Variable: winery_Marktree      Importance: 0.0
Variable: winery_Markus Altenburger Importance: 0.0
Variable: winery_Markus Huber  Importance: 0.0
Variable: winery_Markus Molitor Importance: 0.0
Variable: winery_Markus Wine Co Importance: 0.0
Variable: winery_Marland       Importance: 0.0
Variable: winery_Marlargo      Importance: 0.0
Variable: winery_Marlborough Estate Reserve Importance: 0.0
Variable: winery_Marojallia    Importance: 0.0
Variable: winery_Maroon        Importance: 0.0
Variable: winery_Marotti Campi Importance: 0.0
Variable: winery_Marquee       Import

Variable: winery_Pio Cesare    Importance: 0.0
Variable: winery_Pio Wines     Importance: 0.0
Variable: winery_Piocho        Importance: 0.0
Variable: winery_Pioiero       Importance: 0.0
Variable: winery_Piollot Père et Fils Importance: 0.0
Variable: winery_Piombaia Rossi-Cantini Importance: 0.0
Variable: winery_Piot-Sévillano Importance: 0.0
Variable: winery_Piper Sonoma  Importance: 0.0
Variable: winery_Piper-Heidsieck Importance: 0.0
Variable: winery_Pipers Brook Vineyard Estate Importance: 0.0
Variable: winery_Piping Shrike Importance: 0.0
Variable: winery_Pippin Hill Farm Importance: 0.0
Variable: winery_Piquentum     Importance: 0.0
Variable: winery_Pircas Negras Importance: 0.0
Variable: winery_Piro          Importance: 0.0
Variable: winery_Piro Piro Piccolo Importance: 0.0
Variable: winery_Pirouette     Importance: 0.0
Variable: winery_Pirro Varone  Importance: 0.0
Variable: winery_Pisa Range    Importance: 0.0
Variable: winery_Pisano        Importance: 0.0
Variable: winery_Pi

Variable: winery_Stella Fino   Importance: 0.0
Variable: winery_Stella Mia    Importance: 0.0
Variable: winery_Stella Monsi  Importance: 0.0
Variable: winery_Stella Rosa   Importance: 0.0
Variable: winery_Stellar Organics Importance: 0.0
Variable: winery_Stellareese   Importance: 0.0
Variable: winery_Stellenbosch Hills Importance: 0.0
Variable: winery_Stellenbosch Vineyards Importance: 0.0
Variable: winery_Stellenryck   Importance: 0.0
Variable: winery_Stellina      Importance: 0.0
Variable: winery_Steltzner     Importance: 0.0
Variable: winery_Stemilt Creek Importance: 0.0
Variable: winery_Stemmari      Importance: 0.0
Variable: winery_Stemmler      Importance: 0.0
Variable: winery_Steorra       Importance: 0.0
Variable: winery_Stephan Ehlen Importance: 0.0
Variable: winery_Stephane Tissot Importance: 0.0
Variable: winery_Stephanie     Importance: 0.0
Variable: winery_Stephen & Walker Importance: 0.0
Variable: winery_Stephen Ross  Importance: 0.0
Variable: winery_Stephen Vincent Impor

Variable: winery_William Church Importance: 0.0
Variable: winery_William Cole  Importance: 0.0
Variable: winery_William Fèvre Importance: 0.0
Variable: winery_William Grassie Importance: 0.0
Variable: winery_William Harrison Importance: 0.0
Variable: winery_William Hatcher Importance: 0.0
Variable: winery_William Hill Estate Importance: 0.0
Variable: winery_William James Importance: 0.0
Variable: winery_William Knuttel Importance: 0.0
Variable: winery_William Marie Importance: 0.0
Variable: winery_William Murdoch Importance: 0.0
Variable: winery_Williams & Heim Importance: 0.0
Variable: winery_Williams & Humbert Importance: 0.0
Variable: winery_Williamson Vineyard Importance: 0.0
Variable: winery_Williamson Wines Importance: 0.0
Variable: winery_Willm         Importance: 0.0
Variable: winery_Willow Creek  Importance: 0.0
Variable: winery_Willow Crest  Importance: 0.0
Variable: winery_WillowBrook Cellars Importance: 0.0
Variable: winery_Willowbrook   Importance: 0.0
Variable: winery_Wil