In [2]:
#load needed libraries
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords # import stopwords list

#retrive cleaned dataset
%store -r df
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,description_len,points_range,price_range,vintage
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,20.0,Sicily & Sardinia,Etna,,Kerin O’Keefe,#NAME?,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,172,"[85,90)","[20,40)",2013
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,#NAME?,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,227,"[85,90)","[0,20)",2011
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,186,"[85,90)","[0,20)",2013
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,199,"[85,90)","[0,20)",2013
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,249,"[85,90)","[60,100)",2012


In [15]:
from nltk.corpus import stopwords # import stopwords list

#Convert the stopwords to a set to fasten the process
stops = set(stopwords.words("english"))

def description_to_words (description):
    '''
    Function to convert a description to a string of words.
    The input is a single string (a raw description).
    The output is a single string (a preprocessed description).
    '''
    
    # 1. Remove HTML
    review_text = BeautifulSoup(description).get_text()
    
    # 2. Remove non-letters
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
        
    # 3. Convert to lower case & split into individual words
    words = letters_only.lower().split()
    
    # 4. Remove the stop words
    meaningful_words = [word for word in words if not word in stops]
    
    return(meaningful_words)

df["description_clean"] = df["description"].apply(description_to_words)
df["description_clean"].head()

0    [aromas, include, tropical, fruit, broom, brim...
1    [ripe, fruity, wine, smooth, still, structured...
2    [tart, snappy, flavors, lime, flesh, rind, dom...
3    [pineapple, rind, lemon, pith, orange, blossom...
4    [much, like, regular, bottling, comes, across,...
Name: description_clean, dtype: object

In [17]:
#Stemming
from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer("english")

def stemming (text):
    '''
    Function performing stemming on the words.
    Input is a list of strings.
    The output is a list of stemmed words.
    '''
    stem_words = [ss.stem(word) for word in text]
    return(stem_words)

df["description_stem"] = df["description_clean"].apply(stemming)

In [23]:
#Lemmatizing
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()

def lemmatizing (text):
    '''
    Function performing lemmatizing on the words.
    Input is a list of strings.
    The output is a list of lemmatized words.
    '''
    lemmatized = []
    
    for word, tag in pos_tag(text):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized.append(lem.lemmatize(word, pos))
        
    return(lemmatized)

df["description_lem"] = df["description_clean"].apply(lemmatizing)

In [26]:
def get_binary (points):
    '''
    Function to get binary outcome.
    Input is number of points wine has.
    Output is 1 if wine is better than average, 0 if average or worse than average
    '''
    
    if points > df.points.mean():
        return(1)
    elif points <= df.points.mean():
        return(0)
    
df["better_than_avg"] = df["points"].apply(get_binary)

In [21]:
df.select_dtypes(include=['object']).columns

Index(['country', 'description', 'designation', 'province', 'region_1',
       'region_2', 'taster_name', 'taster_twitter_handle', 'title', 'variety',
       'winery', 'points_range', 'price_range', 'description_clean',
       'description_stem'],
      dtype='object')

In [43]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not
    
    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating 
    '''
   
    for col in  cat_cols:
        try:
            # for each cat add dummy var, drop original column
            df = pd.concat([df.drop(col, axis=1), pd.get_dummies(df[col], prefix=col, prefix_sep='_', drop_first=True, dummy_na=dummy_na)], axis=1)
        except:
            continue
    return df

df_to_feed = df[["country", "variety", "price_range", "winery", "vintage", "description_len"]]
df_cat = create_dummy_df(df_to_feed, ["country", "variety", "price_range", "winery"], dummy_na = False)
df_cat.shape

(119928, 750)

In [52]:
#for some reason create_dummy_df didn't create a dummy column for winery - dropping winery
df_cat = df_cat.drop(["winery"], axis=1)

In [35]:
from sklearn.feature_extraction.text import CountVectorizer

# Prepare data for CountVectoriser
description_list = df["description_lem"].tolist()
cv_input = [" ".join(sublist) for sublist in description_list]

# Initialize the "CountVectorizer" object , sklearn bag of words tool.
vectorizer = CountVectorizer(analyzer = "word", 
                            ngram_range = (1,2),
                            max_features = 2000)

description_vectorized = vectorizer.fit_transform(cv_input)

['abound',
 'abundant',
 'acacia',
 'accent',
 'accessible',
 'accompany',
 'acid',
 'acidic',
 'acidity',
 'acidity balance',
 'acidity crisp',
 'acidity cut',
 'acidity drink',
 'acidity end',
 'acidity finish',
 'acidity firm',
 'acidity flavor',
 'acidity give',
 'acidity keep',
 'acidity make',
 'acidity provide',
 'acidity well',
 'acidity wine',
 'acre',
 'across',
 'across palate',
 'add',
 'add complexity',
 'addition',
 'additional',
 'aftertaste',
 'aftertaste drink',
 'age',
 'age drink',
 'age month',
 'age potential',
 'age well',
 'age wine',
 'age year',
 'aggressive',
 'air',
 'alcohol',
 'allow',
 'allspice',
 'allure',
 'almond',
 'almost',
 'almost sweet',
 'along',
 'alongside',
 'alongside firm',
 'already',
 'also',
 'also show',
 'although',
 'always',
 'american',
 'among',
 'amount',
 'ample',
 'ample acidity',
 'animal',
 'anise',
 'another',
 'another year',
 'ap',
 'ap ritif',
 'appeal',
 'appear',
 'appellation',
 'appetizing',
 'apple',
 'apple citrus',
 

In [53]:
from sklearn.model_selection import train_test_split

y = np.array(df.better_than_avg)
X = sp.sparse.hstack([description_vectorized, df_cat.values.astype(float)],format = 'csr')
features_names = vectorizer.get_feature_names() + df_cat.columns.tolist()

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 40)

In [56]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

#Instantiate a LinearRegression model with normalized data
lm_model = LinearRegression(normalize=True)
    
#Fit your model to the training data
lm_model.fit(X_train, y_train)
        
#Predict the response for the training data and the test data
y_test_preds = lm_model.predict(X_test) 
y_train_preds = lm_model.predict(X_train)
    
#Obtain an rsquared value for both the training and test data
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)
    
print(test_score, train_score) 

0.5128444066297537 0.5387928010936389


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 1000, random_state = 40)

# Train the model on training data
rf.fit(X_train, y_train)

#Predict the response for the training data and the test data
y_test_preds = rf.predict(X_test) 
y_train_preds = rf.predict(X_train)

#Obtain an rsquared value for both the training and test data
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print(test_score, train_score)

In [13]:
#can we predict if wine will be better or worse than the average based on 
#the grape, the vintage and the description length?

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

X = df.drop(["description", "winery", "designation", "points", "province", 
             "region_1", "region_2", "title", "description_processed"], axis = 1)
y = df['points']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42)

In [11]:
df.drop(["description", "winery" "designation", "points", "province", "region_1", "region_2", "title", "description_processed"], axis = 1)
df.drop(["winery"], axis = 1)

Unnamed: 0,description,designation,points,price,province,region_1,region_2,title,description_len,vintage,...,variety_Çalkarası,variety_Žilavka,"points_range_[85,90)","points_range_[90,95)","points_range_[95,100]","price_range_[100,500)","price_range_[20,40)","price_range_[40,60)",price_range_[500+],"price_range_[60,100)"
0,This tremendous 100% varietal wine hails from ...,Martha's Vineyard,96,235.0,California,Napa Valley,Napa,0,355,2013,...,0,0,0,0,1,1,0,0,0,0
1,"Ripe aromas of fig, blackberry and cassis are ...",Carodorum Selección Especial Reserva,96,110.0,Northern Spain,Toro,,0,318,2013,...,0,0,0,0,1,1,0,0,0,0
2,Mac Watson honors the memory of a wine once ma...,Special Selected Late Harvest,96,90.0,California,Knights Valley,Sonoma,0,280,2013,...,0,0,0,0,1,0,0,0,0,1
3,"This spent 20 months in 30% new French oak, an...",Reserve,96,65.0,Oregon,Willamette Valley,Willamette Valley,0,386,2013,...,0,0,0,0,1,0,0,0,0,1
4,"This is the top wine from La Bégude, named aft...",La Brûlade,95,66.0,Provence,Bandol,,0,376,2013,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93071,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,227,2013,...,0,0,0,1,0,0,1,0,0,0
93072,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Citation 2004 Pinot Noir (Oregon),233,2004,...,0,0,0,1,0,0,0,0,0,1
93073,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,225,2013,...,0,0,0,1,0,0,1,0,0,0
93074,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),216,2012,...,0,0,0,1,0,0,1,0,0,0


In [14]:
#Instantiate a LinearRegression model with normalized data
lm_model = LinearRegression(normalize=True)
    
#Fit your model to the training data
lm_model.fit(X_train, y_train)
        
#Predict the response for the training data and the test data
y_test_preds = lm_model.predict(X_test) 
y_train_preds = lm_model.predict(X_train)
    
#Obtain an rsquared value for both the training and test data
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)
    
print(test_score, train_score) 


-1.0965497393212993e+26 0.8635667439855326
