# --- CLASSIFICATION USING ONLY WINE DESCRIPTIONS ---

In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter

# NLP Libraries
import re
import nltk
from nltk.stem import WordNetLemmatizer

# One-Hot Encoding
from nltk.corpus import wordnet as wn
from string import punctuation as punc

# Document to Vector Embedding 
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import Phrases
from gensim.models import Doc2Vec
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

# Saving to file
import pickle

# Miscellaneous Functions
from sklearn import utils
from sklearn.metrics import accuracy_score, f1_score

  from pandas import Panel


In [2]:
from decision_tree import *
from helper_functions import *
from pruning import *
from random_forest import *

In [3]:
# Functions to save/load objects to/from file
def save_obj(obj, name):
    """Pickle ``obj`` to ``'obj/' + name + '.pkl'`` with the highest pickle protocol.

    The target directory is created on demand, so the first save in a fresh
    working directory no longer fails with ``FileNotFoundError``.

    Args:
        obj: Any picklable Python object.
        name: File stem (without extension) used to build the path under ``obj/``.
    """
    import os  # local import so this definition does not depend on another cell

    path = 'obj/'+ name + '.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object previously pickled at ``'obj/' + name + '.pkl'``.

    Args:
        name: File stem (without extension) of the pickle under ``obj/``.

    Returns:
        Whatever object the pickle file contains.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that this
    # notebook (or another trusted source) wrote.
    path = 'obj/' + name + '.pkl'
    with open(path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
class Switch(dict):
    """Dict whose keys are containers; lookup is by membership, not equality.

    ``switch[item]`` returns the value stored under the first key (in insertion
    order) for which ``item in key`` is true, and raises ``KeyError(item)`` when
    no key contains the item.
    """

    def __getitem__(self, item):
        for interval, value in self.items():
            if item in interval:
                # First containing interval wins.
                return value
        # No interval matched.
        raise KeyError(item)
        
# Score bands: 80-87 -> 'Average', 88-93 -> 'Good', 94-100 -> 'Excellent'.
# Upper bounds of range() are exclusive.
switch = Switch([
    (range(80, 88), 'Average'),
    (range(88, 94), 'Good'),
    (range(94, 101), 'Excellent'),
])

def switch_value(i):
    """Return the quality band for score ``i``; raises KeyError outside 80-100."""
    band = switch[i]
    return band

In [149]:
# Load the reviews, discard the unused metadata columns, put the remaining
# columns in a fixed order (text and target last), and drop every row that has
# a missing value in any kept column.
df = (
    pd.read_csv('winemag-data_130k.csv', index_col=0)
    .drop(columns=['designation','region_1','region_2','taster_name','taster_twitter_handle'])
    .reindex(columns=['country', 'price', 'province', 'title', 'variety', 'winery', 'description', 'points'])
    .dropna()
)
df.head(3)

Unnamed: 0,country,price,province,title,variety,winery,description,points
1,Portugal,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"This is ripe and fruity, a wine that is smooth...",87
2,US,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"Tart and snappy, the flavors of lime flesh and...",87
3,US,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"Pineapple rind, lemon pith and orange blossom ...",87


In [150]:
# Replace the numeric score with its quality band ('Average'/'Good'/'Excellent').
# Once mapped, the column holds strings, for which switch_value raises KeyError,
# so re-running this cell used to fail. The dtype guard makes the cell safe to
# re-run: the mapping is applied only while the column is still numeric.
if df['points'].dtype.kind in 'iuf':
    df['points'] = df['points'].apply(switch_value)
print(df.points.value_counts(), '\n\n', len(df), ' rows')

Good         67023
Average      48232
Excellent     5660
Name: points, dtype: int64 

 120915  rows


# --- BEGINNING OF DOC2VEC ---

## Text Processing

In [151]:
# NOTE: plain assignment binds a second name to the same DataFrame object (no
# copy is made), so column assignments made through filtered_df in later cells
# are also visible through df.
filtered_df = df
filtered_df.head(3)

Unnamed: 0,country,price,province,title,variety,winery,description,points
1,Portugal,15.0,Douro,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"This is ripe and fruity, a wine that is smooth...",Average
2,US,14.0,Oregon,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"Tart and snappy, the flavors of lime flesh and...",Average
3,US,13.0,Michigan,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"Pineapple rind, lemon pith and orange blossom ...",Average


In [152]:
from nltk.tokenize import WordPunctTokenizer
# Module-level instances shared by tokenize_text() and process_text() below.
tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()

def normalize_text(text):
    """Lowercase ``text`` and isolate punctuation with surrounding spaces.

    Steps: lowercase; turn ``<br />`` tags and ``', '`` separators into single
    spaces; pad each of ``. " , ( ) ! ? ; :`` with one space on both sides.

    Args:
        text: Input string.

    Returns:
        The normalized string.
    """
    # HTML line breaks and comma-space separators become plain spaces.
    cleaned = text.lower().replace('<br />',' ').replace(', ',' ')
    # Pad the remaining punctuation marks so each one stands alone.
    padded = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", cleaned)
    return padded.lower()

def tokenize_text(text):
    """Split ``text`` into lowercase tokens, dropping tokens shorter than 2 chars.

    Uses the module-level ``tokenizer`` instance.

    Args:
        text: Raw description string.

    Returns:
        List of lowercase tokens in order of appearance.
    """
    # Tokenize the text exactly once. The previous version looped over
    # nltk.sent_tokenize(text) but tokenized the full `text` inside that loop,
    # so a description with N sentences emitted every token N times.
    return [word.lower() for word in tokenizer.tokenize(text) if len(word) >= 2]

def process_text(text):
    """Tokenize, normalize and lemmatize ``text`` into a list of unique tokens.

    Each token from ``tokenize_text`` is passed through ``normalize_text`` and
    lemmatized as an adjective (``pos='a'``). Purely numeric tokens are dropped
    and only the first occurrence of each token is kept, preserving order.

    Args:
        text: Raw description string.

    Returns:
        Ordered list of distinct processed tokens.
    """
    seen = set()        # O(1) membership test instead of scanning token_list
    token_list = []
    for token_orig in tokenize_text(text):
        token = lemmatizer.lemmatize(normalize_text(token_orig), pos='a') #pos = 'a' --> adjective
        if token.isdigit() or token in seen:
            continue
        seen.add(token)
        token_list.append(token)
    return token_list

In [153]:
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that's", "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'will', 'yet', 'therefore']

# Sanity check on the hand-maintained list: print any stop word listed more than once.
counts = Counter(STOPWORDS)
for i in counts:
    if counts[i] > 1:
        print(i, counts[i])

print(filtered_df['description'])
# Build the lookup set once. The previous lambda evaluated set(STOPWORDS) inside
# the comprehension condition, i.e. once per word of every description.
stopword_set = set(STOPWORDS)
# Replace each raw description string with its processed token list, then drop stop words.
filtered_df['description']=filtered_df['description'].transform(process_text)
filtered_df['description']=filtered_df['description'].transform(lambda x: [word for word in x if word not in stopword_set])

1         This is ripe and fruity, a wine that is smooth...
2         Tart and snappy, the flavors of lime flesh and...
3         Pineapple rind, lemon pith and orange blossom ...
4         Much like the regular bottling from 2012, this...
5         Blackberry and raspberry aromas show a typical...
                                ...                        
129966    Notes of honeysuckle and cantaloupe sweeten th...
129967    Citation is given as much as a decade of bottl...
129968    Well-drained gravel soil gives this wine its c...
129969    A dry style of Pinot Gris, this is crisp with ...
129970    Big, rich and off-dry, this is powered by inte...
Name: description, Length: 120915, dtype: object


In [154]:
# .iloc is positional: this shows the row at position 104450, not the row whose index label is 104450.
print(filtered_df['description'].iloc[104450]) #Example of processed description

['faint', 'scent', 'red', 'cherry', 'greets', 'nose', 'also', 'makes', 'brief', 'appearance', 'fresh', 'clean', 'rather', 'light', 'body', 'slightly', 'rustic', 'honest', 'straightforward', 'drink', 'soon']


## Identify Useful Bigrams or Trigrams

In [155]:
# Learn frequent word pairs from the token lists, then learn a second model on
# the bigram-transformed corpus so that three-word phrases can be detected too.
# Detected phrases are joined with a single space.
bigram = Phrases(filtered_df['description'], min_count=3, delimiter=b' ')
trigram = Phrases(bigram[filtered_df['description']], min_count=3, delimiter=b' ')

# Iterating the column yields the stored list objects themselves, so appending
# to `description` updates the DataFrame in place.
for description in filtered_df['description']:
    # Both phrase lists are computed before anything is appended, so the
    # detectors only ever see the original unigram tokens.
    bigrams_list = [b for b in bigram[description] if b.count(' ') == 1]
    trigrams_list = [t for t in trigram[bigram[description]] if t.count(' ') == 2]

    # Append bigrams first, then trigrams, skipping phrases already present.
    for sequence in bigrams_list + trigrams_list:
        if sequence not in description:
            description.append(sequence)

In [156]:
# Same positional row as the earlier example, now with the detected phrases appended at the end.
print(filtered_df['description'].iloc[104450]) #Example of processed description with bi(tri)grams added

['faint', 'scent', 'red', 'cherry', 'greets', 'nose', 'also', 'makes', 'brief', 'appearance', 'fresh', 'clean', 'rather', 'light', 'body', 'slightly', 'rustic', 'honest', 'straightforward', 'drink', 'soon', 'greets nose', 'brief appearance', 'drink soon']


In [157]:
# Seed Python's global RNG before splitting.
# NOTE(review): train_test_split is not imported in the visible cells; presumably
# it comes from one of the star-imported project modules - confirm that it draws
# from the `random` module, otherwise this seed has no effect on the split.
random.seed(42)
train, test = train_test_split(filtered_df, test_size=0.3)


def _tag_row(r):
    """Wrap one row's token list as a TaggedDocument tagged with its points label."""
    return TaggedDocument(words=r['description'], tags=[r.points])


train_tagged = train.apply(_tag_row, axis=1)
test_tagged = test.apply(_tag_row, axis=1)

In [158]:
# Preview both tagged Series (pandas truncates the printed output).
print(train_tagged, '\n\n', test_tagged)

1         ([ripe, fruity, wine, smooth, still, structure...
3         ([pineapple, rind, lemon, pith, orange, blosso...
4         ([much, like, regular, bottling, comes, across...
6         ([bright, informal, red, opens, aromas, candie...
7         ([dry, restrained, wine, offers, spice, profus...
                                ...                        
129966    ([notes, honeysuckle, cantaloupe, sweeten, del...
129967    ([citation, given, much, decade, bottle, age, ...
129968    ([well, drained, gravel, soil, gives, wine, cr...
129969    ([dry, style, pinot, gris, crisp, some, acidit...
129970    ([big, rich, dry, powered, intense, spiciness,...
Length: 84641, dtype: object 

 90112     ([riverbend, vineyard, estate, fielding, hills...
15686     ([sweet, rounded, very, satisfying, wine, pack...
3512      ([opens, enticing, scents, white, spring, flow...
104450    ([winery, best, barrel, bottling, chardonnay, ...
38883     ([nicely, put, together, totally, generic, sen...
        

In [159]:
# Label-based lookup: returns the document whose index label is 104450. This is a
# different row from the `.iloc[104450]` (positional) examples printed earlier.
test_tagged[104450] #Example of tagged description

TaggedDocument(words=['winery', 'best', 'barrel', 'bottling', 'chardonnay', 'excels', 'fronts', 'showing', 'lemon', 'peels', 'browned', 'butter', 'creamy', 'lily', 'pan', 'seared', 'apples', 'light', 'crisp', 'savory', 'nose', 'mouthfeel', 'rich', 'decorated', 'zesty', 'line', 'salty', 'acidity', 'cuts', 'curd', 'flavors', 'altogether', 'mouthwatering', 'perfect', 'seaside', 'meal', 'lemon peels', 'browned butter', 'pan seared', 'acidity cuts', 'pan seared apples'], tags=['Excellent'])

## Building Doc2Vec Vocabulary

In [160]:
import multiprocessing
# Use one worker thread per CPU reported by the OS.
cores = multiprocessing.cpu_count()

# Build a Distributed Bag of Words model
# dm=0 selects DBOW; 300-dimensional vectors; hs=1 with negative=0; min_count=1
# keeps every token; sample=0 (see the gensim docs for exact parameter semantics).
model_dbow = Doc2Vec(dm=0, vector_size=300, window=6, alpha=0.1, negative=0, hs=1, min_count=1, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

# 30 outer iterations x epochs=10 per train() call = 300 passes over the training
# documents, reshuffled before each call. After every call the learning rate is
# lowered by 0.002 and min_alpha is pinned to it, so alpha is 0.04 when the loop ends.
# NOTE(review): calling train() repeatedly with a hand-managed alpha is, as far as
# I know, discouraged by the gensim documentation in favour of a single train()
# call with `epochs` set - confirm before reusing this pattern.
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=10)
    model_dbow.alpha-=0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2568758.17it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2495852.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2819397.58it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2424766.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2173322.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2233329.67it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 84641/84641 [00:00<00:00, 2233371.82it/s]
100%|███████████████████████████████████████████████████████████████████████

In [161]:
# Persist the processed frame, the trained model and both tagged Series under obj/.
# The model's file name contains 'BI(TRI)BIGRAMS' (not 'BI(TRI)GRAMS' like the
# others); the load cell uses the same spelling, so change both together or not at all.
save_obj(filtered_df, 'filtered_df_BI(TRI)GRAMS_ADDED')
save_obj(model_dbow, 'Doc2VecModel_dim300_BI(TRI)BIGRAMS_ADDED')
save_obj(train_tagged, 'train_tagged_BI(TRI)GRAMS_ADDED')
save_obj(test_tagged, 'test_tagged_BI(TRI)GRAMS_ADDED')

## Quick Read of Objects

In [4]:
# Reload the pickled artifacts so the cells below can run without repeating the
# text processing and Doc2Vec training. File names must match the save cell exactly.
filtered_df = load_obj('filtered_df_BI(TRI)GRAMS_ADDED')
model_dbow = load_obj('Doc2VecModel_dim300_BI(TRI)BIGRAMS_ADDED')
train_tagged = load_obj('train_tagged_BI(TRI)GRAMS_ADDED')
test_tagged = load_obj('test_tagged_BI(TRI)GRAMS_ADDED')

In [5]:
def build_vector(model, tagged_docs):
    """Infer a vector for every tagged document and pair it with its label.

    Args:
        model: Object exposing ``infer_vector(words, steps=...)``.
        tagged_docs: Object whose ``.values`` is a sequence of documents, each
            with ``.words`` and ``.tags`` attributes.

    Returns:
        ``(targets, regressors)``: two tuples of equal length holding the first
        tag of every document and the corresponding inferred vector.
    """
    pairs = []
    for doc in tagged_docs.values:
        label = doc.tags[0]
        vector = model.infer_vector(doc.words, steps=20)
        pairs.append((label, vector))
    # Transpose the list of (label, vector) pairs into two parallel tuples.
    targets, regressors = zip(*pairs)
    return targets, regressors

In [6]:
# Infer document vectors (X) and collect labels (y) for both splits.
y_train, X_train = build_vector(model_dbow, train_tagged)
y_test, X_test = build_vector(model_dbow, test_tagged)

# build_vector returns tuples; convert each one to a NumPy array.
y_test = np.asarray(y_test)
y_train = np.asarray(y_train)
X_test = np.asarray(X_test)
X_train = np.asarray(X_train)

In [7]:
# Eyeball the inferred feature matrices (NumPy abbreviates large arrays when printing).
print('Training features:\n', X_train, '\n\nTesting features:\n', X_test)

Training features:
 [[-0.18063109 -0.09168802  0.05263169 ... -0.53402215  0.09448549
   0.8700811 ]
 [ 0.01169075 -0.17287932 -0.5011579  ...  0.68881804 -0.24040882
  -0.16493876]
 [ 0.29427034 -0.42070842  0.8691367  ... -0.60931605  0.43962625
  -0.4169078 ]
 ...
 [-0.3071592   1.0296812  -0.5473156  ...  0.19299982 -0.15332144
   0.36165422]
 [ 0.8991018  -0.55979055  0.24753436 ... -0.3449637  -0.19222908
  -0.3669207 ]
 [ 0.44568652 -0.28273207 -0.6249972  ... -0.07680467  0.27394786
   0.29635316]] 

Testing features:
 [[ 0.19210516 -0.18950948  0.39476228 ...  0.42772606  0.26536658
   0.1264891 ]
 [ 0.07479941 -0.3259339  -0.19050281 ...  0.13997473  0.10704373
   0.5419218 ]
 [-1.0062314   0.7475895  -0.5165186  ... -0.47335842  0.11224923
   0.18396878]
 ...
 [-0.15938194  0.59519124 -0.4622667  ... -0.36517146 -0.35467997
   0.17865384]
 [ 0.6753725   0.28228733  0.70300704 ...  0.20463043  0.32598522
   0.24419516]
 [ 0.07302083 -0.09450662 -0.11047351 ...  0.22620979 -0.

In [8]:
# Confirm all four objects are ndarrays and that feature/label row counts agree per split.
print(type(X_test), type(y_test), type(X_train), type(y_train))
print('Training features shape: ', X_train.shape, '\nTesting features shape: ', X_test.shape)
print('Training labels shape: ', y_train.shape, '\nTesting labels shape: ', y_test.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Training features shape:  (84641, 300) 
Testing features shape:  (36274, 300)
Training labels shape:  (84641,) 
Testing labels shape:  (36274,)


In [9]:
# Labels are the quality-band strings taken from each document's first tag.
print('Training labels:\n', y_train, '\n\nTesting labels:\n', y_test)

Training labels:
 ['Average' 'Average' 'Average' ... 'Good' 'Good' 'Good'] 

Testing labels:
 ['Good' 'Good' 'Good' ... 'Average' 'Average' 'Good']


# --- END OF DOC2VEC ---

# --- CLASSIFICATION ---

## Create Training & Testing Sets

In [10]:
# (features, labels-as-column) pairs for each split.
tr = X_train, y_train.reshape(X_train.shape[0], 1)
ts = X_test, y_test.reshape(X_test.shape[0], 1)

# Column names: '0', '1', ..., one per dimension of the document vectors
# (X_train.shape[1] of them), followed by 'label'.
columns = [str(dim) for dim in range(X_train.shape[1])]
columns.append('label')

# Build each frame from the float matrix and add the labels as a separate column.
# The previous np.hstack((features, labels)) forced a single dtype on the whole
# array, which turned every feature value into a string because the labels are
# strings.
# NOTE(review): the tree helpers are not visible here; string-typed features were
# presumably treated as categorical - confirm results after this change.
TRAIN_df = pd.DataFrame(tr[0], columns=columns[:-1])
TRAIN_df['label'] = tr[1].ravel()
TEST_df = pd.DataFrame(ts[0], columns=columns[:-1])
TEST_df['label'] = ts[1].ravel()

# Kept for any cell that still uses the raw arrays: mixed float/str object arrays
# with the label in the last column.
TRAIN = TRAIN_df.values
TEST = TEST_df.values

## Cross Validation

In [11]:
# Preview the training frame: one column per vector dimension plus 'label'.
TRAIN_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,label
0,-0.18063109,-0.09168802,0.052631687,-0.043893747,-0.34156874,0.259521,-0.45582443,-0.12794867,-0.034824472,0.07642121,...,0.6598472,0.056489363,0.50765467,-0.0833376,0.121211246,-0.105902575,-0.53402215,0.09448549,0.8700811,Average
1,0.0116907535,-0.17287932,-0.5011579,0.13859753,0.14515564,-0.35263827,0.07524348,-0.36220583,-0.40334728,-0.063262105,...,-0.562625,-0.74703544,0.51362324,-0.33625287,-0.15952665,0.17526025,0.68881804,-0.24040882,-0.16493876,Average
2,0.29427034,-0.42070842,0.8691367,-0.10208137,0.4608331,0.4718892,0.369057,0.31387204,0.8788484,-0.20462957,...,-0.39358893,-0.6458911,-0.38713202,0.31280017,-0.927837,-0.585237,-0.60931605,0.43962625,-0.4169078,Average
3,0.035120413,0.1886508,0.09495963,0.005063279,-0.3539823,0.12063132,-0.057408195,0.105671465,0.10154272,-0.14498043,...,-0.110404305,0.75273836,-0.057315934,0.48987964,0.22876121,-0.07744057,-0.4477602,0.11079756,-0.18533617,Average
4,-0.47292754,-0.05432705,0.40259084,0.2410157,0.098396845,0.20488013,-0.58484787,0.02871327,-0.5234438,-0.015211833,...,0.4958639,-0.32117414,-0.17192987,0.020348648,-0.36109847,-0.28892007,0.059827104,0.16394907,0.11296543,Average


In [13]:
num_folds=5
CV_SAMPLE_SIZE = 1000   # number of training rows used for cross validation
CV_SEED = 42            # fixed seed so the same rows are drawn on every run

# Draw CV_SAMPLE_SIZE random rows from the training frame and divide them into
# num_folds disjoint sets of equal length.
# A dedicated random.Random instance is used so the draw is reproducible even
# when this cell runs right after reloading the pickled objects (where the global
# RNG was never seeded), and so it does not disturb the global RNG state.
indices = TRAIN_df.index.tolist()
cv_rng = random.Random(CV_SEED)
cv_dataset_indices = cv_rng.sample(indices, k=min(CV_SAMPLE_SIZE, len(indices)))
cv_dataset = TRAIN_df.loc[cv_dataset_indices]
cv_dataset = np.asarray(cv_dataset)
# cross_validation_fold_split comes from the project helpers (not visible here).
cv = cross_validation_fold_split(dataset=cv_dataset, folds = num_folds)
cv = np.asarray(cv)

## Single Decision Tree (PRUNED)

In [None]:
# Train on num_folds-1 of the sets and evaluate on the remaining one, giving
# every set a turn as the evaluation set. Each tree is post-pruned against a
# second fold before it is scored.
print("TREE CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=i)
    cv_tree = decision_tree_algorithm(df=df_cv_train, ml_task='classification', max_depth=10)

    # Pick a validation fold different from the test fold.
    # NOTE(review): fold j is presumably still part of df_cv_train (everything
    # except fold i), so pruning is validated on data the tree was trained on -
    # confirm against cross_validation_train_test_split.
    j = random_exclude(excluded=i, range_list=range(num_folds))
    _, df_val = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=j)
    cv_tree_pruned = post_pruning(cv_tree, df_cv_train, df_val, ml_task="classification")

    # Score the pruned tree: previously the unpruned cv_tree was evaluated and
    # cv_tree_pruned was never used, so the reported accuracy ignored pruning.
    accuracy = calculate_accuracy(df_cv_test, cv_tree_pruned)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy

cv_accuracy = total_accuracy/num_folds
print('Cross Validation Accuracy: ', cv_accuracy)

## Random Forest (PRUNED)

In [None]:
# Same fold rotation as for the single tree, but each fold trains a random forest
# (50 trees, bootstrap samples of 375 rows, max depth 10).
# NOTE(review): n_features=9999 exceeds the number of feature columns; presumably
# the helper treats that as "use all features" - confirm in random_forest.py.
print("RANDOM FOREST CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=i)
    cv_forest = multiprocessor_random_forest_algorithm(train_df=df_cv_train, n_trees=50, n_bootstrap=375, n_features=9999, 
                                                tree_max_depth=10, ml_task='classification')
    accuracy, predictions = calculate_forest_accuracy(df_cv_test, cv_forest)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy

# Fixed typo: `numf_folds` was undefined and raised NameError after all folds had run.
cv_accuracy = total_accuracy/num_folds
print('\n\nCross Validation Accuracy: ', cv_accuracy)

RANDOM FOREST CROSS VALIDATION RESULTS
Cross Validation Split Shape:  (5, 200, 301)
