# --- CLASSIFYING WINE TYPES BASED ON WINEMAKER'S DESCRIPTIONS --- 

In [1]:
import pandas as pd
import numpy as np
import random
from collections import Counter

# NLP Libraries
import re
import nltk
from nltk.stem import WordNetLemmatizer

# One-Hot Encoding
from nltk.corpus import wordnet as wn
from string import punctuation as punc

# Document to Vector Embedding 
from gensim.models.doc2vec import TaggedDocument
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import Phrases
from gensim.models import Doc2Vec
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

# Saving to file
import pickle

# Miscellaneous Functions
from sklearn import utils
from sklearn.metrics import accuracy_score, f1_score

  from pandas import Panel


In [2]:
from decision_tree import *
from helper_functions import *
from pruning import *
from random_forest import *

In [3]:
# Functions to save/load objects to/from file
def save_obj(obj, name):
    with open('obj/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('obj/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
class Switch(dict):
    def __getitem__(self, item):
        for key in self.keys():                   # iterate over the intervals
            if item in key:                       # if the argument is part of that interval
                return super().__getitem__(key)   # return its associated value
        raise KeyError(item)                      # if not in any interval, raise KeyError

switch = Switch({
    "White Wines": 0,
    "Red Wines": 1
})


def switch_value(i):
    return switch[i]

In [6]:
df = pd.read_csv('./DATASETS/winemaker_data.csv', encoding='latin-1')
df = df.rename(columns={"Varietal_WineType_Name": "label", "Winemakers_Notes":"description"})

df.head()

Unnamed: 0,Name,abv,year,PriceRetail,Appellation_Region_Name,Varietal_Name,label,description
0,Milbrandt Traditions Merlot 2007,14.2,2007.0,14.99,Washington,Merlot,Red Wines,Our 2007 Traditions Merlot features grapes fro...
1,MacLaren Drouthy Neebors Syrah 2009,14.0,2009.0,35.0,California,Syrah/Shiraz,Red Wines,Deep Purple color. Layered aromatics: Black Ra...
2,Cliff Lede Poetry Stags Leap District Cabernet...,14.5,2001.0,125.0,California,Cabernet Sauvignon,Red Wines,Beautiful bottle-aged aromas are revealing the...
3,Tenuta di Biserno Campo di Sasso Insoglio del ...,14.5,2007.0,34.0,Italy,Other Red Blends,Red Wines,Insoglio del Cinghiale is the foundation wine ...
4,Gordon Brothers Cabernet Sauvignon 2010,13.8,2010.0,26.99,Washington,Cabernet Sauvignon,Red Wines,"Black cherry and cranberry, vanilla roasting o..."


In [7]:
df['label'] = df['label'].apply(switch_value)
print(df.label.value_counts(), '\n\n', len(df), ' rows')

1    1000
0    1000
Name: label, dtype: int64 

 2000  rows


# --- BEGINNING OF DOC2VEC ---

## Text Processing

In [8]:
filtered_df = df
filtered_df.head(3)

Unnamed: 0,Name,abv,year,PriceRetail,Appellation_Region_Name,Varietal_Name,label,description
0,Milbrandt Traditions Merlot 2007,14.2,2007.0,14.99,Washington,Merlot,1,Our 2007 Traditions Merlot features grapes fro...
1,MacLaren Drouthy Neebors Syrah 2009,14.0,2009.0,35.0,California,Syrah/Shiraz,1,Deep Purple color. Layered aromatics: Black Ra...
2,Cliff Lede Poetry Stags Leap District Cabernet...,14.5,2001.0,125.0,California,Cabernet Sauvignon,1,Beautiful bottle-aged aromas are revealing the...


In [9]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
lemmatizer = WordNetLemmatizer()

def normalize_text(text):
    norm_text = text.lower()
    #Replace and breaks with regular spaces
    norm_text = norm_text.replace('<br />',' ')
    norm_text = norm_text.replace(', ',' ')
    #Use regex to pad all punctuation with spaces on both sides
    norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
    norm_text = norm_text.lower()
    return norm_text

def tokenize_text(text):
    tokens = []
    for sentence in nltk.sent_tokenize(text):
        for word in tokenizer.tokenize(text): #nltk.word_tokenize(sentence):
            if len(word)<2:
                continue
            tokens.append(word.lower())
    return tokens

def process_text(text):
    token_list_orig = tokenize_text(text)
    token_list = []
    for token_orig in token_list_orig:
        token = lemmatizer.lemmatize(normalize_text(token_orig), pos='a') #pos = 'a' --> adjective
        if token.isdigit()==False and token not in token_list:
            token_list.append(token)
    return token_list

In [10]:
STOPWORDS = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that's", "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'will', 'yet', 'therefore']

# Dictionary of elements and their respective counts
counts = Counter(STOPWORDS)
# Print elements which have 2 or more instances.
for i in counts:
    if counts[i] > 1:
        print(i, counts[i])
        
print(filtered_df['description'])   
filtered_df['description']=filtered_df['description'].transform(process_text)
filtered_df['description']=filtered_df['description'].transform(lambda x: [word for word in x if word not in set(STOPWORDS)])

0       Our 2007 Traditions Merlot features grapes fro...
1       Deep Purple color. Layered aromatics: Black Ra...
2       Beautiful bottle-aged aromas are revealing the...
3       Insoglio del Cinghiale is the foundation wine ...
4       Black cherry and cranberry, vanilla roasting o...
                              ...                        
1995    Our 2009 Estate Chardonnay is refreshing and c...
1996    Delle Venezie, Italy The Tre Venezie region is...
1997    Grown on the hillside above the Santa Maria Be...
1998    Framed by a light lemony acidity and vibrant m...
1999    Because of the popularity and demand of our Dr...
Name: description, Length: 2000, dtype: object


In [11]:
index = random.randint(0, len(filtered_df))
print(filtered_df['description'].iloc[index]) #Example of processed description

['some', 'best', 'chardonnays', 'washington', 'state', 'grown', 'slightly', 'cool', 'growing', 'conditions', 'yakima', 'valley', 'northern', 'latitudes', 'columbia', 'schmitt', 'vineyard', 'provides', 'nice', 'tropical', 'fruit', 'evergreen', 'latitude', 'river', 'contributes', 'crisp', 'acidity', 'minerality', 'elegantly', 'expressive', 'chardonnay', 'offers', 'enticing', 'mix', 'flint', 'asian', 'pear', 'vibrant', 'structure', 'wine', 'finely', 'balanced', 'richness', 'adding', 'complexity', 'clean', 'lingering', 'finish']


## Identify Useful Bigrams or Trigrams

In [12]:
bigram = Phrases(filtered_df['description'], min_count=3, delimiter=b' ')
trigram = Phrases(bigram[filtered_df['description']], min_count=3, delimiter=b' ')

for i in range(len(filtered_df['description'])):
    description = filtered_df['description'].iloc[i]
    bigrams_list = [b for b in bigram[description] if b.count(' ') == 1]
    trigrams_list = [t for t in trigram[bigram[description]] if t.count(' ') == 2]
    
    # Add identified bigrams to the tokenized description
    if len(bigrams_list) != 0:
        #print(bigrams_list)
        for sequence in bigrams_list:
            if sequence not in description:
                filtered_df['description'].iloc[i].append(sequence)
    '''
    if len(trigrams_list) !=0:
        #print(trigrams_list)
        for sequence in trigrams_list:
             if sequence not in description:
                filtered_df['description'].iloc[i].append(sequence)
    '''

In [13]:
print(filtered_df['description'].iloc[index]) #Example of processed description with bi(tri)grams added

['some', 'best', 'chardonnays', 'washington', 'state', 'grown', 'slightly', 'cool', 'growing', 'conditions', 'yakima', 'valley', 'northern', 'latitudes', 'columbia', 'schmitt', 'vineyard', 'provides', 'nice', 'tropical', 'fruit', 'evergreen', 'latitude', 'river', 'contributes', 'crisp', 'acidity', 'minerality', 'elegantly', 'expressive', 'chardonnay', 'offers', 'enticing', 'mix', 'flint', 'asian', 'pear', 'vibrant', 'structure', 'wine', 'finely', 'balanced', 'richness', 'adding', 'complexity', 'clean', 'lingering', 'finish', 'washington state', 'cool growing', 'tropical fruit', 'crisp acidity', 'chardonnay offers', 'asian pear', 'adding complexity', 'lingering finish']


In [15]:
random.seed(42)
train, test = train_test_split(filtered_df, test_size=0.3)

train_tagged = train.apply(
    lambda r: TaggedDocument(words=r['description'], tags=[r.label]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=r['description'], tags=[r.label]), axis=1)

In [16]:
print(train_tagged, '\n\n', test_tagged)

0       ([traditions, merlot, features, grapes, differ...
2       ([beautiful, bottle, aged, aromas, revealing, ...
5       ([bright, violet, red, color, extends, primary...
8       ([crozes, hermitage, le, clos, cuvee, deep, ru...
9       ([dark, night, absolutely, explosive, nose, ri...
                              ...                        
1993    ([guenoc, lake, county, sauvignon, blanc, made...
1995    ([estate, chardonnay, refreshing, crisp, flora...
1996    ([delle, venezie, italy, tre, region, made, th...
1997    ([grown, hillside, santa, maria, bench, barbar...
1999    ([popularity, demand, dry, riesling, sweet, th...
Length: 1400, dtype: object 

 1309    ([ghost, pines, chardonnay, possesses, express...
228     ([almira, los, dos, bright, cherry, red, viole...
51      ([vineyard, sources, old, vine, zinfandel, blu...
1518    ([bright, clear, appearance, sauvignon, blanc,...
563     ([illustration, offers, intense, aromatics, bl...
                              ...        

In [19]:
train_tagged[index] #Example of tagged description

TaggedDocument(words=['dark', 'purple', 'color', 'generous', 'aromas', 'currants', 'blackberry', 'licorice', 'chocolate', 'black', 'olive', 'espresso', 'vanilla', 'wine', 'round', 'full', 'palate', 'forward', 'fruit', 'great', 'structure', 'balanced', 'acid', 'tannins', 'decant', 'youth', 'enjoy', 'next', 'years', 'blend', 'cabernet', 'sauvignon', 'malbec', 'franc', 'petit', 'verdot', 'purple color', 'black olive', 'next years', 'blend cabernet', 'malbec franc', 'petit verdot'], tags=[1])

## Building Doc2Vec Vocabulary

In [16]:
import multiprocessing
cores = multiprocessing.cpu_count()

# Build a Distributed Bag of Words model
model_dbow = Doc2Vec(dm=0, vector_size=50, window=5, alpha=0.1, negative=0, hs=1, min_count=1, sample=0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha-=0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|█████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 853120.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 1404118.99it/s]
100%|████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 1406136.40it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 1404118.99it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████| 1400/1400 [00:00<00:00, 1401438.09it/s]
100%|███████████████████████████████████████████

In [21]:
save_obj(filtered_df, 'Doc2Vec_Set2_filtered_df_BIGRAMS_ADDED')
save_obj(model_dbow, 'Doc2Vec_Set2_dim50_BIBIGRAMS_ADDED')
save_obj(train_tagged, 'Doc2Vec_Set2_train_tagged_BIGRAMS_ADDED')
save_obj(test_tagged, 'Doc2Vec_Set2_test_tagged_BIGRAMS_ADDED')

## Quick Read of Objects

In [None]:
filtered_df = load_obj('Doc2Vec_Set2_filtered_df_BIGRAMS_ADDED')
model_dbow = load_obj('Doc2Vec_Set2_dim50_BIBIGRAMS_ADDED')
train_tagged = load_obj('Doc2Vec_Set2_train_tagged_BIGRAMS_ADDED')
test_tagged = load_obj('Doc2Vec_Set2_test_tagged_BIGRAMS_ADDED')

In [18]:
def build_vector(model, tagged_docs):
    sentences = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sentences])
    return targets, regressors

In [19]:
y_train, X_train = build_vector(model_dbow, train_tagged)
y_test, X_test = build_vector(model_dbow, test_tagged)

y_test, y_train = np.asarray(y_test), np.asarray(y_train)
X_test, X_train = np.asarray(X_test), np.asarray(X_train)

In [20]:
print('Training features:\n', X_train, '\n\nTesting features:\n', X_test)

Training features:
 [[ 0.5243982  -0.03275125  0.05669038 ...  1.0344319  -0.596706
   0.20215616]
 [-0.10164765 -0.04866321 -0.10057602 ...  0.7482713   1.3736737
  -0.16044329]
 [ 1.5054115   0.26774165  0.5422134  ... -0.00344261  0.18010773
   1.9383588 ]
 ...
 [ 0.53142196  0.8812689   0.15873995 ... -1.3987981  -0.40616184
   0.4942059 ]
 [-0.2781031  -0.66414523 -0.20700526 ... -0.12378209  0.14133808
   0.15886061]
 [ 0.75472564 -1.5386854  -0.48001447 ... -0.02232955 -1.1940879
   0.64506173]] 

Testing features:
 [[-0.23453255 -0.60255736 -0.27274948 ... -0.06704514 -1.2434884
   0.16366035]
 [ 0.09970373  0.43510538  0.0881969  ...  0.44452217 -0.22727357
   1.1556609 ]
 [-0.01916254  0.60981625 -0.35722786 ...  0.45252728 -0.75482935
  -0.252179  ]
 ...
 [-0.12269566  0.17664967  0.801761   ... -0.02225018 -0.5020351
   0.29009673]
 [ 0.30077466 -0.19669685 -0.04734763 ...  0.20968905  0.5462311
   0.6462368 ]
 [ 0.5650323   0.24459806 -0.02922523 ...  0.02188571  0.4730297

In [21]:
print(type(X_test), type(y_test), type(X_train), type(y_train))
print('Training features shape: ', X_train.shape, '\nTesting features shape: ', X_test.shape)
print('Training labels shape: ', y_train.shape, '\nTesting labels shape: ', y_test.shape)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'numpy.ndarray'>
Training features shape:  (1400, 50) 
Testing features shape:  (600, 50)
Training labels shape:  (1400,) 
Testing labels shape:  (600,)


In [22]:
print('Training labels:\n', y_train, '\n\nTesting labels:\n', y_test)

Training labels:
 [1 1 1 ... 0 0 0] 

Testing labels:
 [0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 1 0
 0 1 0 0 1 0 1 1 0 1 1 0 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 0 1 1 0 1 0 0 0 0
 0 1 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 0 0 0 1 0 0 1 0 0
 0 1 1 1 1 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 0 1 0 0 1 1 1
 0 1 0 0 0 1 0 0 1 0 0 1 1 0 1 0 0 1 0 1 0 1 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0
 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1
 0 1 0 0 0 1 0 1 0 1 1 1 0 0 1 1 1 1 0 0 1 0 1 0 1 0 0 0 1 1 0 1 0 1 0 0 0
 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1 1 0
 0 1 1 0 1 1 0 1 0 0 0 1 1 0 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 0 0 1 1 1 0 1 0
 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 0 1 1 0 0 1 1 1 1 1 0 0
 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1
 1 1 0 1 0 0 1 0 1 1 0 0 0 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 0 0 1 1
 0 1 0 0 1 1 0 1 1 1 0 0 0 0 1 0 0 0 1 0 0 1 

# --- END OF DOC2VEC ---

# --- CLASSIFICATION ---

## Create Training & Testing Sets

In [23]:
tr = X_train, y_train.reshape(X_train.shape[0], 1) #X_train_variety.reshape(X_train.shape[0], 1), y_train.reshape(X_train.shape[0], 1)
ts = X_test, y_test.reshape(X_test.shape[0], 1)    #X_test_variety.reshape(X_test.shape[0], 1), y_test.reshape(X_test.shape[0], 1)
TRAIN = np.hstack(tr)
TEST = np.hstack(ts)

# Columns = [0, 1, ..., 48, 49]: One for each dimension of the document vectors
columns = list(range(X_train.shape[1]))
for i in range(len(columns)):
    columns[i]=str(columns[i])
columns.append('label')

TRAIN_df = pd.DataFrame(TRAIN, columns=columns)
TEST_df = pd.DataFrame(TEST, columns=columns)

## Cross Validation

In [24]:
TRAIN_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,label
0,0.524398,-0.032751,0.05669,-0.920059,-0.4211,-0.203994,0.476724,0.570324,-0.183935,2.08444,...,0.365105,0.222436,1.318043,0.170131,0.004944,-1.363862,1.034432,-0.596706,0.202156,1.0
1,-0.101648,-0.048663,-0.100576,1.4073,0.003519,0.121042,0.093383,-1.260965,1.189414,0.04846,...,0.00236,-0.424213,-0.030207,-0.585183,-0.329262,0.881722,0.748271,1.373674,-0.160443,1.0
2,1.505412,0.267742,0.542213,-1.005325,-0.179881,0.468256,-0.317871,-0.495346,1.144418,-0.016699,...,0.120182,-0.258227,0.045762,0.319787,-0.150069,1.334856,-0.003443,0.180108,1.938359,1.0
3,-0.26676,0.871653,-1.397439,-1.590761,-0.4491,-0.310976,0.696647,-0.399794,1.280667,0.093377,...,-0.171234,-0.750394,-0.122897,-1.70645,0.137792,-0.465734,-0.720827,-0.416541,-0.985605,1.0
4,0.406107,-0.119848,-0.459487,-0.564603,-0.869797,-0.120177,0.230116,-0.301367,-0.126521,-0.148975,...,0.728755,-0.240772,-0.423331,-0.604241,0.539562,0.392587,0.577998,0.4899,0.374064,1.0


In [25]:
num_folds=5

# We select k random samples from our dataset, and divide them into num_folds disjoint sets of equal length
indices = TRAIN_df.index.tolist()
cv_dataset_indices = random.sample(population=indices, k=1000)
cv_dataset = TRAIN_df.loc[cv_dataset_indices]
cv_dataset = np.asarray(cv_dataset)
cv = cross_validation_fold_split(dataset=cv_dataset, folds = num_folds)
cv = np.asarray(cv)

## Single Decision Tree (PRUNED)

In [26]:
# We then train our model(s) on num_folds-1 of the sets and evaluate on the final set (giving every set a chance to be the evaluation set)
print("TREE CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=i)
    cv_tree = decision_tree_algorithm(df=df_cv_train, ml_task='classification', max_depth=10)
    
    j = random_exclude(excluded=i, range_list=range(num_folds))
    _, df_val = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=j)
    cv_tree_pruned = post_pruning(cv_tree, df_cv_train, df_val, ml_task="classification")
    
    accuracy = calculate_accuracy(df_cv_test, cv_tree)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy
    
cv_accuracy = total_accuracy/num_folds
print('Cross Validation Accuracy: ', cv_accuracy)

TREE CROSS VALIDATION RESULTS
Cross Validation Split Shape:  (5, 200, 51)


FEATURE_TYPES:

 50 


Accuracy for Test Fold:  0   0.735


FEATURE_TYPES:

 50 


Accuracy for Test Fold:  1   0.78


FEATURE_TYPES:

 50 


Accuracy for Test Fold:  2   0.73


FEATURE_TYPES:

 50 


Accuracy for Test Fold:  3   0.77


FEATURE_TYPES:

 50 


Accuracy for Test Fold:  4   0.77
Cross Validation Accuracy:  0.757


## Random Forest (PRUNED)

In [None]:
print("RANDOM FOREST CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=TRAIN_df, test_set_index=i)
    cv_forest = multiprocessor_random_forest_algorithm(train_df=df_cv_train, n_trees=50, n_bootstrap=180, n_features=9999, 
                                                tree_max_depth=10, ml_task='classification')
    accuracy, predictions = calculate_forest_accuracy(df_cv_test, cv_forest)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy

cv_accuracy = total_accuracy/num_folds
print('\n\nCross Validation Accuracy: ', cv_accuracy)

# --- WORD2VEC ---

In [66]:
sentences = filtered_df['description'].copy()
sentences

0       [traditions, merlot, features, grapes, differe...
1       [deep, purple, color, layered, aromatics, blac...
2       [beautiful, bottle, aged, aromas, revealing, f...
3       [insoglio, del, cinghiale, foundation, wine, t...
4       [black, cherry, cranberry, vanilla, roasting, ...
                              ...                        
1995    [estate, chardonnay, refreshing, crisp, floral...
1996    [delle, venezie, italy, tre, region, made, thr...
1997    [grown, hillside, santa, maria, bench, barbara...
1998    [framed, light, lemony, acidity, vibrant, mine...
1999    [popularity, demand, dry, riesling, sweet, tho...
Name: description, Length: 2000, dtype: object

In [67]:
token_count = sum([len(sentence) for sentence in sentences])
print('The wine corpus contains {0:,} tokens'.format(token_count))

The wine corpus contains 87,333 tokens


In [68]:
import gensim.models.word2vec as w2v

num_features = 300
min_word_count = 1
num_workers = multiprocessing.cpu_count()
context_size = 10
downsampling = 1e-3
seed=42

wine2vec = w2v.Word2Vec(sg=1, seed=seed, workers=num_workers, size=num_features, min_count=min_word_count, window=context_size, sample=downsampling)
wine2vec.build_vocab(sentences)

In [69]:
print('Word2Vec vocabulary length:', len(wine2vec.wv.vocab))
print(wine2vec.corpus_count)

Word2Vec vocabulary length: 8510
2000


In [70]:
wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)

  wine2vec.train(sentences, total_examples=wine2vec.corpus_count, epochs=wine2vec.iter)


(401623, 436665)

In [71]:
def get_word_vector(model, word):
    return model.wv[word]

df_new = filtered_df.copy()
df_new['description'] = df_new['description'].transform(lambda x: [get_word_vector(model=wine2vec, word=word) for word in x])
df_new['description']

0       [[-0.032882363, -0.015514586, -0.03601328, 0.0...
1       [[-0.15854508, 0.30994657, 0.0013985692, -0.11...
2       [[0.008912634, 0.2211171, 0.09175639, -0.06798...
3       [[-0.015497052, -0.0043974225, -0.01539303, 0....
4       [[-0.1530068, 0.33843523, 0.16195166, -0.07445...
                              ...                        
1995    [[-0.03134968, -0.01678482, -0.0066042803, -0....
1996    [[-0.025455082, -0.006362297, -0.016307851, 0....
1997    [[-0.032143537, -0.012416244, 0.006038067, -0....
1998    [[-0.027692253, 0.31596544, 0.20393181, -0.036...
1999    [[-0.03012332, -0.018497268, -0.037862334, 0.0...
Name: description, Length: 2000, dtype: object

In [72]:
print('There are {} features for each description:\n\n'.format(df_new['description'].iloc[0][0].shape[0])) 
df_new['description']= [np.sum(df_new['description'].iloc[i], axis=0) for i in range(len(df_new['description']))]
df_new['description']

There are 300 features for each description:




0       [-3.2449803, 9.214225, 0.38900614, -2.4666688,...
1       [-2.4679615, 8.614912, 2.0543778, -1.1914982, ...
2       [-2.1554103, 5.142528, 1.2000762, -0.8881511, ...
3       [-2.9846292, 6.2345457, 0.2841835, -0.79096794...
4       [-2.1482818, 10.142084, 3.112584, -1.9630648, ...
                              ...                        
1995    [-0.09059665, 0.59008676, 1.8152654, 1.9556797...
1996    [-0.8923338, 4.583701, 3.9287446, 0.57219845, ...
1997    [-1.8054626, 5.4407167, 3.1684525, -0.64351463...
1998    [-0.5464104, 1.6006874, 1.8525207, 2.287367, -...
1999    [-2.8178847, 3.8953316, 2.4202645, 0.9484857, ...
Name: description, Length: 2000, dtype: object

In [73]:
df_new = df_new[['description', 'label']]

train_df, test_df = train_test_split(df=df_new, test_size=0.3)
print('train_df: \n{}'.format(train_df), '\n\n', 'test_df: \n{}'.format(test_df))

train_df: 
                                            description  label
0     [-3.2449803, 9.214225, 0.38900614, -2.4666688,...      1
1     [-2.4679615, 8.614912, 2.0543778, -1.1914982, ...      1
3     [-2.9846292, 6.2345457, 0.2841835, -0.79096794...      1
4     [-2.1482818, 10.142084, 3.112584, -1.9630648, ...      1
5     [-1.7842454, 7.4831624, 0.5973474, -2.5305257,...      1
...                                                 ...    ...
1994  [-0.019414235, 7.5020313, 5.814876, 0.76685077...      0
1995  [-0.09059665, 0.59008676, 1.8152654, 1.9556797...      0
1997  [-1.8054626, 5.4407167, 3.1684525, -0.64351463...      0
1998  [-0.5464104, 1.6006874, 1.8525207, 2.287367, -...      0
1999  [-2.8178847, 3.8953316, 2.4202645, 0.9484857, ...      0

[1400 rows x 2 columns] 

 test_df: 
                                            description  label
153   [-0.7823523, 6.136071, 0.8799547, -1.7928402, ...      1
1333  [-0.7810608, 8.100519, 6.598667, -0.1386536, -...      0
491   

In [74]:
X=np.asarray([train_df.iloc[i][0] for i in range(len(train_df))])
y=np.asarray(train_df.label).reshape(X.shape[0],1)

In [75]:
Xtest=np.asarray([test_df.iloc[i][0] for i in range(len(test_df))])
ytest=np.asarray(test_df.label).reshape(Xtest.shape[0],1)

In [76]:
X, y

(array([[ -3.2449803 ,   9.214225  ,   0.38900614, ..., -14.482726  ,
         -11.38087   ,  -7.6981416 ],
        [ -2.4679615 ,   8.614912  ,   2.0543778 , ..., -10.133245  ,
          -7.85954   ,  -5.5641994 ],
        [ -2.9846292 ,   6.2345457 ,   0.2841835 , ..., -12.646857  ,
          -9.072554  ,  -5.4624825 ],
        ...,
        [ -1.8054626 ,   5.4407167 ,   3.1684525 , ..., -10.309441  ,
          -8.465744  ,  -5.362092  ],
        [ -0.5464104 ,   1.6006874 ,   1.8525207 , ..., -10.3321495 ,
          -8.065953  ,  -4.666677  ],
        [ -2.8178847 ,   3.8953316 ,   2.4202645 , ..., -13.915132  ,
         -11.083644  ,  -5.9618073 ]], dtype=float32),
 array([[1],
        [1],
        [1],
        ...,
        [0],
        [0],
        [0]], dtype=int64))

In [77]:
X.shape, y.shape, Xtest.shape, ytest.shape

((1400, 300), (1400, 1), (600, 300), (600, 1))

In [78]:
columns = ['_'+str(i)+'_' for i in range(0,300)]
columns.append('target_label')
D = pd.DataFrame(data=np.concatenate((X,y), axis=1), columns=columns)
T = pd.DataFrame(data=np.concatenate((Xtest,ytest), axis=1), columns=columns)

In [49]:
save_obj(X, 'X')
save_obj(y, 'y')
save_obj(D, 'D')
save_obj(T, 'T')

In [7]:
X, y, D, T = load_obj('X'), load_obj('y'), load_obj('D'), load_obj('T')

In [79]:
D

Unnamed: 0,_0_,_1_,_2_,_3_,_4_,_5_,_6_,_7_,_8_,_9_,...,_291_,_292_,_293_,_294_,_295_,_296_,_297_,_298_,_299_,target_label
0,-3.244980,9.214225,0.389006,-2.466669,-10.108183,-5.349817,3.663701,-6.674496,-3.192860,-0.020569,...,3.266598,-9.103728,10.761422,7.853642,-6.807631,-11.809257,-14.482726,-11.380870,-7.698142,1.0
1,-2.467962,8.614912,2.054378,-1.191498,-4.495191,-3.291983,0.057604,-5.222356,-3.280240,-0.862789,...,5.248476,-6.066744,3.794762,5.778044,-4.476419,-6.985113,-10.133245,-7.859540,-5.564199,1.0
2,-2.984629,6.234546,0.284184,-0.790968,-7.697112,-4.354197,2.343929,-4.118157,-2.299702,-0.969656,...,3.313917,-8.509166,10.492888,7.616368,-4.438893,-10.610935,-12.646857,-9.072554,-5.462482,1.0
3,-2.148282,10.142084,3.112584,-1.963065,-5.608982,-3.803071,-0.209855,-6.279403,-4.356566,-0.283045,...,5.195980,-6.178975,4.177288,6.340839,-4.801833,-7.717837,-10.322958,-9.140126,-6.472419,1.0
4,-1.784245,7.483162,0.597347,-2.530526,-6.072765,-2.718414,2.127453,-4.741004,-1.905276,0.291880,...,2.386324,-4.541891,4.129561,4.933238,-4.609901,-6.064564,-7.777425,-6.084000,-4.322505,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1395,-0.019414,7.502031,5.814876,0.766851,-3.070222,-3.554436,-3.886941,-3.782379,-5.996253,-2.069686,...,6.950454,-8.104604,7.740019,7.116932,-2.408879,-9.247143,-10.376175,-10.049765,-6.210229,0.0
1396,-0.090597,0.590087,1.815265,1.955680,-1.017433,-1.686828,-1.714668,-0.239967,-2.067820,-2.066956,...,2.653470,-5.235921,6.025311,2.179415,-1.323213,-4.543195,-5.856121,-4.781673,-2.858314,0.0
1397,-1.805463,5.440717,3.168453,-0.643515,-6.843784,-2.577157,0.319431,-3.165191,-2.531389,-0.510012,...,2.706594,-8.551054,11.039748,7.269344,-2.962161,-9.371391,-10.309441,-8.465744,-5.362092,0.0
1398,-0.546410,1.600687,1.852521,2.287367,-3.114530,-3.411109,-0.876414,-1.460925,-2.559625,-2.372160,...,4.194547,-8.384384,10.655892,4.800525,-2.094435,-9.022962,-10.332150,-8.065953,-4.666677,0.0


## Cross Validation

In [80]:
num_folds=5

# We select k random samples from our dataset, and divide them into num_folds disjoint sets of equal length
indices = D.index.tolist()
cv_dataset_indices = random.sample(population=indices, k=500)
cv_dataset = D.loc[cv_dataset_indices]
cv_dataset = np.asarray(cv_dataset)
cv = cross_validation_fold_split(dataset=cv_dataset, folds = num_folds)
cv = np.asarray(cv)

## SINGLE DECISION TREE (PRUNED)

In [52]:
# We then train our model(s) on num_folds-1 of the sets and evaluate on the final set (giving every set a chance to be the evaluation set)
print("TREE CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=D, test_set_index=i)
    cv_tree = decision_tree_algorithm(df=df_cv_train, ml_task='classification', max_depth=10)
    
    j = random_exclude(excluded=i, range_list=range(num_folds))
    _, df_val = cross_validation_train_test_split(cv_set=cv, df=D, test_set_index=j)
    cv_tree_pruned = post_pruning(cv_tree, df_cv_train, df_val, ml_task="classification")
    
    accuracy = calculate_accuracy(df_cv_test, cv_tree_pruned)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy
    
cv_accuracy = total_accuracy/num_folds
print('Cross Validation Accuracy: ', cv_accuracy)

TREE CROSS VALIDATION RESULTS
Cross Validation Split Shape:  (5, 100, 301)
Accuracy for Test Fold:  0   0.97
Accuracy for Test Fold:  1   0.93
Accuracy for Test Fold:  2   0.89
Accuracy for Test Fold:  3   0.97
Accuracy for Test Fold:  4   0.95
Cross Validation Accuracy:  0.942


In [82]:
cv_tree_pruned

{'_90_ <= 0.859322726726532': [{'_135_ <= -0.2206837236881256': [0.0,
    {'_135_ <= 0.32357239723205566': [{'_75_ <= 0.6699714660644531': [1.0,
        0.0]},
      1.0]}]},
  1.0]}

## RANDOM FOREST (PRUNED)

In [53]:
print("RANDOM FOREST CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=D, test_set_index=i)
    cv_forest = multiprocessor_random_forest_algorithm(train_df=df_cv_train, n_trees=50, n_bootstrap=175, n_features=9999, 
                                                tree_max_depth=10, ml_task='classification')
    accuracy, predictions = calculate_forest_accuracy(df_cv_test, cv_forest, ml_task="classification")
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy

cv_accuracy = total_accuracy/num_folds
print('\n\nCross Validation Accuracy: ', cv_accuracy)

RANDOM FOREST CROSS VALIDATION RESULTS
Cross Validation Split Shape:  (5, 100, 301)

Time taken to build and prune forest = 556.6860954761505 seconds

Forest contains: 50 trees
precision: 	 0.98
recall: 	 0.98
fscore: 	 0.98
support:	 None
Accuracy for Test Fold:  0   0.98

Time taken to build and prune forest = 412.11194109916687 seconds

Forest contains: 50 trees
precision: 	 0.9502403846153845
recall: 	 0.95
fscore: 	 0.9500250626566417
support:	 None
Accuracy for Test Fold:  1   0.95


Process ForkPoolWorker-56:
Process ForkPoolWorker-4:
Process ForkPoolWorker-82:
Process ForkPoolWorker-146:
Process ForkPoolWorker-52:
Process ForkPoolWorker-98:
Process ForkPoolWorker-85:
Process ForkPoolWorker-103:
Process ForkPoolWorker-92:
Process ForkPoolWorker-101:
Process ForkPoolWorker-148:
Process ForkPoolWorker-113:
Process ForkPoolWorker-105:
Process ForkPoolWorker-64:
Process ForkPoolWorker-20:
Process ForkPoolWorker-6:
Process ForkPoolWorker-31:
Process ForkPoolWorker-38:
Process ForkPoolWorker-13:
Process ForkPoolWorker-138:
Process ForkPoolWorker-8:
Process ForkPoolWorker-117:
Process ForkPoolWorker-116:
Process ForkPoolWorker-45:
Process ForkPoolWorker-100:
Process ForkPoolWorker-33:
Process ForkPoolWorker-91:
Process ForkPoolWorker-28:
Process ForkPoolWorker-62:
Process ForkPoolWorker-134:
Process ForkPoolWorker-68:
Process ForkPoolWorker-70:
Process ForkPoolWorker-46:
Process ForkPoolWorker-29:
Process ForkPoolWorker-42:
Process ForkPoolWorker-126:
Process ForkPoolWor

KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-127:
Process ForkPoolWorker-119:
Process ForkPoolWorker-136:
Process ForkPoolWorker-53:
Traceback (most recent call last):
Process ForkPoolWorker-84:
Process ForkPoolWorker-97:
Process ForkPoolWorker-80:
Traceback (most recent call last):
Process ForkPoolWorker-99:
Process ForkPoolWorker-131:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-142:
Process ForkPoolWorker-83:
  File "/opt/conda/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Process ForkPoolWorker-11:
Process ForkPoolWorker-71:
Traceback (most recent call last):
Process ForkPoolWorker-107:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-122:
Traceback (most recent call last

# BAG OF WORDS

## Building the Vocabulary

In [51]:
vocabulary = []
for description in filtered_df['description']:
    for token in description:
        if token not in vocabulary:
            vocabulary.append(token)

In [62]:
save_obj(vocabulary, 'GC_SET2_BoW_WineVocabulary')

In [41]:
start = random.randint(0, len(vocabulary)-81)
end = start + 80
vocab_sample = vocabulary[start:end]
print('Length of wine vocabulary: {}'.format(len(vocabulary)), '\n\n', 'Random Sample of Vocabulary: \n{}'.format(vocab_sample))

Length of wine vocabulary: 8510 

 Random Sample of Vocabulary: 
['packs', 'myriad', 'trave', 'moka', 'another years', 'jewel', 'rightly', 'king', 'verona', 'trays', 'january', 'pouring', 'botanical', 'attractively', 'sperss', 'climatic', 'induced', 'seghesio', 'seventh', 'formally', 'crack', 'medley', 'airing', 'flushes', 'vegtables', 'plantings', 'casablanca', 'implement', 'aggressive', 'sumptuous', 'barbeques', 'sports', 'impeccable', 'livio', 'felluga', 'vertigo', 'currange', 'ready drink', 'flores', 'zuccardi', 'bluish', 'comparable', 'georges', 'normally', 'versions', 'arcus', 'confectionary', 'mulling', 'succulence', 'persisting', 'tartare', 'cassoulet', 'rabbit', 'responsible', 'piluna', 'scrub', 'underneath', 'turns', 'patton', 'pantry', 'loamy', 'quietly', 'fails', 'regarded', 'aperitif', 'wafting', 'mildly', 'stuff', 'breathes', 'folks', 'impatient', 'cream sauces', 'wadenswil', 'overlays', 'assertive', 'inkwell', 'bottom', 'brewed', 'teenager', 'talk']


In [42]:
# Create a dictionary to store the counts and a dictionary to store feature word synonyms
dict_count = {}
dict_syns = {}
            
# Find synonyms of feature words and store them in dict_syns
for word in vocabulary:
    dict_syns[word] = []
    dict_syns[word].append(word)
    for syn in wn.synsets(word):
        for lem in syn.lemmas():
            if lem.name().replace('_', ' ').lower() not in dict_syns[word]:
                dict_syns[word].append(lem.name().replace('_', ' ').lower()) 

In [65]:
save_obj(dict_syns, 'GC_SET2_BoW_DictSynonyms')

In [43]:
descriptions_array = np.asarray(filtered_df['description'])              
for word in dict_syns.keys():
    dict_count[word] = 0
    #Note that the descriptions stored in filtered_df are already tokenized
    for description in descriptions_array: 
        for i in range(len(dict_syns[word])):
            if dict_syns[word][i] in description:
                dict_count[word] += 1

In [67]:
save_obj(dict_count, 'GC_SET2_BoW_DictCount')

In [44]:
dict_count_sample = [(key,value) for key,value in dict_count.items() if key in vocab_sample]
print('Length of wine count dictionary: {}'.format(len(dict_count)), '\n\n', 'Random Sample of count dicti: \n{}'.format(dict_count_sample))

Length of wine count dictionary: 8510 

 Random Sample of count dicti: 
[('packs', 77), ('myriad', 4), ('trave', 1), ('moka', 1), ('another years', 4), ('jewel', 5), ('rightly', 2), ('king', 40), ('verona', 1), ('trays', 1), ('january', 3), ('pouring', 10), ('botanical', 1), ('attractively', 73), ('sperss', 1), ('climatic', 1), ('induced', 81), ('seghesio', 1), ('seventh', 1), ('formally', 3), ('crack', 75), ('medley', 3), ('airing', 17), ('flushes', 253), ('vegtables', 1), ('plantings', 43), ('casablanca', 5), ('implement', 8), ('aggressive', 5), ('sumptuous', 54), ('barbeques', 8), ('sports', 13), ('impeccable', 2), ('livio', 1), ('felluga', 1), ('vertigo', 1), ('currange', 1), ('ready drink', 7), ('flores', 1), ('zuccardi', 2), ('bluish', 38), ('comparable', 115), ('georges', 2), ('normally', 4), ('versions', 11), ('arcus', 1), ('confectionary', 2), ('mulling', 5), ('succulence', 6), ('persisting', 23), ('tartare', 1), ('cassoulet', 1), ('rabbit', 3), ('responsible', 1), ('piluna', 

In [46]:
labels = np.asarray(filtered_df['label'])
descriptions = np.asarray(filtered_df['description'])

In [47]:
print(labels, labels.shape)

[1 1 1 ... 0 0 0] (2000,)


In [48]:
# OneHot encode our feature words for the model
feature_matrix = np.empty((len(labels), len(vocabulary)))
for i in range(len(labels)):
    for j in range(len(vocabulary)):
        for k in dict_syns[vocabulary[j]]:
            if k in descriptions[i]:
                feature_matrix[i, j] = 1

feature_matrix = np.concatenate((feature_matrix, labels.reshape(feature_matrix.shape[0], 1)), axis=1)
print(feature_matrix, '\n', feature_matrix.shape)

[[1. 1. 1. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 1. 0.]] 
 (2000, 8511)


In [None]:
save_obj(feature_matrix, 'GC_SET2_BoW_FeatureMatrix')

In [None]:
dict_syns = load_obj('GC_SET2_BoW_DictSynonyms')
dict_count = load_obj('GC_SET2_BoW_DictCount')
vocabulary = load_obj('GC_SET2_BoW_WineVocabulary')
feature_matrix = load_obj('GC_SET2_BoW_FeatureMatrix')

## Training Testing Split

In [56]:
columns = vocabulary
columns.append('target_label')
BoW_df = pd.DataFrame(data=feature_matrix, columns=columns)
BoW_df

Unnamed: 0,traditions,merlot,features,grapes,different,sites,columbia,valley,diversity,terroir,...,market,homerun,proposition,crave,revival,rising,ashes,artwork,portraying,target_label
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
TRAIN, TEST = train_test_split(df=BoW_df, test_size=0.3)
save_obj(TRAIN, 'TRAIN_BoW_WINEMAKER_DATASET')
save_obj(TEST, 'TEST_BoW_WINEMAKER_DATASET')

## Cross Validation

In [60]:
num_folds=5

# We select k random samples from our dataset, and divide them into num_folds disjoint sets of equal length
indices = TRAIN.index.tolist()
cv_dataset_indices = random.sample(population=indices, k=500)
cv_dataset = TRAIN.loc[cv_dataset_indices]
cv_dataset = np.asarray(cv_dataset)
cv = cross_validation_fold_split(dataset=cv_dataset, folds = num_folds)
cv = np.asarray(cv)

## SINGLE TREE (PRUNED)

In [61]:
# We then train our model(s) on num_folds-1 of the sets and evaluate on the final set (giving every set a chance to be the evaluation set)
print("TREE CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=TRAIN, test_set_index=i)
    cv_tree = decision_tree_algorithm(df=df_cv_train, ml_task='classification', max_depth=10)
    
    j = random_exclude(excluded=i, range_list=range(num_folds))
    _, df_val = cross_validation_train_test_split(cv_set=cv, df=TRAIN, test_set_index=j)
    cv_tree_pruned = post_pruning(cv_tree, df_cv_train, df_val, ml_task="classification")
    
    accuracy = calculate_accuracy(df_cv_test, cv_tree_pruned)
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy
    
cv_accuracy = total_accuracy/num_folds
print('Cross Validation Accuracy: ', cv_accuracy)

TREE CROSS VALIDATION RESULTS
Cross Validation Split Shape:  (5, 100, 8511)
Accuracy for Test Fold:  0   0.91
Accuracy for Test Fold:  1   0.94
Accuracy for Test Fold:  2   0.93
Accuracy for Test Fold:  3   0.95
Accuracy for Test Fold:  4   0.98
Cross Validation Accuracy:  0.9420000000000002


In [63]:
cv_tree_pruned

{'black = 1.0': [1.0,
  {'crimson = 1.0': [{'apples = 1.0': [0.0,
      {'grapefruits = 1.0': [0.0, 1.0]}]},
    {'cabernets = 1.0': [1.0,
      {'tannins = 1.0': [1.0,
        {'syrah = 1.0': [1.0,
          {'blueberry = 1.0': [1.0,
            {'nobility = 1.0': [1.0,
              {'cherries = 1.0': [1.0,
                {'winemaking = 1.0': [{'tonic = 1.0': [0.0, 1.0]},
                  0.0]}]}]}]}]}]}]}]}]}

## RANDOM FOREST (PRUNED)

In [None]:
print("RANDOM FOREST CROSS VALIDATION RESULTS")
print('Cross Validation Split Shape: ', cv.shape)

total_accuracy = 0
for i in range(num_folds):
    df_cv_train, df_cv_test = cross_validation_train_test_split(cv_set=cv, df=D, test_set_index=i)
    cv_forest = multiprocessor_random_forest_algorithm(train_df=df_cv_train, n_trees=50, n_bootstrap=175, n_features=9999, 
                                                tree_max_depth=10, ml_task='classification')
    accuracy, predictions = calculate_forest_accuracy(df_cv_test, cv_forest, ml_task="classification")
    print("Accuracy for Test Fold: ", i, " ", accuracy)
    total_accuracy += accuracy

cv_accuracy = total_accuracy/num_folds
print('\n\nCross Validation Accuracy: ', cv_accuracy)