# Feature Extraction

In [1]:
import pandas as pd
import numpy as np
import sys
import nltk
import warnings
# Ignore every warning for the rest of the session (no category filter is given).
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.simplefilter(action='ignore')

In [2]:
# Load the dataset; the first CSV column is used as the row index.
df = pd.read_csv('top_24_winery.csv',index_col = 0)
# Preview the first rows.
df.head()

Unnamed: 0,winery,description,points,price
0,Domaine Zind-Humbrecht,"Rich gold in color. Broad, layered aromas of v...",90,84.0
1,Testarossa,"Cooked cranberry is spiced with anise, pepperc...",91,64.0
2,Robert Mondavi,"Pithy, with grapefruit and lemon peel flavors,...",90,20.0
3,Testarossa,"Overly sweet and simple, and something of a di...",85,49.0
4,Robert Mondavi,"With rich, sweet blackberry and cocoa flavors,...",87,28.0


In [3]:
# Row count of the loaded table.
len(df)

4435

In [4]:
# Execute the companion notebook in this kernel so that the names it defines
# become available here (normalize_corpus, called below, is presumably one of
# them - confirm). Running it also emits that notebook's own printed output.
%run ./Text_Normalization_Function.ipynb

Collecting html.parser
Installing collected packages: html.parser
Successfully installed html.parser
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  ['<', 'p', '>', 'The', 'circus', 'dog', 'in', 'a', 'plissé', 'skirt', 'jumped', 'over', 'Python', 'who', 'was', "n't", 'that', 'large', ',', 'just', '3', 'feet', 'long.', '<', '/p', '>']
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  <p>Th

In [5]:
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [6]:
# Normalize every description. normalize_corpus is not defined in this notebook;
# presumably it comes from the notebook executed via %run - confirm.
norm_description = normalize_corpus(df.description)

In [7]:
# Show the first description before and after normalization, with one blank
# line between the two.
print('RAW TEXT: ', np.asarray(df.description)[0], end='\n\n')
print('NORMALIZED TEXT: ', norm_description[0])

RAW TEXT:  Rich gold in color. Broad, layered aromas of very ripe fruit with hints of sweet smoke, brown sugar, honey and a savory earthy minerality. Full-bodied, bone dry, but richly textured with crisp acidity and a wide palette of flavors, ripe stone fruit, a creamy savory earthiness, sage and a balsamic kick on the finish. Very long length, slightly warm but with a lingering savory, mineral finish.

NORMALIZED TEXT:  rich gold color broad layer aroma ripe fruit hint sweet smoke brown sugar honey savory earthy minerality full bodied bone dry richly textured crisp acidity wide palette flavor ripe stone fruit creamy savory earthiness sage balsamic kick finish long length slightly warm linger savory mineral finish


In [33]:
# Replace the raw descriptions with their normalized form.
# NOTE(review): this mutates df in place, so from here on df.description no
# longer holds the raw text; re-running the normalization or the raw-text
# preview cell afterwards would operate on already-normalized text.
df['description']=norm_description

In [34]:
# Distinct winery labels, in order of first appearance in the frame.
target_names = pd.unique(df['winery'])
target_names

array(['Domaine Zind-Humbrecht', 'Testarossa', 'Robert Mondavi',
       'Louis Latour', 'Undurraga', 'Chehalem', 'Iron Horse', 'Foxen',
       'Chateau Ste. Michelle', 'Wines & Winemakers', 'Georges Duboeuf',
       'Maryhill', 'Fess Parker', 'Santa Ema', 'Kendall-Jackson',
       'Williams Selyem', 'Concha y Toro', 'Trapiche',
       'Feudi di San Gregorio', 'Louis Jadot', 'Bründlmayer',
       'DFJ Vinhos', 'V. Sattui', 'Columbia Crest', 'Montes',
       'Casa Santos Lima', 'Jean-Luc and Paul Aegerter',
       'Chanson Père et Fils', 'Lynmar', 'Siduri', 'Kunde',
       'Gary Farrell', 'Albert Bichot'], dtype=object)

**Please separate the data into a test dataset and a training dataset at this point.**

In [47]:
def train_test_with_words(df, test_size=887, random_state=None):
    """Split ``df`` into test/train sets and vectorize the descriptions of both.

    A random sample of ``test_size`` rows (drawn without replacement) becomes
    the test set; every remaining row is used for training. Three vectorizers
    are fitted on the training descriptions only and are then applied to the
    test descriptions, so nothing is learned from the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'description'`` column (text) and a ``'winery'``
        column (label).
    test_size : int, default 887
        Number of rows held out for testing. The default is the value that
        used to be hard-coded in this function.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``. ``None`` keeps the previous
        behavior (a different split on every call); pass an integer to get a
        repeatable split.

    Returns
    -------
    tuple
        ``(test, train, feature_matrix_bow, feature_matrix_2grams,
        feature_matrix_3grams, feature_matrix_TEST_bow,
        feature_matrix_TEST_2grams, feature_matrix_TEST_3grams)``.
        ``test`` and ``train`` are dicts with the keys ``'data'``
        (descriptions) and ``'target'`` (winery labels); the remaining items
        are the matrices produced by the vectorizers for the training and
        test descriptions respectively.
    """
    # Hold out the test rows; whatever is left is the training set.
    # NOTE(review): dropping by index label assumes the labels in df.index are
    # unique - with duplicates, more rows than sampled would be removed.
    test_rows = df.sample(n=test_size, replace=False, axis=0, random_state=random_state)
    train_rows = df.drop(axis=0, index=test_rows.index)

    test = {'data': test_rows['description'], 'target': test_rows['winery']}
    train = {'data': train_rows['description'], 'target': train_rows['winery']}

    # Bag of words: raw term counts with English stop words removed.
    vectorizer_bow = CountVectorizer(stop_words = 'english')
    feature_matrix_bow = vectorizer_bow.fit_transform(train['data']).astype(float)

    # 2-grams: TF-IDF weighted word pairs.
    vectorizer_2grams = TfidfVectorizer(ngram_range = (2,2))
    feature_matrix_2grams = vectorizer_2grams.fit_transform(train['data']).astype(float)

    # 3-grams: TF-IDF weighted word triples.
    vectorizer_3grams = TfidfVectorizer(ngram_range = (3,3))
    feature_matrix_3grams = vectorizer_3grams.fit_transform(train['data']).astype(float)

    # The test descriptions are only transformed with the vectorizers fitted
    # above; they are never used for fitting.
    feature_matrix_TEST_bow = vectorizer_bow.transform(test['data'])
    feature_matrix_TEST_2grams = vectorizer_2grams.transform(test['data'])
    feature_matrix_TEST_3grams = vectorizer_3grams.transform(test['data'])

    return test,train,feature_matrix_bow,feature_matrix_2grams,feature_matrix_3grams,feature_matrix_TEST_bow,feature_matrix_TEST_2grams,feature_matrix_TEST_3grams

## Model building

In [48]:
# Naive Bayes: mean test accuracy over 10 train/test splits, reported
# separately for the 2-gram, 3-gram and bag-of-words features.
from sklearn.naive_bayes import MultinomialNB  # imported once, not on every loop pass

accuracy_2grams=[]
accuracy_3grams=[]
accuracy_bow=[]
for i in range (0,10):
    # Every pass draws a new split and rebuilds the features from it.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    #using 2 grams
    clf_2grams = MultinomialNB(alpha=0.1)
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    # The confusion matrices are not used in the averages below; after the
    # loop they hold the values of the last split only.
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    #using 3 grams
    clf_3grams = MultinomialNB(alpha=0.1)
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    #using bag of words
    clf_bow = MultinomialNB(alpha=0.1)
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Average accuracy across the splits.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.37440811724915446
Accuracy rate of using 3 grams: 
 0.29255918827508454
Accuracy rate of using bag of words: 
 0.46674182638105977


In [49]:
# Logistic regression (SGDClassifier with the 'log' loss): mean test accuracy
# over 10 train/test splits, reported separately for the 2-gram, 3-gram and
# bag-of-words features.
from sklearn import linear_model  # imported once, not on every loop pass

accuracy_2grams=[]
accuracy_3grams=[]
accuracy_bow=[]
for i in range (0,10):
    # Every pass draws a new split and rebuilds the features from it.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    # NOTE(review): newer scikit-learn releases spell this loss 'log_loss';
    # confirm against the installed version before upgrading.
    # The predicted_nb_* names are kept from the Naive Bayes cell; here they
    # hold the predictions of the SGD classifiers.

    #using 2 grams
    clf_2grams = linear_model.SGDClassifier(loss='log')
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    # The confusion matrices are not used in the averages below; after the
    # loop they hold the values of the last split only.
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    #using 3 grams
    clf_3grams = linear_model.SGDClassifier(loss='log')
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    #using bag of words
    clf_bow = linear_model.SGDClassifier(loss='log')
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Average accuracy across the splits.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.36820744081172496
Accuracy rate of using 3 grams: 
 0.2732807215332581
Accuracy rate of using bag of words: 
 0.4167981961668546


In [50]:
# Linear SVM (SGDClassifier with the 'hinge' loss): mean test accuracy over
# 10 train/test splits, reported separately for the 2-gram, 3-gram and
# bag-of-words features.
from sklearn import linear_model  # imported once, not on every loop pass

accuracy_2grams=[]
accuracy_3grams=[]
accuracy_bow=[]
for i in range (0,10):
    # Every pass draws a new split and rebuilds the features from it.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    # The predicted_nb_* names are kept from the Naive Bayes cell; here they
    # hold the predictions of the SGD classifiers.

    #using 2 grams
    clf_2grams = linear_model.SGDClassifier(loss='hinge')
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    # The confusion matrices are not used in the averages below; after the
    # loop they hold the values of the last split only.
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    #using 3 grams
    clf_3grams = linear_model.SGDClassifier(loss='hinge')
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    #using bag of words
    clf_bow = linear_model.SGDClassifier(loss='hinge')
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Average accuracy across the splits.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.39481397970687715
Accuracy rate of using 3 grams: 
 0.2939120631341601
Accuracy rate of using bag of words: 
 0.3996617812852311
