# Feature Extraction

In [1]:
import pandas as pd
import numpy as np
import sys
import nltk
import warnings
warnings.simplefilter(action='ignore')

In [9]:
# Load the wine-review dataset; the first CSV column becomes the row index.
df = pd.read_csv('wine_review_top10_variety.csv', index_col=0)
# Only the last expression of a cell is rendered, so the (rows, columns) shape
# is the single thing shown here. Preview rows with df.head() in its own cell.
df.shape

(68220, 2)

In [3]:
# Execute the companion notebook so that its definitions become available here.
# Presumably this is where `normalize_corpus` (called in a later cell) comes from — TODO confirm.
# NOTE(review): the recorded output shows this also triggers a pip install (html.parser).
%run ./Text_Normalization_Function.ipynb

Collecting html.parser
Installing collected packages: html.parser
Successfully installed html.parser
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 19.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  ['<', 'p', '>', 'The', 'circus', 'dog', 'in', 'a', 'plissé', 'skirt', 'jumped', 'over', 'Python', 'who', 'was', "n't", 'that', 'large', ',', 'just', '3', 'feet', 'long.', '<', '/p', '>']
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  <p>Th

In [4]:
from sklearn import metrics 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [5]:
# Normalize every review text. `normalize_corpus` is not defined in this
# notebook; it is expected to already be in the kernel namespace.
norm_description = normalize_corpus(df['description'])

In [6]:
# Sanity check: show the first review before and after normalization.
first_raw_text = df['description'].iloc[0]
first_normalized_text = norm_description[0]

print('RAW TEXT: ', first_raw_text)
print()
print('NORMALIZED TEXT: ', first_normalized_text)

RAW TEXT:  Pineapple rind, lemon pith and orange blossom start off the aromas. The palate is a bit more opulent, with notes of honey-drizzled guava and mango giving way to a slightly astringent, semidry finish.

NORMALIZED TEXT:  pineapple rind lemon pith orange blossom start aroma palate bit opulent note honey drizzled guava mango give way slightly astringent semidry finish


In [7]:
# Replace the raw review text with its normalized form.
# NOTE: this mutates `df` in place, so the raw text is no longer available from
# `df` afterwards and re-running the normalization cell would normalize twice.
df['description']=norm_description

In [8]:
# Distinct class labels (wine varieties), in order of first appearance.
target_names = pd.unique(df['variety'])
target_names

array(['Riesling', 'Pinot Noir', 'Cabernet Sauvignon', 'Chardonnay',
       'Red Blend', 'Sauvignon Blanc', 'Bordeaux-style Red Blend', 'Rosé',
       'Syrah'], dtype=object)

**Please separate the data into a test dataset and a training dataset at this point.**

In [10]:
def train_test_with_words(df, test_size=13700, random_state=None):
    """Draw a random train/test split of ``df`` and vectorize the text three ways.

    The vectorizers are fitted on the training descriptions only and then
    applied to the test descriptions, so no test vocabulary leaks into the
    training features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a 'description' column (the text to vectorize) and a
        'variety' column (the class label).
    test_size : int, default 13700
        Number of rows sampled without replacement for the test set. The
        remaining rows form the training set.
    random_state : int or None, default None
        Forwarded to ``DataFrame.sample``. ``None`` gives a different split on
        every call; pass an integer to make the split reproducible.

    Returns
    -------
    tuple
        ``(test, train, feature_matrix_bow, feature_matrix_2grams,
        feature_matrix_3grams, feature_matrix_TEST_bow,
        feature_matrix_TEST_2grams, feature_matrix_TEST_3grams)`` where

        * ``test`` / ``train`` are dicts with keys 'data' (descriptions) and
          'target' (varieties),
        * the ``feature_matrix_*`` entries are the training matrices
          (bag-of-words counts, 2-gram TF-IDF, 3-gram TF-IDF) cast to float,
        * the ``feature_matrix_TEST_*`` entries are the test matrices produced
          by the vectorizers fitted on the training data.

    Raises
    ------
    ValueError
        If ``test_size`` leaves no rows for training.
    """
    if test_size >= len(df):
        raise ValueError(
            "test_size (%d) must be smaller than the number of rows (%d)"
            % (test_size, len(df))
        )

    # Hold out `test_size` rows; every row whose index label was not sampled
    # becomes training data.
    test_rows = df.sample(n=test_size, replace=False, axis=0, random_state=random_state)
    train_rows = df.drop(index=test_rows.index)

    test = {'data': test_rows['description'], 'target': test_rows['variety']}
    train = {'data': train_rows['description'], 'target': train_rows['variety']}

    # Bag of words: raw term counts with English stop words removed.
    vectorizer_bow = CountVectorizer(stop_words='english')
    feature_matrix_bow = vectorizer_bow.fit_transform(train['data']).astype(float)

    # TF-IDF over 2-grams only.
    vectorizer_2grams = TfidfVectorizer(ngram_range=(2, 2))
    feature_matrix_2grams = vectorizer_2grams.fit_transform(train['data']).astype(float)

    # TF-IDF over 3-grams only.
    vectorizer_3grams = TfidfVectorizer(ngram_range=(3, 3))
    feature_matrix_3grams = vectorizer_3grams.fit_transform(train['data']).astype(float)

    # Test features reuse the vocabularies learned from the training split.
    # Unlike the training matrices, these are returned without the float cast.
    feature_matrix_TEST_bow = vectorizer_bow.transform(test['data'])
    feature_matrix_TEST_2grams = vectorizer_2grams.transform(test['data'])
    feature_matrix_TEST_3grams = vectorizer_3grams.transform(test['data'])

    return (
        test,
        train,
        feature_matrix_bow,
        feature_matrix_2grams,
        feature_matrix_3grams,
        feature_matrix_TEST_bow,
        feature_matrix_TEST_2grams,
        feature_matrix_TEST_3grams,
    )

## Model building

In [11]:
# Naive Bayes: accuracy averaged over repeated random train/test splits.
from sklearn.naive_bayes import MultinomialNB  # imported once, not on every iteration

N_RUNS = 10  # number of independent random splits to average over

accuracy_2grams = []
accuracy_3grams = []
accuracy_bow = []
for i in range(N_RUNS):
    # Each run draws a new split and refits the vectorizers on its training part.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    # The confusion matrices below are overwritten on every run, so only the
    # last run's matrices remain available after the loop.

    # using 2 grams
    clf_2grams = MultinomialNB(alpha=0.1)
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    # using 3 grams
    clf_3grams = MultinomialNB(alpha=0.1)
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    # using bag of words
    clf_bow = MultinomialNB(alpha=0.1)
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Mean accuracy across the N_RUNS splits for each feature representation.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.736934306569343
Accuracy rate of using 3 grams: 
 0.6989270072992702
Accuracy rate of using bag of words: 
 0.7715255474452555


In [12]:
# Logistic regression (SGD with log loss): accuracy averaged over repeated random splits.
from sklearn import linear_model  # imported once, not on every iteration

N_RUNS = 10  # number of independent random splits to average over

accuracy_2grams = []
accuracy_3grams = []
accuracy_bow = []
for i in range(N_RUNS):
    # Each run draws a new split and refits the vectorizers on its training part.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    # NOTE(review): newer scikit-learn releases rename loss='log' to
    # loss='log_loss'; the original spelling is kept for the environment this
    # notebook was run in.
    # The `predicted_nb_*` names are carried over from the Naive Bayes cell and
    # are kept unchanged in case later cells read them. The confusion matrices
    # are overwritten on every run, so only the last run's remain afterwards.

    # using 2 grams
    clf_2grams = linear_model.SGDClassifier(loss='log')
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    # using 3 grams
    clf_3grams = linear_model.SGDClassifier(loss='log')
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    # using bag of words
    clf_bow = linear_model.SGDClassifier(loss='log')
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Mean accuracy across the N_RUNS splits for each feature representation.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.6605839416058394
Accuracy rate of using 3 grams: 
 0.48320437956204376
Accuracy rate of using bag of words: 
 0.8223941605839415


In [13]:
# Linear SVM (SGD with hinge loss): accuracy averaged over repeated random splits.
from sklearn import linear_model  # imported once, not on every iteration

N_RUNS = 10  # number of independent random splits to average over

accuracy_2grams = []
accuracy_3grams = []
accuracy_bow = []
for i in range(N_RUNS):
    # Each run draws a new split and refits the vectorizers on its training part.
    (test, train,
     feature_matrix_bow, feature_matrix_2grams, feature_matrix_3grams,
     feature_matrix_TEST_bow, feature_matrix_TEST_2grams,
     feature_matrix_TEST_3grams) = train_test_with_words(df)

    # The `predicted_nb_*` names are carried over from the Naive Bayes cell and
    # are kept unchanged in case later cells read them. The confusion matrices
    # are overwritten on every run, so only the last run's remain afterwards.

    # using 2 grams
    clf_2grams = linear_model.SGDClassifier(loss='hinge')
    clf_2grams.fit(feature_matrix_2grams, train['target'])
    predicted_nb_2grams = clf_2grams.predict(feature_matrix_TEST_2grams)
    cm_2grams = metrics.confusion_matrix(test['target'], predicted_nb_2grams)
    accuracy_2grams.append(metrics.accuracy_score(test['target'], predicted_nb_2grams))

    # using 3 grams
    clf_3grams = linear_model.SGDClassifier(loss='hinge')
    clf_3grams.fit(feature_matrix_3grams, train['target'])
    predicted_nb_3grams = clf_3grams.predict(feature_matrix_TEST_3grams)
    cm_3grams = metrics.confusion_matrix(test['target'], predicted_nb_3grams)
    accuracy_3grams.append(metrics.accuracy_score(test['target'], predicted_nb_3grams))

    # using bag of words
    clf_bow = linear_model.SGDClassifier(loss='hinge')
    clf_bow.fit(feature_matrix_bow, train['target'])
    predicted_nb_bow = clf_bow.predict(feature_matrix_TEST_bow)
    cm_bow = metrics.confusion_matrix(test['target'], predicted_nb_bow)
    accuracy_bow.append(metrics.accuracy_score(test['target'], predicted_nb_bow))

# Mean accuracy across the N_RUNS splits for each feature representation.
print("Accuracy rate of using 2 grams: \n", np.mean(accuracy_2grams))
print("Accuracy rate of using 3 grams: \n", np.mean(accuracy_3grams))
print("Accuracy rate of using bag of words: \n", np.mean(accuracy_bow))

Accuracy rate of using 2 grams: 
 0.7590510948905109
Accuracy rate of using 3 grams: 
 0.6987591240875912
Accuracy rate of using bag of words: 
 0.8132919708029197
