In [None]:
##INSPIRATIONS##

#http://ramhiser.com/2012/11/23/how-to-download-kaggle-data-with-python-and-requests-dot-py/
#https://www.kaggle.com/c/word2vec-nlp-tutorial/details/part-1-for-beginners-bag-of-words
#https://github.com/dandxy89/DeepLearning_MachineLearning/blob/master/EmbeddingKeras/imdb_embedding_w2v.py

In [1]:
import zipfile, requests, StringIO
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import matplotlib.pyplot as plt
from sklearn import metrics
from __future__ import division
import numpy as np
import gensim
import sqlite3
import nltk
#nltk.download() #only do this once
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

%matplotlib inline
pd.set_option('max_colwidth', 200)

<h2>Load Data</h2>

In [2]:
##LOAD THE DATA SETS FROM KAGGLE LOCALLY##


download_data =False

# The local path where the data set is saved.
local_filename = "C:\Users\machine\Desktop\MeetupJuly2016"


if download_data:
    
    # Kaggle Username and Password
    kaggle_info = {'UserName': "XXXXX", 'Password': "XXXX"}
   
    # The direct link to the Kaggle data set
    data_url = ['https://www.kaggle.com/snap/amazon-fine-food-reviews/downloads/amazon-fine-foods-release-2016-01-08-20-34-54.zip']


    for url in data_url:
        # Attempts to download the CSV file. Gets rejected because we are not logged in.
        r = requests.get(url)
        # Login to Kaggle and retrieve the data.
        r = requests.post(r.url, data = kaggle_info)
        z = zipfile.ZipFile(StringIO.StringIO(r.content))
        z.extractall(local_filename)


connection = sqlite3.connect(local_filename+'\\amazon-fine-foods\\database.sqlite')
reviews = pd.read_sql_query(""" SELECT Score, Summary, Text FROM Reviews WHERE Score != 3 """, connection)

   
print reviews.shape
reviews.head(n=10)


(525814, 3)


Unnamed: 0,Score,Summary,Text
0,5,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labr...
1,1,Not as Advertised,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo""."
2,4,"""Delight"" says it all","This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with ..."
3,2,Cough Medicine,If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda. The fl...
4,5,Great taffy,"Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal."
5,4,Nice Taffy,"I got a wild hair for taffy and ordered this five pound bag. The taffy was all very enjoyable with many flavors: watermelon, root beer, melon, peppermint, grape, etc. My only complaint is there wa..."
6,5,Great! Just as good as the expensive brands!,"This saltwater taffy had great flavors and was very soft and chewy. Each candy was individually wrapped well. None of the candies were stuck together, which did happen in the expensive version, ..."
7,5,"Wonderful, tasty taffy",This taffy is so good. It is very soft and chewy. The flavors are amazing. I would definitely recommend you buying it. Very satisfying!!
8,5,Yay Barley,Right now I'm mostly just sprouting this so my cats can eat the grass. They love it. I rotate it around with Wheatgrass and Rye too
9,5,Healthy Dog Food,This is a very healthy dog food. Good for their digestion. Also good for small puppies. My dog eats her required amount at every feeding.


<h2>Functions</h2>

In [3]:
def binarize_score(score):
    """
    set scores of 1-3 to 0 and 4-5 as 1
    """
    
    if score <3:
        return 0
    else:
        return 1




##CLEAN /PROCESS REVIEWS AND RETURN LIST OR STRING##

def prepare_text(raw, remove_stopwords=False, stem=False, return_string=False ):
    
    #1. Remove HTML and make lower case
    cleaned = BeautifulSoup(raw,"lxml").get_text().lower()
    
    #perhaps useful for sentiment analysis.....
    #2. Replace numbers, smiliey and frown faces, ! and ? with coded word SM{int} in case these are valuable
    cleaned=re.sub(r'[0-9]+',r' DEG', cleaned) #replace numbers with a token
    cleaned=re.sub("(:\))",r' SM1',cleaned) #smiley
    cleaned=re.sub("(:\()",r' SM2',cleaned) #frown
    cleaned=re.sub("(!)",r' SM3',cleaned) #exclame
    cleaned=re.sub("(\?)",r' SM4',cleaned) #question
    
    cleaned=re.sub("'s","",cleaned) #remove 's
    cleaned=re.sub("'","",cleaned) #remove '
    
    
    #3. keep 'not' and the next word as negation may be important.
    cleaned=re.sub(r"not\s\b(.*?)\b", r"not_\1", cleaned)
    
    
    #4.keep letters (hyphens) and the coded tokens above, replace the rest with whitespace
    cleaned=re.sub("[^\-a-zA-ZSM\d]"," ",cleaned)  
    
    #5.Split into individual words on whitespace
    cleaned = cleaned.split()                             
    
      
    #6.Remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english")) 
        cleaned = [w for w in cleaned if not w in stops]   
   
    #7. Stem
    if stem:
        cleaned=[porter_stemmer.stem(w) for w in cleaned]
        
    
    #8.Concatenate back to a string?
    if return_string:
        cleaned= " ".join( cleaned )
    
    return(cleaned)



##RETURN LIST OF TOKENS##
def token_list(raw_string):
    tokens=raw_string.split()
              
    return tokens



##RETURN PERFORMANCE FOR BINARY CLASSIFIER##

def binary_perform(true,pred):
    print 'AUC: ',metrics.roc_auc_score(true,pred)
    print 'Accuracy: ', metrics.accuracy_score(true,(pred>0.5))
    return (pd.DataFrame(metrics.confusion_matrix(true,(pred>0.5)),index=['True_NEG','True_POS'],columns=['Pred_NEG','Pred_POS']))
    
    
    

##AVERAGE WORD VECTORS##
def avg_word_vectors(wordlist,model,size):
    """
    returns a vector of zero for reviews containing words where none of them
    met the min_count or were not seen in the training set
    
    Otherwise return an average of the embeddings vectors
    
    """
    
    sumvec=np.zeros(shape=(1,size)) #initialize correct size zero vector
    wordcnt=0
    
    for w in wordlist:
        if w in model: #if the word is in the word2vec model
            sumvec += model[w] #add to sum vector
            wordcnt +=1  #incremental counter
    
    sumvec=pd.Series(sumvec.reshape(size,))
    
    if wordcnt ==0:
        return sumvec
    
    else:
        return sumvec / wordcnt


    
    

<h2>Clean and process the reviews </h2>

In [4]:
##CLEAN AND PROCESS REVIEWS - BOTH AS LIST AND STRING ##
##NOT STEM TO ALLOW BETTER WORD SIM UNDERSTANDING##

reviews['Score_binary']=reviews['Score'].apply(binarize_score)

reviews['summary_str']=reviews['Summary'].apply(prepare_text,remove_stopwords=True,stem=False, return_string=True)
reviews['summary_lst']=reviews['summary_str'].apply(token_list)

reviews['text_str']=reviews['Text'].apply(prepare_text,remove_stopwords=True, stem=False,return_string=True)
reviews['text_lst']=reviews['text_str'].apply(token_list)

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [5]:
reviews.head()

Unnamed: 0,Score,Summary,Text,Score_binary,summary_str,summary_lst,text_str,text_lst
0,5,Good Quality Dog Food,I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labr...,1,good quality dog food,"[good, quality, dog, food]",bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better,"[bought, several, vitality, canned, dog, food, products, found, good, quality, product, looks, like, stew, processed, meat, smells, better, labrador, finicky, appreciates, product, better]"
1,1,Not as Advertised,"Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as ""Jumbo"".",0,advertised,[advertised],product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted sure error vendor intended represent product jumbo,"[product, arrived, labeled, jumbo, salted, peanuts, peanuts, actually, small, sized, unsalted, sure, error, vendor, intended, represent, product, jumbo]"
2,4,"""Delight"" says it all","This is a confection that has been around a few centuries. It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with ...",1,delight says,"[delight, says]",confection around centuries light pillowy citrus gelatin nuts - case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven chewy flavorful highly recommend yummy treat fam...,"[confection, around, centuries, light, pillowy, citrus, gelatin, nuts, -, case, filberts, cut, tiny, squares, liberally, coated, powdered, sugar, tiny, mouthful, heaven, chewy, flavorful, highly, ..."
3,2,Cough Medicine,If you are looking for the secret ingredient in Robitussin I believe I have found it. I got this in addition to the Root Beer Extract I ordered (which was good) and made some cherry soda. The fl...,0,cough medicine,"[cough, medicine]",looking secret ingredient robitussin believe found got addition root beer extract ordered good made cherry soda flavor medicinal,"[looking, secret, ingredient, robitussin, believe, found, got, addition, root, beer, extract, ordered, good, made, cherry, soda, flavor, medicinal]"
4,5,Great taffy,"Great taffy at a great price. There was a wide assortment of yummy taffy. Delivery was very quick. If your a taffy lover, this is a deal.",1,great taffy,"[great, taffy]",great taffy great price wide assortment yummy taffy delivery quick taffy lover deal,"[great, taffy, great, price, wide, assortment, yummy, taffy, delivery, quick, taffy, lover, deal]"


In [34]:
##TRAIN AND TEST##

X_train, X_test, y_train, y_test = train_test_split(reviews, reviews.Score_binary, test_size=0.20, random_state=64)

print 'X_train shape: ',X_train.shape
print 'y_train shape: ',y_train.shape
print 'X_test shape: ',X_test.shape
print 'y_test shape: ',y_test.shape

print 'Columns in X: ',X_train.columns

X_train shape:  (420651, 8)
y_train shape:  (420651L,)
X_test shape:  (105163, 8)
y_test shape:  (105163L,)
Columns in X:  Index([u'Score', u'Summary', u'Text', u'Score_binary', u'summary_str',
       u'summary_lst', u'text_str', u'text_lst'],
      dtype='object')


<h2>This is the classic TFIDF linear model</h2>

In [159]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),max_df=0.80, min_df=5) #these parameters 
                                                                    #should be optimized further!
vectorizer.fit(X_train.text_str)
X_train_tfid=vectorizer.transform(X_train.text_str)
X_test_tfid=vectorizer.transform(X_test.text_str)

print 'X_train_tfid shape: ',X_train_tfid.shape
print 'X_test shape: ',X_test_tfid.shape

X_train_tfid shape:  (420651, 25929)
X_test shape:  (105163, 25929)


In [160]:
##FIT A BASIC REGULARIZED LOGISTIC REGRESSION##

LRcv=LogisticRegressionCV(cv=5) #use 5-fold cv to find regularization parameter C 
                                #(smaller C = more regularization)
LRcv.fit(X_train_tfid,y_train)

#predict test set
preds_LR = LRcv.predict_proba(X_test_tfid)[:,1]

#performance
binary_perform(y_test,preds_LR)

AUC:  0.962539264655
Accuracy:  0.935928035526


Unnamed: 0,Pred_NEG,Pred_POS
True_NEG,12195,4319
True_POS,2419,86230


<h2>Train Word2Vec</h2>

In [23]:
#train word2vec using skipgram model (sg=1), sampling 10 negative examples (negative=10)
#400 dimensional word vectors (size=400), window size of 5 words on each side (window=5)
#words have to be seen atleast 5 times across all documents (min_count=5)


model_w2v = gensim.models.Word2Vec(X_train.text_lst,sg=1, negative=10, size=400, window=5, min_count=5, workers=4)

In [20]:
model_w2v['bad'] #400 dimensional vector represents the word 'bad'

array([ -1.23606861e-01,  -2.68316627e-01,   2.98433360e-02,
        -2.98076477e-02,  -1.19921602e-01,   1.98405474e-01,
         1.50522530e-01,   1.11300781e-01,   6.35064617e-02,
         1.39275640e-01,   8.23053345e-02,   1.77483201e-01,
        -7.47933537e-02,  -2.06616428e-02,   3.54353734e-03,
        -4.15252447e-02,  -8.61085504e-02,   1.58804003e-03,
         9.39110592e-02,  -1.70067266e-01,  -7.99810365e-02,
        -4.85614017e-02,   2.99121857e-01,  -1.58234909e-02,
        -5.69150201e-04,   9.85797867e-02,  -2.13606358e-01,
        -1.57882750e-01,  -1.49433687e-01,  -1.72859594e-01,
         2.11090073e-02,   1.79099903e-01,   1.27554843e-02,
         1.92327932e-01,  -8.08020234e-02,  -2.29380384e-01,
        -1.03765607e-01,  -9.97763593e-04,  -5.28124869e-02,
        -4.51258004e-01,   3.30734823e-04,   1.48674130e-01,
        -2.62362659e-01,  -3.93698998e-02,   6.72200695e-02,
         1.88156322e-01,  -1.53170601e-01,  -2.27943912e-01,
        -1.24653969e-02,

In [22]:
model_w2v.most_similar(['bad'],topn=25) #top 25 words most similiar to bad vector 
                                        #(probably need more training data)

[(u'terrible', 0.4946768283843994),
 (u'cruddy', 0.4914777874946594),
 (u'woah', 0.48704078793525696),
 (u'bad-', 0.48413723707199097),
 (u'heyyyy', 0.4811605215072632),
 (u'nasty', 0.4763026833534241),
 (u'allerges', 0.4748576879501343),
 (u'version-', 0.4692271947860718),
 (u'rap', 0.4672160744667053),
 (u'short-changed', 0.46421128511428833),
 (u'rowdy', 0.46168819069862366),
 (u'undeserved', 0.46112364530563354),
 (u'plastic-y', 0.4593057930469513),
 (u'effecting', 0.4588324725627899),
 (u'bizarrely', 0.4578687846660614),
 (u'awry', 0.45783835649490356),
 (u'jtc', 0.45778846740722656),
 (u'good', 0.4556412696838379),
 (u'mediciny', 0.4542664885520935),
 (u'fine-so', 0.45211514830589294),
 (u'overdried', 0.44963207840919495),
 (u'first--great', 0.44939422607421875),
 (u'punish', 0.4482073187828064),
 (u'odder', 0.44737327098846436),
 (u'repulsive', 0.44680100679397583)]

In [24]:
##AVERAGE WORD VECTORS FOR EACH REVIEW##

X_train_avg=X_train.text_lst.apply(avg_word_vectors,model=model_w2v,size=400)
X_test_avg=X_test.text_lst.apply(avg_word_vectors,model=model_w2v,size=400)

print X_train_avg.shape
X_train_avg.head()

(420651, 400)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,390,391,392,393,394,395,396,397,398,399
11548,-0.027505,-0.109973,-0.178321,-0.002176,-0.009496,-0.055786,-0.00133,-0.03373,0.049621,0.03898,...,0.123395,-0.21521,-0.109778,0.068481,0.090897,0.060064,-0.054043,0.146011,-0.06331,0.060052
328858,0.05539,-0.030188,-0.104777,-0.056711,0.043442,-0.011004,0.042148,-0.058843,0.092274,-0.018265,...,0.104028,-0.127704,-0.03887,0.061692,0.101253,0.060593,0.03321,0.213583,-0.022627,0.022856
454122,0.014845,0.081913,-0.186295,-0.080111,-0.011029,0.005394,0.074054,-0.049991,0.138257,-0.012932,...,0.064803,-0.038005,-0.09377,0.055847,0.002942,-0.004128,0.008277,0.111504,-0.03502,0.035007
463521,0.057556,-0.061966,-0.098184,-0.032439,-0.051118,-0.038383,0.038852,0.003071,0.066702,0.058439,...,0.121871,-0.261655,-0.046266,0.023426,0.1461,0.068661,0.007724,0.137245,-0.004498,-0.047226
63873,-0.001678,-0.135287,-0.14527,-0.00033,-0.00339,-0.125997,0.09337,-0.00709,0.085538,0.019231,...,0.133823,-0.214508,0.023021,0.050308,0.109792,0.025889,-0.065633,0.18886,-0.059327,-0.00137


In [25]:
##TRAIN SAME TYPE OF REGULARIZED LOGISTIC REGRESSION##

LRcv_w2v=LogisticRegressionCV(cv=5) #use 5-fold cv to find regularization 
                                    #parameter C (smaller C = more regularization)
LRcv_w2v.fit(X_train_avg,y_train)

#predict test set
preds_LR_w2v = LRcv_w2v.predict_proba(X_test_avg)[:,1]

#performance
binary_perform(y_test,preds_LR_w2v)

AUC:  0.946360363942
Accuracy:  0.920580432281


Unnamed: 0,Pred_NEG,Pred_POS
True_NEG,10597,5917
True_POS,2435,86214


<h2>LSTM RNN</h2>

In [24]:
from gensim.corpora.dictionary import Dictionary

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout
np.random.seed(1337) # sets seed used by Keras

This can be really powerful but we will wait for a week without a GPU......So we will sample and run the code just as illustration. The performance is pretty good for only being trained on a small number of records, for a few epochs and with a simple model - we could add additional layers, play around with learning rates etc.

In [35]:
gensim_dict = Dictionary() #empty dictionary class from gensim  mapping between words and their integer ids

"""
model_w2v.vocab.keys() is a list of all tokens from the word2vec model
add to the dictionary each token from word2vec model
doc2bow will create a mapping of  the word list to (token_id, token_count) 2-tuples
"""
gensim_dict.doc2bow(model_w2v.vocab.keys(),allow_update=True)


"""
now our gensim_dict will allow us to iterate through each token and its assigned integer id
gensim_dict.items() return list of tuples of int ID and word.... 
[(0, u'individual-sized'),
 (1, u'woods'),
 (2, u'clotted'),
 .........
]

w2indx is a dictionary with the word as the key and integer id as the value
(note this is same as gensim_dict.id2token but reversing key and value)

"""

w2indx = {v: k+1 for k, v in gensim_dict.items()} #adding 1 to not use zero which will be used as padding



In [36]:
def parsedata(word_list):
    new_list=[]
    for word in word_list: #for each word in the list of words
        try:
            new_list.append(w2indx[word]) #append the integer mapped to the word to the list 
        except:
            new_list.append(0) #else add zero vector if the word was not found
    return new_list



In [37]:
train=X_train.text_lst.apply(parsedata)
test=X_test.text_lst.apply(parsedata)


In [38]:
n_symbols =len(w2indx)+1 #number of words plus 1 for zero (padding)
vocab_dim = 400 #size of word2vec word embeddings


embedding_weights = np.zeros((n_symbols, vocab_dim))

for word, index in w2indx.iteritems():
    embedding_weights[index, :] = model_w2v[word] #doesnt use index of 0 since that was not created in model_w2v dictionary

embedding_weights=[embedding_weights] #keras expects as a list with a single element of size (n_symbols, vocab_dim)

In [39]:
maxlen=50 #longest word list to allow - pad with zero to the left

train = sequence.pad_sequences(train.values, maxlen=maxlen) #use values to get the arrays from pandas
test = sequence.pad_sequences(test.values, maxlen=maxlen)

In [40]:
train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,  4419,     0, 37190, 41767, 26184, 24983,
       22817, 24541, 15862, 16647,  6826, 21248,  6376, 15093, 22140,
       22789, 15516, 35281,  7006, 41225, 41767,  6173, 32635, 23604,
       36496, 11819,  5038, 22789, 12785])

In [62]:
#sample 50% of train to build a model without GPU
train2, val, y_train2, y_val = train_test_split(train, y_train, test_size=0.50, random_state=64)
print 'train2 shape: ',train2.shape
print 'y_train2 shape: ',y_train2.shape
print 'val shape: ',val.shape
print 'y_val shape: ',y_val.shape

train2 shape:  (210325L, 50L)
y_train2 shape:  (210325L,)
val shape:  (210326L, 50L)
y_val shape:  (210326L,)


In [66]:
model = Sequential()  
model.add(Embedding(output_dim=vocab_dim,
                    input_dim=n_symbols,
                    mask_zero=True,
                    weights=embedding_weights,
                    input_length=maxlen))  
model.add(LSTM(vocab_dim))
model.add(Dropout(0.5))
model.add(Dense(50, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam')

model.fit(x=train2, y=y_train2, validation_data= (val,y_val), batch_size=128, nb_epoch=3)

Train on 210325 samples, validate on 210326 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11c6b48d0>

In [67]:
p=model.predict_proba(test)



In [68]:
#performance
binary_perform(y_test,p)

AUC:  0.968653307164
Accuracy:  0.945560701007


Unnamed: 0,Pred_NEG,Pred_POS
True_NEG,13657,2857
True_POS,2868,85781
