In [8]:
import pandas as pd
import numpy as np

# reading
df = pd.read_csv('../amazon-fine-food-reviews/Reviews.csv')
print(df.Id.count())

# cleaning
sorted_df = df.sort_values('ProductId', axis=0, ascending=True,inplace=False)
dedupped_df = df.drop_duplicates(subset={"UserId", "ProfileName", "Time","Text"},keep="first", inplace=False)

print(dedupped_df.Id.count())
print("{} % of data cleaned!".format(int(dedupped_df.Id.count()/df.Id.count() * 100) ))
# filtering 5 star products
five_rating_df = dedupped_df[dedupped_df['Score'] == 5]

print(five_rating_df.Id.count())

568454
393933
69 % of data cleaned!
250962


In [14]:
# Bag of words
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vector_array = vectorizer.fit_transform(five_rating_df.head(3).Text.values)
print(vectorizer.get_feature_names())
print(vector_array.toarray()) 

['I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.'
 'Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.'
 "This saltwater taffy had great flavors and was very soft and chewy.  Each candy was individually wrapped well.  None of the candies were stuck together, which did happen in the expensive version, Fralinger's.  Would highly recommend this candy!  I served it at a beach-themed party and everyone loved it!"]
['all', 'and', 'appreciates', 'assortment', 'at', 'be', 'beach', 'better', 'bought', 'candies', 'candy', 'canned', 'chewy', 'deal', 'delivery', 'did', 'dog', 'each', 'everyone', 'expensive', 'finicky', 'flavors', 'food', 'found', 'fralinger', 'good', 'great', 'had', 'happen', 'ha

In [9]:
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

input_df = five_rating_df.head(3).Text.values

# nltk.download('stopwords')
stop = set(stopwords.words('english'))
sno = nltk.stem.SnowballStemmer('english')
print(sno.stem('amaze'))
print(sno.stem('amazing'))

def clean_html(s):
    a=re.sub('[|>.*?|\.*|?.*?]',"",s)
    return a
def clean_punc(s):
    a=re.sub('[.|,|!,|)|(|/|\|”|\’|#|@|$|-|%|]',"",s)
    return a
def clean_word(s):
    out = ""
    for w in s:
        if  w not in stop:        
            cleaned_word = clean_punc(w)
            stemmed_word = sno.stem(cleaned_word)
            out = out + stemmed_word
    return out

html_cleaned_sentences = list(map(lambda x:clean_html(x), input_df))
stemmed_word_sentences =  list(map(clean_word, html_cleaned_sentences))
print(stemmed_word_sentences)
vector_array = vectorizer.fit_transform(stemmed_word_sentences)
print(vectorizer.get_feature_names())
print(vector_array.toarray()) 

amaz
amaz
['i hve bugh everl f he vl cnne g f pruc n hve fun he ll  be f g qul the pruc lk re lke  ew hn  prcee e n  ell beer m lbrr  fnck n he pprece h pruc beer hn  ', 'gre ff   gre prce  there w  we ren f u ff  delver w ver quck  if ur  ff lver h   el', "th lwer ff h gre flvr n w ver f n chew  ech cn w nvull wrppe well  nne f he cne were uck geher whch  hppen n he expenve vern frlnger'  wul hghl recen h cn  i erve    bech-hee pr n everne lve "]
['be', 'bech', 'beer', 'bugh', 'chew', 'cn', 'cne', 'cnne', 'delver', 'ech', 'el', 'ell', 'erve', 'everl', 'everne', 'ew', 'expenve', 'ff', 'flvr', 'fnck', 'frlnger', 'fun', 'geher', 'gre', 'he', 'hee', 'hghl', 'hn', 'hppen', 'hve', 'if', 'lbrr', 'lk', 'lke', 'll', 'lve', 'lver', 'lwer', 'nne', 'nvull', 'pprece', 'pr', 'prce', 'prcee', 'pruc', 'quck', 'qul', 're', 'recen', 'ren', 'th', 'the', 'there', 'uck', 'ur', 'ver', 'vern', 'vl', 'we', 'well', 'were', 'whch', 'wrppe', 'wul']
[[0.15040469 0.         0.30080938 0.15040469 0.         0.
  0

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vector_array = vectorizer.fit_transform(five_rating_df.head(3).Text.values)

print(vectorizer.get_feature_names())

['all', 'and', 'appreciates', 'assortment', 'at', 'be', 'beach', 'better', 'bought', 'candies', 'candy', 'canned', 'chewy', 'deal', 'delivery', 'did', 'dog', 'each', 'everyone', 'expensive', 'finicky', 'flavors', 'food', 'found', 'fralinger', 'good', 'great', 'had', 'happen', 'have', 'highly', 'if', 'in', 'individually', 'is', 'it', 'labrador', 'like', 'looks', 'loved', 'lover', 'meat', 'more', 'most', 'my', 'none', 'of', 'party', 'price', 'processed', 'product', 'products', 'quality', 'quick', 'recommend', 'saltwater', 'served', 'several', 'she', 'smells', 'soft', 'stew', 'stuck', 'taffy', 'than', 'the', 'them', 'themed', 'there', 'this', 'to', 'together', 'version', 'very', 'vitality', 'was', 'well', 'were', 'which', 'wide', 'would', 'wrapped', 'your', 'yummy']


In [37]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

# model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# print(model.wv['nice'])
# print(model.wv.similarity('nice', 'good'))
# print(model.wv.most_similar('good'))

sentences = [['this', 'pizza', 'is', 'amazing', 'and', 'delicious'],
			['this', 'pizza', 'is', 'bad', 'and', 'waste'],
			['this', 'pizza', 'is', 'not', 'amazing', 'and', 'bad'],
			['this', 'pizza', 'is', 'tasty', 'and', 'awesome'],
			['this', 'pizza', 'is', 'amazing', 'and', 'awesome']]
w2v_model=Word2Vec(sentences ,min_count=2,size=3, workers=4)
print(list(w2v_model.wv.vocab))
print(w2v_model.wv.most_similar('bad'))



['this', 'pizza', 'is', 'amazing', 'and', 'bad', 'awesome']
[('pizza', 0.49744996428489685), ('this', -0.0525684654712677), ('and', -0.16359305381774902), ('amazing', -0.18564680218696594), ('is', -0.6302595138549805), ('awesome', -0.6873972415924072)]
