In [1]:
import pandas as pd
import gzip
import time
# Install a few python packages using pip
from common import utils
utils.require_package('nltk')
utils.require_package("wget")      # for fetching dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Standard python helper libraries.
from __future__ import print_function
from __future__ import division
import os, sys, time
import collections
import itertools

# Numerical manipulation libraries.
import numpy as np
from scipy import stats, optimize

# NLTK is the Natural Language Toolkit, and contains several language datasets
# as well as implementations of many popular NLP algorithms.
# HINT: You should look at what is available here when thinking about your project!
import nltk

# Helper libraries (see the corresponding py files in this notebook's directory).
from common import utils, vocabulary
import segment

utils.require_package("tqdm")  # for nice progress bars
from tqdm import tqdm as ProgressBar

In [3]:
def parse(path):
  print('start parse')
  start_parse = time.time()
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)
  end_parse = time.time()
  print('end parse with time for parse',end_parse - start_parse)

def getDF(path):
  print('start getDF')
  start = time.time()
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  print('end getDF')
  end = time.time()
  print('time taken to load data = ',end-start)
  return pd.DataFrame.from_dict(df, orient='index')
#df = getDF('reviews_Toys_and_Games.json.gz')
df_vid = getDF('reviews_Video_Games.json.gz')
df_toys = getDF('reviews_Toys_and_Games.json.gz')

start getDF
start parse
end parse with time for parse 88.33158898353577
end getDF
time taken to load data =  88.33197402954102
start getDF
start parse
end parse with time for parse 127.22161197662354
end getDF
time taken to load data =  127.22227787971497


In [4]:
df_aut = getDF('reviews_Automotive.json.gz')

start getDF
start parse
end parse with time for parse 69.44895935058594
end getDF
time taken to load data =  69.44927167892456


In [6]:
df_hnk = getDF('reviews_Home_and_Kitchen.json.gz')

start getDF
start parse
end parse with time for parse 232.55289697647095
end getDF
time taken to load data =  232.5537166595459


In [7]:
print('toys reviews summary')
print(df_toys.shape)
print(df_toys.columns)
df_toys.head(5)
print('video games reviews summary')
print(df_vid.shape)
print(df_vid.columns)
df_vid.head(5)
print('Auto reviews summary')
print(df_aut.shape)
print(df_aut.columns)
df_aut.head(5)
print('Home and Kitchen reviews summary')
print(df_hnk.shape)
print(df_hnk.columns)
df_hnk.head(5)

toys reviews summary
(2252771, 9)
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
video games reviews summary
(1324753, 9)
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
Auto reviews summary
(1373768, 9)
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
Home and Kitchen reviews summary
(4253926, 9)
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A210NOCSTBT4OD,76144011,Sheila,"[0, 0]",Have you ever thought about how you met your b...,4.0,Lovely,1349308800,"10 4, 2012"
1,A28ILV4TOG8BH2,130350591,ccjensen,"[0, 0]","The butter dish is serving us well, and keepin...",5.0,"Nice looking, and keeps the butter fresh",1300752000,"03 22, 2011"
2,A31B4D7URW4DNZ,307394530,3Gigi3,"[11, 16]",I anxiously waited for the book I had pre orde...,2.0,Mother of the Bride,1214784000,"06 30, 2008"
3,A2HU0RPDRZZOP1,307394530,Alexey Leontev,"[0, 0]","Fantastic book, a lot of good, original recipe...",5.0,amazing book,1277337600,"06 24, 2010"
4,A7J0XOW7DYBBD,307394530,Allan Mar Cariaso,"[0, 0]",Can't wait to try all the amazing techniques. ...,5.0,Very helpful,1393113600,"02 23, 2014"


In [8]:
#Count by ratings to determine skew in sample.
print('Ratings distribution for toys',df_toys.groupby('overall').count())
print('Ratings distribution for video games',df_vid.groupby('overall').count())
print('Ratings distribution for automobiles',df_aut.groupby('overall').count())
print('Ratings distribution for home and kitchen',df_hnk.groupby('overall').count())

Ratings distribution for toys          reviewerID     asin  reviewerName  helpful  reviewText  summary  \
overall                                                                    
1.0          192993   192993        192435   192993      192993   192993   
2.0          115801   115801        115416   115801      115801   115801   
3.0          193941   193941        193195   193941      193941   193941   
4.0          407884   407884        406255   407884      407884   407884   
5.0         1342152  1342152       1333623  1342152     1342152  1342152   

         unixReviewTime  reviewTime  
overall                              
1.0              192993      192993  
2.0              115801      115801  
3.0              193941      193941  
4.0              407884      407884  
5.0             1342152     1342152  
Ratings distribution for video games          reviewerID    asin  reviewerName  helpful  reviewText  summary  \
overall                                                    

In [9]:
#Looking at a few examples
print('toys reviews examples\n')
for i in range(5):
    print(df_toys['reviewerID'].iloc[i])
    print(df_toys['reviewText'].iloc[i])

print('\nvideo games reviews examples\n')
for i in range(5):
    print(df_vid['reviewerID'].iloc[i])
    print(df_vid['reviewText'].iloc[i])
    
print('\nautomobile reviews examples\n')
for i in range(5):
    print(df_aut['reviewerID'].iloc[i])
    print(df_aut['reviewText'].iloc[i])
    
print('\nHome and Kitchen reviews examples\n')
for i in range(5):
    print(df_hnk['reviewerID'].iloc[i])
    print(df_hnk['reviewText'].iloc[i])

toys reviews examples

AMEVO2LY6VEJA
Great product, thank you! Our son loved the puzzles.  They have large pieces yet they are still challenging for a 4 year old.
A3C9CSW3TJITGT
I love these felt nursery rhyme characters and scenes.  The quality of the felt is good, and the illustrations are detailed and pretty.  As noted, the figures and scenes are printed on 2 large sheets of flannel and each individual item needs to be cut out.  This process took me 2 hours of tiny cutting.  To me it does not lend itself to a book form but rather laying out the scenes separately or for use on a flannel board.  However, I love the quiet play it offers for my toddler, and as a former Kindergarten teacher, I understand the value of learning rhyme and its connection to future reading.  Overall, delightful product with some work involved.
A31POTIYCKSZ9G
I see no directions for its use. Therefore I have to make up the games, unfortunately.
A2GGHHME9B6W4O
This is a great tool for any teacher using the Pre 

In [36]:
#Get the count by unique product id
tempcnt = df_toys.groupby('asin').size().reset_index()
print('total number of unique products in toys',tempcnt.count())
print('total number of products with at least 5 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 5].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 5].sum())
print('total number of products with at least 20 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 20].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 20].sum())

tempcnt = df_vid.groupby('asin').size().reset_index()
print('total number of unique products in videos',tempcnt.count())
print('total number of products with at least 5 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 5].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 5].sum())
print('total number of products with at least 20 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 20].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 20].sum())

tempcnt = df_aut.groupby('asin').size().reset_index()
print('total number of unique products in automobiles',tempcnt.count())
print('total number of products with at least 5 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 5].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 5].sum())
print('total number of products with at least 20 reviews\n','count of unique products:\n',tempcnt[tempcnt.iloc[:,1] > 20].count(),'sum of their reviews',tempcnt[tempcnt.iloc[:,1] > 20].sum())

total number of unique products in toys asin    327698
0       327698
dtype: int64
total number of products with at least 5 reviews
 count of unique products:
 asin    68782
0       68782
dtype: int64 sum of their reviews asin    0375829695043985589604398935770470182318048645...
0                                                 1775109
dtype: object
total number of products with at least 20 reviews
 count of unique products:
 asin    19992
0       19992
dtype: int64 sum of their reviews asin    043985589604398935770470182318048645195X054534...
0                                                 1275698
dtype: object
total number of unique products in videos asin    50210
0       50210
dtype: int64
total number of products with at least 5 reviews
 count of unique products:
 asin    23866
0       23866
dtype: int64 sum of their reviews asin    043940133X043959136807000266570700099867075853...
0                                                 1266698
dtype: object
total number of products w

In [10]:
#Create train,dev,test split
from sklearn.model_selection import train_test_split
train_toys,devtest = train_test_split(df_toys, test_size=0.4)
dev_toys,test_toys = train_test_split(devtest,test_size = 0.5)
print(train_toys.shape,dev_toys.shape,test_toys.shape)

#For Video games reviews
train_vid,devtest = train_test_split(df_vid, test_size=0.4)
dev_vid,test_vid = train_test_split(devtest,test_size = 0.5)
print(train_vid.shape,dev_vid.shape,test_vid.shape)

#For Auto reviews
train_aut,devtest = train_test_split(df_aut, test_size=0.4)
dev_aut,test_aut = train_test_split(devtest,test_size = 0.5)
print(train_aut.shape,dev_aut.shape,test_aut.shape)

#For Home and Kitchen reviews
train_hnk,devtest = train_test_split(df_hnk, test_size=0.4)
dev_hnk,test_hnk = train_test_split(devtest,test_size = 0.5)
print(train_hnk.shape,dev_hnk.shape,test_hnk.shape)

(1351662, 9) (450554, 9) (450555, 9)
(794851, 9) (264951, 9) (264951, 9)
(824260, 9) (274754, 9) (274754, 9)
(2552355, 9) (850785, 9) (850786, 9)


In [None]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize

In [11]:
def set_df_size(size,data_train,data_dev):
    size_train = size
    len_max_train = data_train[data_train.overall!=3].shape[0] #max possible length of train data set taking out the 3 ratings.
    print("Number of reviews with ratings != 3 in train set",len_max_train)
    temp_size_train = min(len_max_train,size_train)

    len_max_dev = data_dev[data_dev.overall!=3].shape[0]
    print("Number of reviews with ratings != 3 in dev set",len_max_dev)
    temp_size_dev = min(len_max_dev,int(0.3*temp_size_train)) #making the dev set about 0.3 times the train set.

    temp_train_data = data_train[data_train.overall != 3][:temp_size_train]
    print('Size of train data',temp_train_data.shape)
    #print(temp_train_data.groupby('overall').count())
    #print(temp_train_toys[:5])

    temp_dev_data = data_dev[data_dev.overall!=3][:temp_size_dev]
    print('Size of dev data',temp_dev_data.shape)
    #print(temp_dev_data.groupby('overall').count())
    #print(temp_dev_data[:2])
    
    #Binarize ratings
    temp_train_y = np.zeros(temp_size_train)
    temp_train_y[temp_train_data.overall > 3] = 1
    temp_dev_y = np.zeros(temp_size_dev)
    temp_dev_y[temp_dev_data.overall>3] = 1
    print('binarized y shape',temp_train_y.shape,temp_dev_y.shape)
    #print(temp_dev_y[:20],data_dev.overall[:20])
    return temp_train_data,temp_dev_data,temp_train_y,temp_dev_y

In [12]:
size_train = 100000
print('toys reviews\n')
temp_train_toys, temp_dev_toys,temp_train_toys_y,temp_dev_toys_y = set_df_size(size_train,train_toys,dev_toys)
print('\nvideo games reviews\n')
temp_train_vid, temp_dev_vid,temp_train_vid_y,temp_dev_vid_y = set_df_size(size_train,train_vid,dev_vid)
print('\nautomobiles reviews\n')
temp_train_aut, temp_dev_aut,temp_train_aut_y,temp_dev_aut_y = set_df_size(size_train,train_aut,dev_aut)
print('done')
temp_train_hnk, temp_dev_hnk,temp_train_hnk_y,temp_dev_hnk_y = set_df_size(size_train,train_hnk,dev_hnk)
print('done')

toys reviews

Number of reviews with ratings != 3 in train set 1235008
Number of reviews with ratings != 3 in dev set 411966
Size of train data (100000, 9)
Size of dev data (30000, 9)
binarized y shape (100000,) (30000,)

video games reviews

Number of reviews with ratings != 3 in train set 720082
Number of reviews with ratings != 3 in dev set 240212
Size of train data (100000, 9)
Size of dev data (30000, 9)
binarized y shape (100000,) (30000,)

automobiles reviews

Number of reviews with ratings != 3 in train set 762084
Number of reviews with ratings != 3 in dev set 253791
Size of train data (100000, 9)
Size of dev data (30000, 9)
binarized y shape (100000,) (30000,)
done
Number of reviews with ratings != 3 in train set 2345119
Number of reviews with ratings != 3 in dev set 781620
Size of train data (100000, 9)
Size of dev data (30000, 9)
binarized y shape (100000,) (30000,)
done


In [64]:
cnt = collections.Counter()
#print(ds.vocab.num_unigrams)
x_tokens_list = []
start = time.time()
for i in range(temp_train_toys.shape[0]):
#for i in range(5):
    x_tokens = word_tokenize(temp_train_toys.reviewText.iloc[i])
    #print(train_toys.reviewText.iloc[i],x_tokens)
    #print(type(x_tokens),len(x_tokens))
    x_tokens_canonical = utils.canonicalize_words(x_tokens)
    #print(x_tokens_canonical,type(x_tokens_canonical),len(x_tokens_canonical))
    x_tokens_list.append(x_tokens_canonical)
    #print('list',x_tokens_list)
    #print(x_tokens_list.shape)
    for word in x_tokens_canonical:
        cnt[word]+=1
    if i%25000 == 0:
        #print('x_tokens',x_tokens,'x_tokens_canonical',x_tokens_canonical)
        print('done',i)
end = time.time()
print('total time taken to canonicalize',start-end)

print('100 most common words in the dataset:',cnt.most_common(100))

done 0
done 25000
done 50000
done 75000
done 100000
done 125000
done 150000
done 175000
done 200000
done 225000
done 250000
done 275000
done 300000
done 325000
done 350000
done 375000
done 400000
done 425000
done 450000
done 475000
total time taken to canonicalize -442.5564184188843
100 most common words in the dataset: [('.', 1766866), ('the', 1548962), (',', 994173), ('and', 990792), ('it', 968089), ('a', 824326), ('to', 816861), ('i', 770345), ('is', 562166), ('this', 548846), ('for', 538379), ('of', 481605), ('my', 395452), ('!', 361257), ('with', 344227), ('in', 313419), ('that', 299537), ('was', 284199), ('you', 236122), ('but', 231972), ('on', 228584), ('are', 218913), ('have', 209179), ("'s", 203357), ('DG', 200948), ('not', 198800), ('as', 195126), ('so', 187108), ('they', 183769), ("n't", 183235), ('we', 161394), ('he', 157594), ('very', 154850), ('one', 151223), ('great', 146363), ('be', 143362), ('all', 130528), (')', 129631), ('she', 128389), ('DGDG', 125484), ('them', 121

In [30]:
print(len(x_tokens_list))
print(x_tokens_list[:2])
print(x_tokens_canonical[:2])

500000
[['seriously', ',', 'this', 'little', 'cutie', 'pie', 'looks', 'like', 'bird', 'sitting', 'beside', 'my', 'computer', '.', 'i', 'bought', 'the', 'green', 'and', 'white', 'one', 'on', 'a', 'lark', '(', 'no', 'pun', 'intended', '--', 'okay', ',', 'maybe', 'a', 'little', 'pun', ')', 'and', 'then', 'had', 'to', 'have', 'the', 'blue', 'and', 'yellow', 'one', 'as', 'a', 'companion', '.', 'they', 'sit', 'on', 'top', 'of', 'my', 'big', 'computer', 'cpu', 'guarding', 'the', 'on/off', 'button', 'from', 'the', 'paws', 'of', 'my', 'cat', '.', 'well', ',', 'it', 'works', 'for', 'us', '.', 'again', ',', 'these', 'are', 'beautiful', ',', 'very', 'realistic', 'stuffed', 'birds', 'that', 'stand', 'up', 'without', 'any', 'problem', '.', ':', ')'], ['this', 'was', 'a', 'wonderful', 'find', '--', '--', '--', 'terrific', 'price', ',', 'sturdy', 'metal', 'carrying', 'tin', ',', 'up', 'to', 'DG', 'players', ',', 'DG', 'train', 'markers', '(', 'DG', 'for', 'each', 'player', 'plus', 'DG', 'for', 'the', 

In [65]:
cnt_dev = collections.Counter()
#print(ds.vocab.num_unigrams)
x_tokens_list_dev = []
start = time.time()
for i in range(temp_dev_toys.shape[0]):
#for i in range(5):
    x_tokens = word_tokenize(temp_dev_toys.reviewText.iloc[i])
    #print(train_toys.reviewText.iloc[i],x_tokens)
    #print(type(x_tokens),len(x_tokens))
    x_tokens_canonical = utils.canonicalize_words(x_tokens)
    #print(x_tokens_canonical,type(x_tokens_canonical),len(x_tokens_canonical))
    x_tokens_list_dev.append(x_tokens_canonical)
    #print('list',x_tokens_list)
    #print(x_tokens_list.shape)
    for word in x_tokens_canonical:
        cnt_dev[word]+=1
    if i%25000 == 0:
        #print('x_tokens',x_tokens,'x_tokens_canonical',x_tokens_canonical)
        print('done',i)
end = time.time()
print('total time taken to canonicalize dev set',start-end)

print('100 most common words in the dev dataset:',cnt.most_common(100))

done 0
done 25000
done 50000
done 75000
done 100000
done 125000
total time taken to canonicalize dev set -131.94238710403442
100 most common words in the dev dataset: [('.', 1766866), ('the', 1548962), (',', 994173), ('and', 990792), ('it', 968089), ('a', 824326), ('to', 816861), ('i', 770345), ('is', 562166), ('this', 548846), ('for', 538379), ('of', 481605), ('my', 395452), ('!', 361257), ('with', 344227), ('in', 313419), ('that', 299537), ('was', 284199), ('you', 236122), ('but', 231972), ('on', 228584), ('are', 218913), ('have', 209179), ("'s", 203357), ('DG', 200948), ('not', 198800), ('as', 195126), ('so', 187108), ('they', 183769), ("n't", 183235), ('we', 161394), ('he', 157594), ('very', 154850), ('one', 151223), ('great', 146363), ('be', 143362), ('all', 130528), (')', 129631), ('she', 128389), ('DGDG', 125484), ('them', 121653), ('(', 115468), ('would', 114432), ('just', 112185), ('old', 111766), ('can', 111204), ('at', 107540), ('like', 107451), ('game', 106187), ('has', 106

In [66]:
#Create vocabulary on train data set.
vocab = vocabulary.Vocabulary(cnt, size=None)
print('vocab size',vocab.size)
print('100 most common words in the dataset:',cnt.most_common(100))

x_tokens_id_list = []
for i in range(temp_train_toys.shape[0]):
#for i in range(5):
    x_tokens_ids = vocab.words_to_ids(x_tokens_list[i])
    #print (x_tokens_ids)
    x_tokens_id_list.append(x_tokens_ids)
    #print(x_tokens_id_list)

print('id list shape',len(x_tokens_id_list))

print('doing dev set')
x_tokens_id_list_dev = []
for i in range(temp_dev_toys.shape[0]):
#for i in range(5):
    x_tokens_ids = vocab.words_to_ids(x_tokens_list_dev[i])
    #print (x_tokens_ids)
    x_tokens_id_list_dev.append(x_tokens_ids)
    #print(x_tokens_id_list_dev)
print('dev list shape',len(x_tokens_id_list_dev))


vocab size 305505
100 most common words in the dataset: [('.', 1766866), ('the', 1548962), (',', 994173), ('and', 990792), ('it', 968089), ('a', 824326), ('to', 816861), ('i', 770345), ('is', 562166), ('this', 548846), ('for', 538379), ('of', 481605), ('my', 395452), ('!', 361257), ('with', 344227), ('in', 313419), ('that', 299537), ('was', 284199), ('you', 236122), ('but', 231972), ('on', 228584), ('are', 218913), ('have', 209179), ("'s", 203357), ('DG', 200948), ('not', 198800), ('as', 195126), ('so', 187108), ('they', 183769), ("n't", 183235), ('we', 161394), ('he', 157594), ('very', 154850), ('one', 151223), ('great', 146363), ('be', 143362), ('all', 130528), (')', 129631), ('she', 128389), ('DGDG', 125484), ('them', 121653), ('(', 115468), ('would', 114432), ('just', 112185), ('old', 111766), ('can', 111204), ('at', 107540), ('like', 107451), ('game', 106187), ('has', 106167), ('toy', 104604), ('do', 102128), ('when', 100761), ('up', 97644), ('her', 96785), ('or', 95880), ('if', 9

In [16]:
# x = utils.id_lists_to_sparse_bow(df['ids'], self.vocab.size)
# y = np.array(df.label, dtype=np.int32)
    
#train_x_csr = utils.id_lists_to_sparse_bow(x_tokens_id_list, vocab.size)
#train_y = np.array(temp_train_toys.overall, dtype=np.int32)
train_toys_yb = temp_train_toys_y
#dev_x_csr = utils.id_lists_to_sparse_bow(x_tokens_id_list_dev, vocab.size)
#dev_y = np.array(temp_dev_toys.overall, dtype=np.int32)
dev_toys_yb = temp_dev_toys_y

#print("Training set: x = {:s} sparse, y = {:s}".format(str(train_x_csr.shape), 
 #                                               str(train_y.shape)))
#print("Test set:     x = {:s} sparse, y = {:s}".format(str(dev_x_csr.shape), 
 #                                               str(dev_y.shape)))

In [14]:
#converting ratings to tokenized word id counts as a sparse matrix
vect_toys = CountVectorizer() #vectorizer specific to toys
vect_vid = CountVectorizer()  #vectorizer specific to videos
vect_aut = CountVectorizer() #vectorizer specific to automobiles
vect_hnk = CountVectorizer() #vectorizer specific to automobiles

# tokenize train and test text data for toys
train_toys_ids = vect_toys.fit_transform(temp_train_toys['reviewText'])
dev_toys_ids = vect_toys.transform(temp_dev_toys['reviewText'])
print("number words in training corpus for toys:", len(vect_toys.get_feature_names()))
print('toys dataset id shapes',train_toys_ids.shape,dev_toys_ids.shape)

#tokenize train and test text data for videos
train_vid_ids = vect_vid.fit_transform(temp_train_vid['reviewText'])
dev_vid_ids = vect_vid.transform(temp_dev_vid['reviewText'])
print("number words in training corpus for video games:", len(vect_vid.get_feature_names()))
print('videos dataset id shapes',train_vid_ids.shape,dev_vid_ids.shape)

#tokenize train and test text data for automobiles
train_aut_ids = vect_aut.fit_transform(temp_train_aut['reviewText'])
dev_aut_ids = vect_aut.transform(temp_dev_aut['reviewText'])
print("number words in training corpus for automobiles:", len(vect_aut.get_feature_names()))
print('automobile dataset id shapes',train_aut_ids.shape,dev_aut_ids.shape)

#tokenize train and test text data for home and kitchen
train_hnk_ids = vect_hnk.fit_transform(temp_train_hnk['reviewText'])
dev_hnk_ids = vect_hnk.transform(temp_dev_hnk['reviewText'])
print("number words in training corpus for home and kitchen:", len(vect_hnk.get_feature_names()))
print('home and kitchen dataset id shapes',train_hnk_ids.shape,dev_hnk_ids.shape)

number words in training corpus for toys: 63984
toys dataset id shapes (100000, 63984) (30000, 63984)
number words in training corpus for video games: 98899
videos dataset id shapes (100000, 98899) (30000, 98899)
number words in training corpus for automobiles: 59468
automobile dataset id shapes (100000, 59468) (30000, 59468)
number words in training corpus for home and kitchen: 57884
home and kitchen dataset id shapes (100000, 57884) (30000, 57884)


In [15]:
#Cross-tokenization(for comparison of accuracy using transfer learning):

#tokenize for videos using the count_vect for toys 
train_toys_vid_ids = vect_toys.transform(temp_train_vid['reviewText'])
dev_toys_vid_ids = vect_toys.transform(temp_dev_vid['reviewText'])
print('videos dataset using toy count vectorizer, id shapes',train_toys_vid_ids.shape,dev_toys_vid_ids.shape)

#tokenize for autos using the count_vect for toys 
train_toys_aut_ids = vect_toys.transform(temp_train_aut['reviewText'])
dev_toys_aut_ids = vect_toys.transform(temp_dev_aut['reviewText'])
print('autos dataset using toy count vectorizer, id shapes',train_toys_aut_ids.shape,dev_toys_aut_ids.shape)

#tokenize for home and kitchen using the count_vect for toys 
train_toys_hnk_ids = vect_toys.transform(temp_train_hnk['reviewText'])
dev_toys_hnk_ids = vect_toys.transform(temp_dev_hnk['reviewText'])
print('autos dataset using toy count vectorizer, id shapes',train_toys_hnk_ids.shape,dev_toys_hnk_ids.shape)

#tokenize for toys using the count_vect for videos 
train_vid_toys_ids = vect_vid.transform(temp_train_toys['reviewText'])
dev_vid_toys_ids = vect_vid.transform(temp_dev_toys['reviewText'])
print('toys dataset using video count vectorizer, id shapes',train_vid_toys_ids.shape,dev_vid_toys_ids.shape)

#tokenize for autos using the count_vect for videos 
train_vid_aut_ids = vect_vid.transform(temp_train_aut['reviewText'])
dev_vid_aut_ids = vect_vid.transform(temp_dev_aut['reviewText'])
print('autos dataset using video count vectorizer, id shapes',train_vid_aut_ids.shape,dev_vid_aut_ids.shape)

#tokenize for toys using the count_vect for autos 
train_aut_toys_ids = vect_aut.transform(temp_train_toys['reviewText'])
dev_aut_toys_ids = vect_aut.transform(temp_dev_toys['reviewText'])
print('toys dataset using autos count vectorizer, id shapes',train_aut_toys_ids.shape,dev_aut_toys_ids.shape)

#tokenize for videos using the count_vect for autos 
train_aut_vid_ids = vect_aut.transform(temp_train_vid['reviewText'])
dev_aut_vid_ids = vect_aut.transform(temp_dev_vid['reviewText'])
print('videos dataset using autos count vectorizer, id shapes',train_aut_vid_ids.shape,dev_aut_vid_ids.shape)

videos dataset using toy count vectorizer, id shapes (100000, 63984) (30000, 63984)
autos dataset using toy count vectorizer, id shapes (100000, 63984) (30000, 63984)
autos dataset using toy count vectorizer, id shapes (100000, 63984) (30000, 63984)
toys dataset using video count vectorizer, id shapes (100000, 98899) (30000, 98899)
autos dataset using video count vectorizer, id shapes (100000, 98899) (30000, 98899)
toys dataset using autos count vectorizer, id shapes (100000, 59468) (30000, 59468)
videos dataset using autos count vectorizer, id shapes (100000, 59468) (30000, 59468)


In [18]:
tfidf_vect = TfidfVectorizer(min_df=5, stop_words='english')
tfidf_train_toys = tfidf_vect.fit_transform(temp_train_toys['reviewText'])
tfidf_dev_toys = tfidf_vect.transform(temp_dev_toys['reviewText'])
print(tfidf_train_toys.shape,tfidf_dev_toys.shape)

(100000, 24706) (30000, 24706)


0.8499


In [33]:
from sklearn.naive_bayes import MultinomialNB

#Naive bayes for binary prediction
# nb_b = MultinomialNB()
# nb_b.fit(train_x_csr,train_y_b)
# y_pred = nb_b.predict(dev_x_csr)

# acc = accuracy_score(dev_y_b, y_pred)
# print("Accuracy on dev set for binary prediction: {:.02%}".format(acc))
# print('classification report naive bayes binary classification \n',classification_report(dev_y_b, y_pred))

#Naive bayes for binary prediction with count_vectorizer for toys
print('Baseline prediction for toys using most common class: {:.02%}'.format(len(temp_dev_toys_y[temp_dev_toys_y==1])/len(temp_dev_toys_y)))
nb_toys_b = MultinomialNB()
nb_toys_b.fit(train_toys_ids,temp_train_toys_y)
y_pred_toys_b_cv = nb_toys_b.predict(dev_toys_ids)

acc = accuracy_score(temp_dev_toys_y, y_pred_toys_b_cv)
print("Accuracy on toys dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_toys_y, y_pred_toys_b_cv))

#Naive bayes for binary prediction with count_vectorizer for video games
print('Baseline prediction for video games using most common class:{:.02%}'.format(len(temp_dev_vid_y[temp_dev_vid_y==1])/len(temp_dev_vid_y)))
nb_vid_b = MultinomialNB()
nb_vid_b.fit(train_vid_ids,temp_train_vid_y)
y_pred_vid_b_cv = nb_vid_b.predict(dev_vid_ids)

acc = accuracy_score(temp_dev_vid_y, y_pred_vid_b_cv)
print("Accuracy on video games dev set for binary prediction with video games naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_vid_y, y_pred_vid_b_cv))

#Naive bayes for binary prediction with count_vectorizer for automobiles
print('Baseline prediction for autos using most common class:{:.02%}'.format(len(temp_dev_aut_y[temp_dev_aut_y==1])/len(temp_dev_aut_y)))
nb_aut_b = MultinomialNB()
nb_aut_b.fit(train_aut_ids,temp_train_aut_y)
y_pred_aut_b_cv = nb_aut_b.predict(dev_aut_ids)

acc = accuracy_score(temp_dev_aut_y, y_pred_aut_b_cv)
print("Accuracy on autos dev set for binary prediction with autos naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_aut_y, y_pred_aut_b_cv))

#Naive bayes for binary prediction with count_vectorizer for home and kitchen
print('Baseline prediction for home and kitchen using most common class:{:.02%}'.format(len(temp_dev_hnk_y[temp_dev_hnk_y==1])/len(temp_dev_hnk_y)))
nb_hnk_b = MultinomialNB()
nb_hnk_b.fit(train_hnk_ids,temp_train_hnk_y)
y_pred_hnk_b_cv = nb_hnk_b.predict(dev_hnk_ids)

acc = accuracy_score(temp_dev_hnk_y, y_pred_hnk_b_cv)
print("Accuracy on home and kitchen dev set for binary prediction with home and kitchen naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_hnk_y, y_pred_hnk_b_cv))

#Naive bayes for binary prediction with tfidf 
# nb_bt = MultinomialNB()
# nb_bt.fit(tfidf_train_toys,train_y_b)
# y_pred_bt = nb_bt.predict(tfidf_dev_toys)

# acc = accuracy_score(dev_y_b, y_pred_bt)
# print("Accuracy on dev set for binary prediction with tfidf: {:.02%}".format(acc))
# print('classification report naive bayes binary classification with tfidf \n',classification_report(dev_y_b, y_pred_bt))

#Naive bayes for 4 level rating prediction, excluding the 3s
# nb = MultinomialNB()
# nb.fit(train_x_csr,train_y)
# y_pred_mult = nb.predict(dev_x_csr)

# acc = accuracy_score(dev_y, y_pred_mult)
# print("Accuracy on dev set for 4 level (1,2,4,5) prediction: {:.02%}".format(acc))
# print('classification report naive bayes multinomial classification with tfidf \n',classification_report(dev_y, y_pred_mult))

Baseline prediction for toys using most common class: 84.99%
Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.23%
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.74      0.74      0.74      4503
        1.0       0.95      0.95      0.95     25497

avg / total       0.92      0.92      0.92     30000

Baseline prediction for video games using most common class:80.92%
Accuracy on video games dev set for binary prediction with video games naive bayes model: 89.16%
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5725
        1.0       0.93      0.93      0.93     24275

avg / total       0.89      0.89      0.89     30000

Baseline prediction for autos using most common class:85.59%
Accuracy on autos dev set for binary prediction with autos naive bayes model: 91.93%
Corresponding classification report         

In [19]:
#Transfer learning
#Naive bayes prediction of video games with toys model
y_pred_vidwithtoys_dev = nb_toys_b.predict(dev_toys_vid_ids)
acc = accuracy_score(temp_dev_vid_y, y_pred_vidwithtoys_dev)
print("Accuracy on video games dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_vid_y, y_pred_vidwithtoys_dev))

#Naive bayes prediction of autos games with toys model
y_pred_autwithtoys_dev = nb_toys_b.predict(dev_toys_aut_ids)
acc = accuracy_score(temp_dev_aut_y, y_pred_autwithtoys_dev)
print("Accuracy on automobiles dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_aut_y, y_pred_autwithtoys_dev))

#Naive bayes prediction of home and kitchen games with toys model
y_pred_hnkwithtoys_dev = nb_toys_b.predict(dev_toys_hnk_ids)
acc = accuracy_score(temp_dev_hnk_y, y_pred_hnkwithtoys_dev)
print("Accuracy on home and kitchen dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_hnk_y, y_pred_hnkwithtoys_dev))

#Naive bayes prediction of toys with videos model
y_pred_toyswithvid_dev = nb_vid_b.predict(dev_vid_toys_ids)
acc = accuracy_score(temp_dev_toys_y, y_pred_toyswithvid_dev)
print("Accuracy on toys dev set for binary prediction with video games naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_toys_y, y_pred_toyswithvid_dev))

#Naive bayes prediction of autos games with videos model
y_pred_autwithvid_dev = nb_vid_b.predict(dev_vid_aut_ids)
acc = accuracy_score(temp_dev_aut_y, y_pred_autwithvid_dev)
print("Accuracy on automobiles dev set for binary prediction with video games naive bayes model: {:.02%}".format(acc))
print('Corresponding classification report',classification_report(temp_dev_aut_y, y_pred_autwithvid_dev))


Accuracy on video games dev set for binary prediction with toys naive bayes model: 86.99%
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.66      0.65      0.66      5725
        1.0       0.92      0.92      0.92     24275

avg / total       0.87      0.87      0.87     30000

Accuracy on automobiles dev set for binary prediction with toys naive bayes model: 76.06%
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.36      0.88      0.51      4323
        1.0       0.97      0.74      0.84     25677

avg / total       0.88      0.76      0.79     30000

Accuracy on home and kitchen dev set for binary prediction with toys naive bayes model: 85.78%
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.55      0.85      0.67      5072
        1.0       0.97      0.86      0.91     24928

avg / total       0.90    

In [29]:
#Not updated for multi category
#Pulling out and printing most positive and negative features - binary prediction
linear_weights = nb_b.feature_log_prob_[1,] - nb_b.feature_log_prob_[0,]  # populate this with actual values
top_negative_features = np.argsort(linear_weights)[:20]
top_positive_features = np.argsort(linear_weights)[-20:]


print("Most negative features - binary prediction:")
for idx in top_negative_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
print("")
print("Most positive features: binary prediction:")
for idx in top_positive_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
    
#Pulling out and printing most positive and negative features - 4 way prediction
linear_weights = nb.feature_log_prob_[1,] - nb.feature_log_prob_[0,]  # populate this with actual values
top_negative_features = np.argsort(linear_weights)[:10]
top_positive_features = np.argsort(linear_weights)[-10:]


print("")
print("Most negative features - (1,2,4,5) prediction:")
for idx in top_negative_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))
print("")
print("Most positive features: (1,2,4,5) prediction:")
for idx in top_positive_features:
    print("  {:s} ({:.02f})".format(vocab.id_to_word[idx], 
                                    linear_weights[idx]))

NameError: name 'nb_b' is not defined

In [None]:
from sklearn import svm
# instantiate and train SVM model, kernel=rbf 

svm_model = svm.SVC()
start =time.time()

svm_model.fit(tfidf_train_toys,train_y_b)

# evaulate model
y_pred_svm = svm_model.predict(tfidf_dev_toys)

acc = accuracy_score(dev_y_b, y_pred_svm)
print("Accuracy on dev set for binary prediction: {:.02%}".format(acc))

#target_names = ['polarity 0', 'polarity 1']
print('classification report svm',classification_report(dev_y_b, y_pred_svm))
stop = time.time()
print('time taken for SVM', stop-start)

#Does not look like the negative class is correct. Is it because of highly unbalanced negative class.
#Lookup http://scikit-learn.org/stable/modules/svm.html#unbalanced-problems

### Keeping track of results from test runs
With number in train set = 10000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 88.74%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 67.16%
    Vocab Size : 38696
    
With number in train set = 50000 (excl 3 ratings)   
    Accuracy on dev set for binary prediction: 91.33%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 69.33% 
    Vocab Size : ~ ..
    
With number in train set = 100000 (excl 3 ratings)
    Accuracy on dev set for binary prediction: 91.56%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.42%
    Vocab Size : 105304

With number in train set = 500000, dev set = 150000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.73%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.95%
    vocab size 307822
    
With number in train set = 1200000, dev set = 360000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.92%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 71.24%
    vocab size 674074 (not repeated with correction for vocab)
    
### Output from trying different pre-processing with the toys review set.
 
 Accuracy on dev set for binary prediction: 91.69%
classification report naive bayes binary classification 
              precision    recall  f1-score   support

        0.0       0.70      0.77      0.74     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with count vectorizer: 91.92%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.71      0.79      0.75     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with tfidf: 90.13%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.90      0.38      0.54     22472
        1.0       0.90      0.99      0.94    127528

avg / total       0.90      0.90      0.88    150000

Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.91%
classification report naive bayes multinomial classification with tfidf 
              precision    recall  f1-score   support

          1       0.60      0.74      0.66     13975
          2       0.32      0.05      0.09      8497
          4       0.42      0.34      0.37     29733
          5       0.80      0.87      0.83     97795

avg / total       0.68      0.71      0.68    150000

### Output from simple ratings prediction with video games review set.

train set size : 10000, dev set size : 3000
Accuracy on dev set for binary prediction with count vectorizer: 88.93%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.77      0.54      0.64       534
        1.0       0.91      0.96      0.93      2466

avg / total       0.88      0.89      0.88      3000

Accuracy on dev set for binary prediction with tfidf: 84.93%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.95      0.16      0.28       534
        1.0       0.85      1.00      0.92      2466

avg / total       0.86      0.85      0.80      3000

Using SVM, with Count Vectorizer pre-processing:
Accuracy on dev set for binary prediction: 82.20%
classification report svm              precision    recall  f1-score   support

        0.0       0.00      0.00      0.00       534
        1.0       0.82      1.00      0.90      2466

avg / total       0.68      0.82      0.74      3000

time taken for SVM 48.42102265357971

Using SVM with TFIDF pre-processing:
Accuracy on dev set for binary prediction: 82.20%
classification report svm              precision    recall  f1-score   support

        0.0       0.00      0.00      0.00       534
        1.0       0.82      1.00      0.90      2466

avg / total       0.68      0.82      0.74      3000

train set size : 100000, dev set size : 30000
Accuracy on dev set for binary prediction with count vectorizer: 89.12%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5728
        1.0       0.93      0.93      0.93     24272

avg / total       0.89      0.89      0.89     30000

Accuracy on dev set for binary prediction with tfidf: 86.04%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.91      0.30      0.45      5728
        1.0       0.86      0.99      0.92     24272

avg / total       0.87      0.86      0.83     30000



### Results for transfer learning from toys to video games
number words in training corpus for toys: 63984
toys dataset id shapes (100000, 63984) (30000, 63984)
number words in training corpus for video games: 98899
videos dataset id shapes (100000, 98899) (30000, 98899)
number words in training corpus for automobiles: 59468
automobile dataset id shapes (100000, 59468) (30000, 59468)
number words in training corpus for home and kitchen: 57884
home and kitchen dataset id shapes (100000, 57884) (30000, 57884)

Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.23%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.74      0.74      0.74      4503
        1.0       0.95      0.95      0.95     25497

avg / total       0.92      0.92      0.92     30000

Accuracy on video games dev set for binary prediction with video games naive bayes model: 89.16%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5725
        1.0       0.93      0.93      0.93     24275

avg / total       0.89      0.89      0.89     30000

Accuracy on autos dev set for binary prediction with autos naive bayes model: 91.93%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.78      0.61      0.69      4323
        1.0       0.94      0.97      0.95     25677

avg / total       0.91      0.92      0.92     30000

Accuracy on home and kitchen dev set for binary prediction with home and kitchen naive bayes model: 91.37%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.71      0.73      5072
        1.0       0.94      0.96      0.95     24928

avg / total       0.91      0.91      0.91     30000

### Transfer learning:

Accuracy on video games dev set for binary prediction with toys naive bayes model: 86.99%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.66      0.65      0.66      5725
        1.0       0.92      0.92      0.92     24275

avg / total       0.87      0.87      0.87     30000

Accuracy on automobiles dev set for binary prediction with toys naive bayes model: 76.06%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.36      0.88      0.51      4323
        1.0       0.97      0.74      0.84     25677

avg / total       0.88      0.76      0.79     30000

Accuracy on home and kitchen dev set for binary prediction with toys naive bayes model: 85.78%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.55      0.85      0.67      5072
        1.0       0.97      0.86      0.91     24928

avg / total       0.90      0.86      0.87     30000

Accuracy on toys dev set for binary prediction with video games naive bayes model: 91.53%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.63      0.69      4503
        1.0       0.94      0.97      0.95     25497

avg / total       0.91      0.92      0.91     30000

Accuracy on automobiles dev set for binary prediction with video games naive bayes model: 80.50%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.41      0.77      0.53      4323
        1.0       0.96      0.81      0.88     25677

avg / total       0.88      0.81      0.83     30000