In [1]:
import pandas as pd
import gzip
import time
# Install a few python packages using pip
from common import utils
utils.require_package('nltk')
utils.require_package("wget")      # for fetching dataset
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB

In [2]:
# Standard python helper libraries.
from __future__ import print_function
from __future__ import division
import os, sys, time
import collections
import itertools

# Numerical manipulation libraries.
import numpy as np
from scipy import stats, optimize

import nltk
nltk.download('punkt')
from nltk import word_tokenize

# Helper libraries
from common import utils, vocabulary

[nltk_data] Downloading package punkt to /home/arunima/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
#Function to read the amazon review data files
def parse(path):
  print('start parse')
  start_parse = time.time()
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)
  end_parse = time.time()
  print('end parse with time for parse',end_parse - start_parse)

def getDF(path):
  print('start getDF')
  start = time.time()
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  print('end getDF')
  end = time.time()
  print('time taken to load data = ',end-start)
  return pd.DataFrame.from_dict(df, orient='index')
#df = getDF('reviews_Toys_and_Games.json.gz') #old def function corresponding to the step bt step vectorization

In [4]:
df_vid = getDF('reviews_Video_Games.json.gz')
df_toys = getDF('reviews_Toys_and_Games.json.gz')

start getDF
start parse
end parse with time for parse 81.08209609985352
end getDF
time taken to load data =  81.08266162872314
start getDF
start parse
end parse with time for parse 112.94878482818604
end getDF
time taken to load data =  112.94914507865906


In [5]:
df_aut = getDF('reviews_Automotive.json.gz')

start getDF
start parse
end parse with time for parse 65.77880191802979
end getDF
time taken to load data =  65.78030347824097


In [6]:
df_hnk = getDF('reviews_Home_and_Kitchen.json.gz')

start getDF
start parse
end parse with time for parse 225.31836080551147
end getDF
time taken to load data =  225.31877446174622


In [7]:
#Looking at a few exampls of the data.
print('\n Toys reviews summary')
print(df_toys.shape)
print(df_toys.columns)
print(df_toys.head(3))
print('\n Video games reviews summary')
print(df_vid.shape)
print(df_vid.columns)
print(df_vid.head(3))
print('\n Auto reviews summary')
print(df_aut.shape)
print(df_aut.columns)
print(df_aut.head(3))
print('\n Home and Kitchen reviews summary')
print(df_hnk.shape)
print(df_hnk.columns)
print(df_hnk.head(3))


 Toys reviews summary
(2252771, 9)
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
       reviewerID        asin   reviewerName helpful  \
0   AMEVO2LY6VEJA  0000191639  Nicole Soeder  [0, 0]   
1  A3C9CSW3TJITGT  0005069491          Renee  [0, 0]   
2  A31POTIYCKSZ9G  0076561046  So CA Teacher  [0, 0]   

                                          reviewText  overall  \
0  Great product, thank you! Our son loved the pu...      5.0   
1  I love these felt nursery rhyme characters and...      4.0   
2  I see no directions for its use. Therefore I h...      3.0   

                                      summary  unixReviewTime   reviewTime  
0                                     Puzzles      1388016000  12 26, 2013  
1  Charming characters but busy work required      1377561600  08 27, 2013  
2                    No directions for use...      1404864000   07 9, 2014  

 Video games re

In [8]:
#Count by ratings to determine skew in sample.
print('Ratings distribution for toys',df_toys.groupby('overall').count())
print('\n Ratings distribution for video games',df_vid.groupby('overall').count())
print('\n Ratings distribution for automobiles',df_aut.groupby('overall').count())
print('\n Ratings distribution for home and kitchen',df_hnk.groupby('overall').count())

Ratings distribution for toys          reviewerID     asin  reviewerName  helpful  reviewText  summary  \
overall                                                                    
1.0          192993   192993        192435   192993      192993   192993   
2.0          115801   115801        115416   115801      115801   115801   
3.0          193941   193941        193195   193941      193941   193941   
4.0          407884   407884        406255   407884      407884   407884   
5.0         1342152  1342152       1333623  1342152     1342152  1342152   

         unixReviewTime  reviewTime  
overall                              
1.0              192993      192993  
2.0              115801      115801  
3.0              193941      193941  
4.0              407884      407884  
5.0             1342152     1342152  

 Ratings distribution for video games          reviewerID    asin  reviewerName  helpful  reviewText  summary  \
overall                                                  

In [9]:
#Looking at a few examples of review text
print('Toys reviews examples\n')
for i in range(3):
    print(df_toys['reviewerID'].iloc[i])
    print(df_toys['reviewText'].iloc[i])

print('\n Video games reviews examples\n')
for i in range(3):
    print(df_vid['reviewerID'].iloc[i])
    print(df_vid['reviewText'].iloc[i])
    
print('\n Automobile reviews examples\n')
for i in range(3):
    print(df_aut['reviewerID'].iloc[i])
    print(df_aut['reviewText'].iloc[i])
    
print('\n Home and Kitchen reviews examples\n')
for i in range(3):
    print(df_hnk['reviewerID'].iloc[i])
    print(df_hnk['reviewText'].iloc[i])

Toys reviews examples

AMEVO2LY6VEJA
Great product, thank you! Our son loved the puzzles.  They have large pieces yet they are still challenging for a 4 year old.
A3C9CSW3TJITGT
I love these felt nursery rhyme characters and scenes.  The quality of the felt is good, and the illustrations are detailed and pretty.  As noted, the figures and scenes are printed on 2 large sheets of flannel and each individual item needs to be cut out.  This process took me 2 hours of tiny cutting.  To me it does not lend itself to a book form but rather laying out the scenes separately or for use on a flannel board.  However, I love the quiet play it offers for my toddler, and as a former Kindergarten teacher, I understand the value of learning rhyme and its connection to future reading.  Overall, delightful product with some work involved.
A31POTIYCKSZ9G
I see no directions for its use. Therefore I have to make up the games, unfortunately.

 Video games reviews examples

AB9S9279OZ3QO
I haven't gotten aro

In [10]:
#Get the count by unique product id, and % of products and reviews left if we limit to products with at least X reviews
def product_skew(df):
    tempcnt = df.groupby('asin').size().reset_index()
    cnt_total = tempcnt.count()[1]
    sum_total = tempcnt.iloc[:,1].sum()
    cnt_5 = tempcnt[tempcnt.iloc[:,1] > 5].count()[1]
    sum_5 = tempcnt[tempcnt.iloc[:,1] > 5].sum()[1]
    cnt_20 = tempcnt[tempcnt.iloc[:,1] > 20].count()[1]
    sum_20 = tempcnt[tempcnt.iloc[:,1] > 20].sum()[1]
    print('Total','\n\tCount of unique products:',cnt_total,'\n\tSum of their reviews',sum_total)
    print('Total with at least 5 reviews','\n\tCount of unique products:',cnt_5,'Percentage of total {0:.0f}%'.format(cnt_5*100/cnt_total))
    print('\tSum of their reviews',sum_5,'Percentage of total {0:.0f}%'.format(sum_5*100/sum_total))
    print('Total with at least 20 reviews','\n\tCount of unique products:',cnt_20,'Percentage of total {0:.0f}%'.format(cnt_20*100/cnt_total))
    print('\tSum of their reviews',sum_20,'Percentage of total {0:.0f}%'.format(sum_20*100/sum_total))
    return

print('Additional Summary metrics for Toy reviews')
product_skew(df_toys)

print('\nAdditional Summary metrics for Video Games reviews')
product_skew(df_vid)

print('\nAdditional Summary metrics for Auto reviews')
product_skew(df_aut)

print('\nAdditional Summary metrics for Home and Kitchen reviews')
product_skew(df_hnk)


Additional Summary metrics for Toy reviews
Total 
	Count of unique products: 327698 
	Sum of their reviews 2252771
Total with at least 5 reviews 
	Count of unique products: 68782 Percentage of total 21%
	Sum of their reviews 1775109 Percentage of total 79%
Total with at least 20 reviews 
	Count of unique products: 19992 Percentage of total 6%
	Sum of their reviews 1275698 Percentage of total 57%

Additional Summary metrics for Video Games reviews
Total 
	Count of unique products: 50210 
	Sum of their reviews 1324753
Total with at least 5 reviews 
	Count of unique products: 23866 Percentage of total 48%
	Sum of their reviews 1266698 Percentage of total 96%
Total with at least 20 reviews 
	Count of unique products: 10904 Percentage of total 22%
	Sum of their reviews 1124236 Percentage of total 85%

Additional Summary metrics for Auto reviews
Total 
	Count of unique products: 320112 
	Sum of their reviews 1373768
Total with at least 5 reviews 
	Count of unique products: 42052 Percentage o

In [7]:
#Create train,dev,test split
from sklearn.model_selection import train_test_split
train_toys,devtest = train_test_split(df_toys, test_size=0.4,random_state=42)
dev_toys,test_toys = train_test_split(devtest,test_size = 0.5,random_state=42)
print('Toy reviews train, dev and test set dataframe shape:',train_toys.shape,dev_toys.shape,test_toys.shape)

#For Video games reviews
train_vid,devtest = train_test_split(df_vid, test_size=0.4,random_state=42)
dev_vid,test_vid = train_test_split(devtest,test_size = 0.5,random_state=42)
print('Video games reviews train, dev and test set dataframe shape:',train_vid.shape,dev_vid.shape,test_vid.shape)

#For Auto reviews
train_aut,devtest = train_test_split(df_aut, test_size=0.4,random_state=42)
dev_aut,test_aut = train_test_split(devtest,test_size = 0.5,random_state=42)
print('Auto reviews train, dev and test set dataframe shape:',train_aut.shape,dev_aut.shape,test_aut.shape)

#For Home and Kitchen reviews
train_hnk,devtest = train_test_split(df_hnk, test_size=0.4,random_state=42)
dev_hnk,test_hnk = train_test_split(devtest,test_size = 0.5,random_state=42)
print('Home and Kitchen reviews train, dev and test set dataframe shape:',train_hnk.shape,dev_hnk.shape,test_hnk.shape)

Toy reviews train, dev and test set dataframe shape: (1351662, 9) (450554, 9) (450555, 9)
Video games reviews train, dev and test set dataframe shape: (794851, 9) (264951, 9) (264951, 9)
Auto reviews train, dev and test set dataframe shape: (824260, 9) (274754, 9) (274754, 9)
Home and Kitchen reviews train, dev and test set dataframe shape: (2552355, 9) (850785, 9) (850786, 9)


In [8]:
#Function to create a smaller sized train and dev data set. Enables testing accuracy for different sizes.
#Also binarizes the labels. Ratings of 1,2 set to 0; Ratings of 4,5 to 1.

def set_df_size(size,data_train,data_dev):
    size_train = size
    len_max_train = data_train[data_train.overall!=3].shape[0] #max possible length of train data set taking out the 3 ratings.
    #print("Number of reviews with ratings != 3 in train set",len_max_train)
    temp_size_train = min(len_max_train,size_train)

    len_max_dev = data_dev[data_dev.overall!=3].shape[0]
    #print("Number of reviews with ratings != 3 in dev set",len_max_dev)
    temp_size_dev = min(len_max_dev,int(0.3*temp_size_train)) #making the dev set about 0.3 times the train set.

    temp_train_data = data_train[data_train.overall != 3][:temp_size_train]
    #print('Size of train data',temp_train_data.shape)
    #print(temp_train_data.groupby('overall').count())
    #print(temp_train_toys[:5])

    temp_dev_data = data_dev[data_dev.overall!=3][:temp_size_dev]
    #print('Size of dev data',temp_dev_data.shape)
    #print(temp_dev_data.groupby('overall').count())
    #print(temp_dev_data[:2])
    
    #Binarize ratings
    temp_train_y = np.zeros(temp_size_train)
    temp_train_y[temp_train_data.overall > 3] = 1
    temp_dev_y = np.zeros(temp_size_dev)
    temp_dev_y[temp_dev_data.overall>3] = 1
    #print('binarized y shape',temp_train_y.shape,temp_dev_y.shape)
    #print(temp_dev_y[:20],data_dev.overall[:20])
    return temp_train_data,temp_dev_data,temp_train_y,temp_dev_y

In [9]:
list_df = ['toys','vid','aut','hnk'] #list of keys that refer to each dataframe. Adding a new dataframe would require updating this list
dict_train_df = {} #Dict to store train input data frame for each domain, can be accessed by using domain name as key
dict_dev_df = {} #Dict to store dev input data frame for each domain, can be accessed by using domain name as key
dict_train_y = {} #Dict to store binarized train data label for each domain
dict_dev_y = {} #Dict to store binarized dev data label for each domain
#print(len(dict_train_df))

def create_sized_data(size = 100000):
    size_train = size #Set size of train set here. This is a hyperparameter.
    key = list_df[0]
    #print('Toys reviews\n')
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_toys,dev_toys)
    #print('\n Video games reviews\n')
    key = list_df[1]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_vid,dev_vid)
    #print('\n Auto reviews\n')
    key = list_df[2]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_aut,dev_aut)
    #print('\n Home and Kitchen reviews\n')
    key = list_df[3]
    dict_train_df[key], dict_dev_df[key], dict_train_y[key], dict_dev_y[key] = set_df_size(size_train,train_hnk,dev_hnk)
    
create_sized_data()
#print(len(dict_train_df))

In [110]:
#Converting reviews to sparse matrix of word ids with count vectorizer, and using Naive Bayes to make the prediction.
#This section also creates the count_vectorizer and Naive Bayes models for each domain to be used to test transfer learning
dict_vectorizers = {} #Dict to store the count_vectorizer model developed on each domain
dict_train_ids = {} #Dict to store train data reviews as sparse matrix of word ids
dict_dev_ids = {} #Dict to store dev data reviews as sparse matrix of word ids
dict_nb = {} #Dict to store naive bayes model developed on each domain. Assumes input features are developed using the corresponding count_vectorizer
dict_dev_ypred = {} #Dict to store dev predictions

def create_base_NB_models():
    for key in list_df:
        #Converting ratings to tokenized word id counts as a sparse matrix using count_vectorizer
        dict_vectorizers[key] = CountVectorizer(min_df=2, stop_words='english')
        dict_train_ids[key] = dict_vectorizers[key].fit_transform(dict_train_df[key].reviewText)
        dict_dev_ids[key] = dict_vectorizers[key].transform(dict_dev_df[key].reviewText)
        print("Number words in training corpus for",key,len(dict_vectorizers[key].get_feature_names()))
        #print(key,'dataset id shapes',dict_train_ids[key].shape, dict_dev_ids[key].shape)

        #Building a Naive Bayes model to predict the ratings
        dict_nb[key] = MultinomialNB()
        dict_nb[key].fit(dict_train_ids[key],dict_train_y[key])
        dict_dev_ypred[key] = dict_nb[key].predict(dict_dev_ids[key])
        acc = accuracy_score(dict_dev_y[key], dict_dev_ypred[key])
        print("Accuracy on",key,"dev set for binary prediction with toys naive bayes model: {:.02%}".format(acc))

def print_base_NB_details():
    for key in list_df:
      print('Classification report for',key,'\n',classification_report(dict_dev_y[key], dict_dev_ypred[key]))  
        
create_base_NB_models()
print_base_NB_details()

Number words in training corpus for toys 31005
Accuracy on toys dev set for binary prediction with toys naive bayes model: 91.89%
Number words in training corpus for vid 44569
Accuracy on vid dev set for binary prediction with toys naive bayes model: 88.62%
Number words in training corpus for aut 27220
Accuracy on aut dev set for binary prediction with toys naive bayes model: 91.26%
Number words in training corpus for hnk 27825
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 90.95%
Classification report for toys 
              precision    recall  f1-score   support

        0.0       0.72      0.75      0.73      4465
        1.0       0.96      0.95      0.95     25535

avg / total       0.92      0.92      0.92     30000

Classification report for vid 
              precision    recall  f1-score   support

        0.0       0.69      0.71      0.70      5642
        1.0       0.93      0.93      0.93     24358

avg / total       0.89      0.89      0.89   

In [113]:
#Accuracy of transfer learning

dict_transfer_ids = {} #Dictionary to store the dev vector ids for dataframe A(df) using the count_vectorizer of dataframe B(vect)
transfer_results = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store accuracy on transfer. Col = Model, row = dataframe

def estimate_transfer_accuracy():
    for vectKey in list_df:
        dict_transfer_ids[vectKey] = {}
        #print('vectKey',vectKey)
        for dfKey in list_df:
            #print('dfKey',dfKey)
            dict_transfer_ids[vectKey][dfKey] = dict_vectorizers[vectKey].transform(dict_dev_df[dfKey].reviewText)
            #print(dfKey,'dataset using ',vectKey,' count vectorizer, id shapes',dict_transfer_ids[vectKey][dfKey].shape)
            dict_dev_ypred = dict_nb[vectKey].predict(dict_transfer_ids[vectKey][dfKey])
            acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
            #print("Accuracy on ",dfKey," dev set for binary prediction with ", vectKey," naive bayes model: {:.02%}".format(acc))
            transfer_results[vectKey][dfKey] = acc

    print("Effectiveness of transfer learning with Naive Bayes:")
    print("Accuracy of rating predictions")
    print("Colums = source domain, Rows = target domain\n")
    print(transfer_results.to_string(float_format = '{:.01%}'.format))

estimate_transfer_accuracy()

Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 91.9% 90.9% 90.3% 90.9%
vid  86.4% 88.6% 87.1% 86.9%
aut  74.6% 78.4% 91.3% 82.2%
hnk  84.0% 84.6% 90.5% 90.9%


In [10]:
#Accuracy of transfer learning - updating to use countvectorizer developed on both dataframes instead of just the source domain.

dict_transfer_vect = {} ##Dictionary to store countvectorizer for two dfs combined.
dict_transfer_train_ids = {} ##Dictionary to store train ids using countvectorizer for two dfs combined.
dict_transfer_dev_ids = {} ## Dictionary to store dev ids using countvectorizer for two dfs combined.
transfer_results = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store accuracy on transfer. Col = Model, row = dataframe

for vectKey in list_df:
    dict_transfer_vect[vectKey] = {}
    dict_transfer_train_ids[vectKey] = {}
    dict_transfer_dev_ids[vectKey] = {}

def estimate_transfer_accuracy():
    #First create the countvectorizer for the two dfs together, then create the train and dev ids for both dfs using that.
    for vectKey in list_df:
        #print('vectKey',vectKey)
        for dfKey in list_df:
            if list_df.index(dfKey) > list_df.index(vectKey): 
                
                #Create combined dataframe of reviewText from both domains
                temp_two_df_reviews = pd.concat([dict_train_df[vectKey].reviewText,dict_train_df[dfKey].reviewText])
                print('combined df shape for',vectKey,dfKey,temp_two_df_reviews.shape)
                
                #create countVectorizer on combined dataframe of reviewText from both domains
                dict_transfer_vect[vectKey][dfKey] = CountVectorizer(min_df=2, stop_words='english')
                dict_transfer_vect[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].fit(temp_two_df_reviews)
                print("Number words in training corpus for keys",vectKey,dfKey,len(dict_transfer_vect[vectKey][dfKey].get_feature_names()))
                
                #create id vectors of reviews for each df, train and dev set, using combined countVectorizer
                dict_transfer_train_ids[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_train_df[vectKey].reviewText)
                dict_transfer_train_ids[dfKey][vectKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_train_df[dfKey].reviewText)
                dict_transfer_dev_ids[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[vectKey].reviewText)
                dict_transfer_dev_ids[dfKey][vectKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[dfKey].reviewText)
                
                #using vectKey as source, and dfkey as target
                source_modelVect = MultinomialNB()
                source_modelVect.fit(dict_transfer_train_ids[vectKey][dfKey],dict_train_y[vectKey])
                dict_dev_ypred = source_modelVect.predict(dict_transfer_dev_ids[dfKey][vectKey])
                acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
                transfer_results[vectKey][dfKey] = acc
                
                #using dfKey as source, and Vectkey as target
                source_modeldf = MultinomialNB()
                source_modeldf.fit(dict_transfer_train_ids[dfKey][vectKey],dict_train_y[dfKey])
                dict_dev_ypred = source_modeldf.predict(dict_transfer_dev_ids[vectKey][dfKey])
                acc = accuracy_score(dict_dev_y[vectKey], dict_dev_ypred)
                #print("Accuracy on ",dfKey," dev set for binary prediction with ", vectKey," naive bayes model: {:.02%}".format(acc))
                transfer_results[dfKey][vectKey] = acc

    print("Effectiveness of transfer learning with Naive Bayes:")
    print("Accuracy of rating predictions")
    print("Colums = source domain, Rows = target domain\n")
    print(transfer_results.to_string(float_format = '{:.01%}'.format))

estimate_transfer_accuracy()

combined df shape for toys vid (200000,)
Number words in training corpus for keys toys vid 56195
combined df shape for toys aut (200000,)
Number words in training corpus for keys toys aut 43529
combined df shape for toys hnk (200000,)
Number words in training corpus for keys toys hnk 43129
combined df shape for vid aut (200000,)
Number words in training corpus for keys vid aut 55983
combined df shape for vid hnk (200000,)
Number words in training corpus for keys vid hnk 55831
combined df shape for aut hnk (200000,)
Number words in training corpus for keys aut hnk 40614
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys   NaN 90.1% 87.5% 89.2%
vid  86.0%   NaN 81.1% 82.6%
aut  71.6% 74.0%   NaN 78.7%
hnk  81.5% 79.6% 88.4%   NaN


In [107]:
#Calculating and displaying as transfer loss
transfer_loss = pd.DataFrame(index=list_df,columns=list_df) #Dataframe to store loss in accuracy on transfer. Col = Model, row = dataframe
def estimate_transfer_loss():
    for A in list_df:
        for B in list_df:
            transfer_loss[A][B] = transfer_results[B][B] - transfer_results[A][B]
    print("Transfer loss on rating predictions")
    print("Colums = source domain, Rows = target domain\n")
    print(transfer_loss.to_string(float_format = '{:.01%}'.format))

estimate_transfer_loss()

Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid  aut   hnk
toys  0.0%  1.6% 3.0%  2.1%
vid   3.5%  0.0% 6.3%  5.8%
aut  22.7% 19.8% 0.0% 12.3%
hnk  11.4% 12.1% 1.9%  0.0%


In [73]:
#for size in (50000,100000,250000,500000,1000000):
for size in (5000,100000):
    print("\n Train data_set size =",size)
    create_sized_data(size = size)
    create_base_NB_models()
    estimate_transfer_accuracy()
    estimate_transfer_loss()


 Train data_set size = 5000
Number words in training corpus for toys 7133
Accuracy on toys dev set for binary prediction with toys naive bayes model: 90.53%
Number words in training corpus for vid 10570
Accuracy on vid dev set for binary prediction with toys naive bayes model: 88.87%
Number words in training corpus for aut 6610
Accuracy on aut dev set for binary prediction with toys naive bayes model: 91.40%
Number words in training corpus for hnk 7255
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 89.80%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 90.5% 88.1% 89.9% 90.0%
vid  86.1% 88.9% 86.1% 86.4%
aut  78.5% 82.5% 91.4% 88.8%
hnk  86.0% 87.4% 88.5% 89.8%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys  vid  aut  hnk
toys  0.0% 2.5% 0.7% 0.5%
vid   2.7% 0.0% 2.8% 2.5%
aut  12.9% 8.9% 0.0% 2.6%

In [12]:
#Create a function to calculate JS Divergence using two discrete distributions.
from scipy.stats import entropy
from scipy import spatial
#from scipy.sparse.linalg import norm
from numpy.linalg import norm

def JSD(P, Q):
   _P = P / norm(P, ord=1)
   _Q = Q / norm(Q, ord=1)
   _M = 0.5 * (_P + _Q)
   return 0.5 * (entropy(_P, _M) + entropy(_Q, _M))

In [105]:
#Create a vocabulary on the reviewText of all dataframes for the sake of comparing their distributions on the same baseline.
all_df_reviews = pd.DataFrame(columns = dict_train_df[list_df[0]].columns)
for key in list_df:
    #print(dict_train_df[key].shape)
    all_df_reviews = pd.concat([dict_train_df[key],all_df_reviews])
print(all_df_reviews.shape)
#print(type(all_df_reviews))
#print(all_df_reviews.columns)

all_vectorizer = CountVectorizer(min_df=5, max_df=0.8, stop_words='english')
all_ids = all_vectorizer.fit_transform(all_df_reviews.reviewText)
print("Number words in training corpus",len(all_vectorizer.get_feature_names()))

#Create a word if distribution of each df on the integrated vocabulary ids.
dict_allVocab_ids = {}
for key in list_df:
    dict_allVocab_ids[key] = all_vectorizer.transform(dict_train_df[key].reviewText)
    print(key,dict_allVocab_ids[key].shape)

(400000, 9)
Number words in training corpus 41262
toys (100000, 41262)
vid (100000, 41262)
aut (100000, 41262)
hnk (100000, 41262)


In [106]:
JSD_results = pd.DataFrame(index=list_df,columns=list_df)
cosine_results = pd.DataFrame(index=list_df,columns=list_df)
for key1 in list_df:
   for key2 in list_df:
       dict_train_ids_1 = dict_allVocab_ids[key1].sum(axis=0).T
       dict_train_ids_2 = dict_allVocab_ids[key2].sum(axis=0).T
       #print(dict_allVocab_ids[key1].shape,dict_train_ids_1.shape,dict_train_ids_2.shape)
       JSD_results[key1][key2] = JSD(dict_train_ids_1,dict_train_ids_2)
       cosine_results[key1][key2] = spatial.distance.cosine(dict_train_ids_1,dict_train_ids_2)
       
print('JS Divergence')
print(JSD_results)
print('\nCosine Distance')
print(cosine_results)

JS Divergence
                  toys               vid               aut               hnk
toys             [0.0]  [0.121900206869]  [0.149253766723]  [0.129181255531]
vid   [0.121900206869]             [0.0]  [0.197747372379]  [0.200978538535]
aut   [0.149253766723]  [0.197747372379]             [0.0]  [0.118183647432]
hnk   [0.129181255531]  [0.200978538535]  [0.118183647432]             [0.0]

Cosine Distance
          toys       vid          aut          hnk
toys         0  0.345511     0.287423     0.236629
vid   0.345511         0     0.547328     0.526993
aut   0.287423  0.547328  2.22045e-16     0.144065
hnk   0.236629  0.526993     0.144065 -2.22045e-16


In [12]:
#Calculating similarity using countVectorizer of two dfs together rather than all 4.
JSD_results = pd.DataFrame(index=list_df,columns=list_df)
cosine_results = pd.DataFrame(index=list_df,columns=list_df)
for key1 in list_df:
    for key2 in list_df:
        if list_df.index(key1)!= list_df.index(key2):
            #print(key1,key2)
            dict_train_ids_1 = dict_transfer_train_ids[key1][key2].sum(axis=0).T
            dict_dev_ids_2 = dict_transfer_dev_ids[key2][key1].sum(axis=0).T
            #print(dict_allVocab_ids[key1].shape,dict_train_ids_1.shape,dict_train_ids_2.shape)
            JSD_results[key1][key2] = JSD(dict_train_ids_1,dict_dev_ids_2)
            cosine_results[key1][key2] = spatial.distance.cosine(dict_train_ids_1,dict_dev_ids_2)
       
print('JS Divergence')
print(JSD_results)
print('\nCosine Distance')
print(cosine_results)


JS Divergence
                  toys               vid               aut               hnk
toys               NaN  [0.128890147874]  [0.153422214856]  [0.132789423959]
vid   [0.126816613629]               NaN  [0.202562442184]  [0.205457618486]
aut   [0.154323392071]  [0.203572077456]               NaN  [0.122984089333]
hnk   [0.134259265546]  [0.207322916085]  [0.122498015676]               NaN

Cosine Distance
          toys       vid       aut       hnk
toys       NaN  0.348899  0.285166  0.232095
vid    0.34513       NaN  0.547097  0.526382
aut   0.289583  0.548537       NaN  0.145872
hnk   0.236801  0.526535  0.143537       NaN


In [35]:
#calculating cosine similarity for each individual review.
from scipy.sparse import csr_matrix

key1 = 'toys'
key2 = 'aut'
source_ids = dict_transfer_train_ids[key1][key2].sum(axis=0).T
cosine_distance_train = np.ones((len(dict_train_y[key2])))
cosine_distance_dev = np.ones((len(dict_dev_y[key2])))
print(source_ids.shape,cosine_distance_train.shape,cosine_distance_dev.shape)
print(norm(source_ids))
n = norm(source_ids)

count_zero_norm = 0
for i in range(len(dict_train_y[key2])):
    y = dict_transfer_train_ids[key2][key1][i].T.toarray()
    if norm(y) == 0:
        cosine_distance_train[i] = 1
        count_zero_norm += 1
    else:
        cosine_distance_train[i] = spatial.distance.cosine(source_ids,y)
print(max(cosine_distance_train),min(cosine_distance_train),cosine_distance_train.shape)
print('number of reviews with zero norm = ',count_zero_norm)

y = np.histogram(cosine_distance_train,bins=20, normed=False)
print(len(y),y)

for i in range(len(dict_dev_y[key2])):
    z = dict_transfer_dev_ids[key2][key1][i].T.toarray()
    if norm(z) == 0:
        cosine_distance_dev[i] = 1
    else:
        cosine_distance_dev[i] = spatial.distance.cosine(source_ids,z)
        
print(max(cosine_distance_dev),min(cosine_distance_dev))

(43529, 1) (100000,) (30000,)
111165.904643
1.0 0.592696517226 (100000,)
number of reviews with zero norm =  41
2 (array([    2,     9,    21,    64,   194,   455,   857,  1661,  2772,
        4521,  6692,  8950, 11210, 12703, 13418, 12543, 10782,  7831,
        4189,  1126]), array([ 0.59269652,  0.61306169,  0.63342687,  0.65379204,  0.67415721,
        0.69452239,  0.71488756,  0.73525274,  0.75561791,  0.77598308,
        0.79634826,  0.81671343,  0.83707861,  0.85744378,  0.87780896,
        0.89817413,  0.9185393 ,  0.93890448,  0.95926965,  0.97963483,  1.        ]))
1.0 0.605343760681


In [None]:
y = np.histogram(cosine_distance_train,bins=20, normed=False)
print(len(y),y)

In [36]:
#calculating accuracy by distance buckets
from sklearn.metrics import roc_auc_score

source_modelVect = MultinomialNB()
source_modelVect.fit(dict_transfer_train_ids[key1][key2],dict_train_y[key1])
dict_dev_ypred = source_modelVect.predict(dict_transfer_dev_ids[key2][key1])
dict_dev_ypred_proba = source_modelVect.predict_proba(dict_transfer_dev_ids[key2][key1])[:,1]
#print(dict_dev_ypred_proba[:5])
dev_y_actual = dict_dev_y[key2]
#print(dict_dev_ypred[:5])
acc = accuracy_score(dict_dev_y[key2], dict_dev_ypred)
print('Overall transfer accuracy',acc)
auc = roc_auc_score(dict_dev_y[key2], dict_dev_ypred_proba, average = 'weighted')
print('Overall AUC',auc)

acc_by_similarity = np.zeros((20))
auc_by_sim = np.zeros((20))
percent_pos_class = np.zeros((20))
span = 0.05

for i in range(20):
    acc_by_similarity[i] = accuracy_score(dev_y_actual[(cosine_distance_dev > span*i) & (cosine_distance_dev < span*(i+1))], 
                                          dict_dev_ypred[(cosine_distance_dev > span*i) & (cosine_distance_dev < span*(i+1))])
    dev_y_selected = dev_y_actual[(cosine_distance_dev > span*i) & (cosine_distance_dev < span*(i+1))]
    percent_pos_class[i] = np.sum(dev_y_selected)/len(dev_y_selected)
    if percent_pos_class[i] < 0.99:
        auc_by_sim[i] = roc_auc_score(dev_y_actual[(cosine_distance_dev > span*i) & (cosine_distance_dev < span*(i+1))], 
                                    dict_dev_ypred_proba[(cosine_distance_dev > span*i) & (cosine_distance_dev < span*(i+1))],
                                     average = 'weighted')
    percent_pos_class[i] = np.sum(dev_y_selected)/len(dev_y_selected)
    print('Buckets of cosine distance %0.1f to %0.1f, transfer accuracy: %0.2f'%(span*(i),span*(i+1),
            acc_by_similarity[i]),'auc',auc_by_sim[i],'# reviews',len(dev_y_selected), '% pos',percent_pos_class[i])


Overall transfer accuracy 0.7159
Overall AUC 0.846190231547
Buckets of cosine distance 0.0 to 0.1, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.1 to 0.1, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.1 to 0.2, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.2 to 0.2, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.2 to 0.2, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.2 to 0.3, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.3 to 0.4, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.4 to 0.4, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.4 to 0.5, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine distance 0.5 to 0.5, transfer accuracy: nan auc 0.0 # reviews 0 % pos nan
Buckets of cosine di

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


In [27]:
 #Improvement in accuracy with adding increasing random sample from the target domain to the source domain
    
vectKey = 'toys'
dfKey = 'aut'
size_list = [0,5000,10000,25000,40000,70000,100000]

for size in size_list: 
    
    #pick out samples from target domain
    df_to_add = dict_train_df[dfKey][:size]
    labels_to_add = dict_train_y[dfKey][:size]
    
    #Create combined dataframe of reviewText from both domains
    temp_two_df_reviews = pd.concat([dict_train_df[vectKey].reviewText,df_to_add.reviewText])
    temp_two_labels = np.concatenate([dict_train_y[vectKey],labels_to_add])
    print('combined df shape for',vectKey,dfKey,temp_two_df_reviews.shape, temp_two_labels.shape)
                
    #create countVectorizer on combined dataframe of reviewText from both domains
    dict_transfer_vect[vectKey][dfKey] = CountVectorizer(min_df=2, stop_words='english')
    dict_transfer_vect[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].fit(temp_two_df_reviews)
    print("Number words in training corpus for keys",vectKey,dfKey,len(dict_transfer_vect[vectKey][dfKey].get_feature_names()))
                
    #create id vectors of reviews for each df, train and dev set, using combined countVectorizer
    dict_transfer_train = dict_transfer_vect[vectKey][dfKey].transform(temp_two_df_reviews)
    dict_transfer_dev_ids[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[vectKey].reviewText)
    dict_transfer_dev_ids[dfKey][vectKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[dfKey].reviewText)
                
    #using vectKey as source, and dfkey as target
    source_modelVect = MultinomialNB()
    source_modelVect.fit(dict_transfer_train,temp_two_labels)
    dict_dev_ypred = source_modelVect.predict(dict_transfer_dev_ids[dfKey][vectKey])
    acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
    print('for size = %d, accuracy = %0.3f'%(size,acc))

combined df shape for toys aut (100000,) (100000,)
Number words in training corpus for keys toys aut 31005
for size = 0, accuracy = 0.746
combined df shape for toys aut (105000,) (105000,)
Number words in training corpus for keys toys aut 32140
for size = 5000, accuracy = 0.793
combined df shape for toys aut (110000,) (110000,)
Number words in training corpus for keys toys aut 33106
for size = 10000, accuracy = 0.819
combined df shape for toys aut (125000,) (125000,)
Number words in training corpus for keys toys aut 35295
for size = 25000, accuracy = 0.856
combined df shape for toys aut (140000,) (140000,)
Number words in training corpus for keys toys aut 37254
for size = 40000, accuracy = 0.874
combined df shape for toys aut (170000,) (170000,)
Number words in training corpus for keys toys aut 40549
for size = 70000, accuracy = 0.890
combined df shape for toys aut (200000,) (200000,)
Number words in training corpus for keys toys aut 43529
for size = 100000, accuracy = 0.897


In [23]:
# temp code to figure and confirm sorting of arrays and dataframes
arr1 = np.array([2,1,4,3])
arr2 = np.array([4,2,8,6])
print(arr2)
df = dict_train_df['aut'][:4]
print('pre sort df',df)
sort_id = np.argsort(arr1)
print('sort indices',sort_id)
arr2 = arr2[sort_id]
print('sorted array',arr2)
df = df.iloc([sort_id])
df1 = df[sort_id]
#print(df[sort_id])
print('sorted dataframe',df1)
print('last obs',df1[-2:])

[4 2 8 6]
pre sort df              reviewerID        asin                             reviewerName  \
496617   A3KLNIO5LSJUFX  B001BQISSS                                zephyrous   
1200153   A5AMO3KTY3QQR  B008OEQ6WU  M. Chase "Film,Theatre, Products Used."   
409816   A3UXW18DP4WSD6  B000VZJH6W                        Richard "Richard"   
886302   A1GSIW3K44CAWW  B004DRV5GY                                jeffs4589   

        helpful                                         reviewText  overall  \
496617   [0, 0]  This ashtray looks smaller in person than it d...      4.0   
1200153  [1, 1]  Easy to install this air filter does what it s...      5.0   
409816   [0, 0]  I need to check to price at the auto stores to...      5.0   
886302   [0, 0]  THE OLD LIGHTS WERE FOGGED OVER WITH THE NEW O...      5.0   

                     summary  unixReviewTime   reviewTime  
496617     Simple and cheap.      1398988800   05 2, 2014  
1200153           Air Filter      1370217600   06 3, 2013  
4

In [37]:
 #Improvement in accuracy with adding an actively selected sample of different sizes from the target domain to the source domain
    
vectKey = 'toys'
dfKey = 'aut'
size_list = [5000,10000,25000,40000,70000,100000]

sort_ids = np.argsort(cosine_distance_train)
cosine_distance_sorted = cosine_distance_train[sort_ids]
print(sort_ids)
df_target_ids_pre = dict_train_df[dfKey]
df_target_labels_pre = dict_train_y[dfKey]
print('pre sort',df_target_ids_pre.reviewText[-5:],df_target_labels_pre[-20:])
print(type(df_target_labels_pre))
df_target_ids_pre = df_target_ids_pre.iloc([sort_ids])
df_target_ids = df_target_ids_pre[sort_ids]
df_target_labels = df_target_labels_pre[sort_ids]
print('\n Post sort',df_target_ids.reviewText[-5:],df_target_labels[-20:],'first 20',cosine_distance_sorted[:20],cosine_distance_sorted[-20:])

for size in size_list: 
    
    #pick out samples from target domain
    df_to_add = df_target_ids[:size]
    labels_to_add = df_target_labels[:size]
    
    #Create combined dataframe of reviewText from both domains
    temp_two_df_reviews = pd.concat([dict_train_df[vectKey].reviewText,df_to_add.reviewText])
    temp_two_labels = np.concatenate([dict_train_y[vectKey],labels_to_add])
    print('combined df shape for',vectKey,dfKey,temp_two_df_reviews.shape, temp_two_labels.shape)
                
    #create countVectorizer on combined dataframe of reviewText from both domains
    dict_transfer_vect[vectKey][dfKey] = CountVectorizer(min_df=2, stop_words='english')
    dict_transfer_vect[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].fit(temp_two_df_reviews)
    print("Number words in training corpus for keys",vectKey,dfKey,len(dict_transfer_vect[vectKey][dfKey].get_feature_names()))
                
    #create id vectors of reviews for each df, train and dev set, using combined countVectorizer
    dict_transfer_train = dict_transfer_vect[vectKey][dfKey].transform(temp_two_df_reviews)
    dict_transfer_dev_ids[vectKey][dfKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[vectKey].reviewText)
    dict_transfer_dev_ids[dfKey][vectKey] = dict_transfer_vect[vectKey][dfKey].transform(dict_dev_df[dfKey].reviewText)
                
    #using vectKey as source, and dfkey as target
    source_modelVect = MultinomialNB()
    source_modelVect.fit(dict_transfer_train,temp_two_labels)
    dict_dev_ypred = source_modelVect.predict(dict_transfer_dev_ids[dfKey][vectKey])
    acc = accuracy_score(dict_dev_y[dfKey], dict_dev_ypred)
    print('for size = %d, accuracy = %0.3f'%(size,acc))

[83897 87459  6515 ..., 68601  2674 88133]
pre sort 496971    What a beautiful jacket, excellant quality , u...
154362    Fel Pro OEM quality at rock-bottom price deliv...
781005    I wasn't sure how well these wipes would clean...
877282    I bought a pair of aftermarket HID headlights ...
542143    Nice, but I misread and thought they were for ...
Name: reviewText, dtype: object [ 1.  1.  1.  1.  0.  1.  1.  0.  1.  1.  0.  1.  1.  0.  0.  1.  1.  1.
  1.  1.]
<class 'numpy.ndarray'>

 Post sort 1066682                   gooooood
679235         Its all about the U
835032                            
749000     This is not an ashtray.
737700                            
Name: reviewText, dtype: object [ 1.  1.  1.  1.  1.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  0.  1.] first 20 [ 0.59269652  0.6059804   0.61797067  0.62121062  0.6236347   0.62776251
  0.62796823  0.6283241   0.62888674  0.62991426  0.63153037  0.63427282
  0.63466891  0.64026683  0.64058714  0.64162127  0.

In [33]:
print('\n Post sort',df_target_ids.reviewText[-5:],df_target_labels[-20:],cosine_distance_sorted[:5],cosine_distance_sorted[:100])


 Post sort 606930                                            ghetto!!!
756908    must do for mk4 jetta must do for mk4 jetta mu...
95674     &#4315;&#4304;&#4306;&#4304;&#4320;&#4312;&#43...
749000                              This is not an ashtray.
510840                                              Perfict
Name: reviewText, dtype: object [ 0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  1.  1.
  0.  1.] [ 0.  0.  0.  0.  0.] [ 0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.59269652  0.6059804   0.61797067  0.62121062  0.6236347   0.62776251
  0.62796823  0.6283241   0.

### Keeping track of results from test runs
With number in train set = 10000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 88.74%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 67.16%
    Vocab Size : 38696
    
With number in train set = 50000 (excl 3 ratings)   
    Accuracy on dev set for binary prediction: 91.33%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 69.33% 
    Vocab Size : ~ ..
    
With number in train set = 100000 (excl 3 ratings)
    Accuracy on dev set for binary prediction: 91.56%   
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.42%
    Vocab Size : 105304

With number in train set = 500000, dev set = 150000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.73%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.95%
    vocab size 307822
    
With number in train set = 1200000, dev set = 360000 (excl 3 ratings)    
    Accuracy on dev set for binary prediction: 91.92%
    Accuracy on dev set for 4 level (1,2,4,5) prediction: 71.24%
    vocab size 674074 (not repeated with correction for vocab)
    
### Output from trying different pre-processing with the toys review set.
 
 Accuracy on dev set for binary prediction: 91.69%
classification report naive bayes binary classification 
              precision    recall  f1-score   support

        0.0       0.70      0.77      0.74     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with count vectorizer: 91.92%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.71      0.79      0.75     22472
        1.0       0.96      0.94      0.95    127528

avg / total       0.92      0.92      0.92    150000

Accuracy on dev set for binary prediction with tfidf: 90.13%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.90      0.38      0.54     22472
        1.0       0.90      0.99      0.94    127528

avg / total       0.90      0.90      0.88    150000

Accuracy on dev set for 4 level (1,2,4,5) prediction: 70.91%
classification report naive bayes multinomial classification with tfidf 
              precision    recall  f1-score   support

          1       0.60      0.74      0.66     13975
          2       0.32      0.05      0.09      8497
          4       0.42      0.34      0.37     29733
          5       0.80      0.87      0.83     97795

avg / total       0.68      0.71      0.68    150000

### Output from simple ratings prediction with video games review set.

train set size : 10000, dev set size : 3000
Accuracy on dev set for binary prediction with count vectorizer: 88.93%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.77      0.54      0.64       534
        1.0       0.91      0.96      0.93      2466

avg / total       0.88      0.89      0.88      3000

Accuracy on dev set for binary prediction with tfidf: 84.93%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.95      0.16      0.28       534
        1.0       0.85      1.00      0.92      2466

avg / total       0.86      0.85      0.80      3000

Using SVM, with Count Vectorizer pre-processing:
Accuracy on dev set for binary prediction: 82.20%
classification report svm              precision    recall  f1-score   support

        0.0       0.00      0.00      0.00       534
        1.0       0.82      1.00      0.90      2466

avg / total       0.68      0.82      0.74      3000

time taken for SVM 48.42102265357971

Using SVM with TFIDF pre-processing:
Accuracy on dev set for binary prediction: 82.20%
classification report svm              precision    recall  f1-score   support

        0.0       0.00      0.00      0.00       534
        1.0       0.82      1.00      0.90      2466

avg / total       0.68      0.82      0.74      3000

train set size : 100000, dev set size : 30000
Accuracy on dev set for binary prediction with count vectorizer: 89.12%
classification report naive bayes binary classification with count vectorizer 
              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5728
        1.0       0.93      0.93      0.93     24272

avg / total       0.89      0.89      0.89     30000

Accuracy on dev set for binary prediction with tfidf: 86.04%
classification report naive bayes binary classification with tfidf 
              precision    recall  f1-score   support

        0.0       0.91      0.30      0.45      5728
        1.0       0.86      0.99      0.92     24272

avg / total       0.87      0.86      0.83     30000



### Results for transfer learning from toys to video games
number words in training corpus for toys: 63984
toys dataset id shapes (100000, 63984) (30000, 63984)
number words in training corpus for video games: 98899
videos dataset id shapes (100000, 98899) (30000, 98899)
number words in training corpus for automobiles: 59468
automobile dataset id shapes (100000, 59468) (30000, 59468)
number words in training corpus for home and kitchen: 57884
home and kitchen dataset id shapes (100000, 57884) (30000, 57884)

Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.23%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.74      0.74      0.74      4503
        1.0       0.95      0.95      0.95     25497

avg / total       0.92      0.92      0.92     30000

Accuracy on video games dev set for binary prediction with video games naive bayes model: 89.16%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.72      0.71      0.71      5725
        1.0       0.93      0.93      0.93     24275

avg / total       0.89      0.89      0.89     30000

Accuracy on autos dev set for binary prediction with autos naive bayes model: 91.93%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.78      0.61      0.69      4323
        1.0       0.94      0.97      0.95     25677

avg / total       0.91      0.92      0.92     30000

Accuracy on home and kitchen dev set for binary prediction with home and kitchen naive bayes model: 91.37%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.71      0.73      5072
        1.0       0.94      0.96      0.95     24928

avg / total       0.91      0.91      0.91     30000

### Transfer learning:

Accuracy on video games dev set for binary prediction with toys naive bayes model: 86.99%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.66      0.65      0.66      5725
        1.0       0.92      0.92      0.92     24275

avg / total       0.87      0.87      0.87     30000

Accuracy on automobiles dev set for binary prediction with toys naive bayes model: 76.06%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.36      0.88      0.51      4323
        1.0       0.97      0.74      0.84     25677

avg / total       0.88      0.76      0.79     30000

Accuracy on home and kitchen dev set for binary prediction with toys naive bayes model: 85.78%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.55      0.85      0.67      5072
        1.0       0.97      0.86      0.91     24928

avg / total       0.90      0.86      0.87     30000

Accuracy on toys dev set for binary prediction with video games naive bayes model: 91.53%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.76      0.63      0.69      4503
        1.0       0.94      0.97      0.95     25497

avg / total       0.91      0.92      0.91     30000

Accuracy on automobiles dev set for binary prediction with video games naive bayes model: 80.50%   
Corresponding classification report              precision    recall  f1-score   support

        0.0       0.41      0.77      0.53      4323
        1.0       0.96      0.81      0.88     25677

avg / total       0.88      0.81      0.83     30000

### Results from running Naive Bayes for all 4 dfs, with different train set size.

Train data_set size = 50000
Number words in training corpus for toys 45973
Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.46%
Number words in training corpus for vid 68303
Accuracy on vid dev set for binary prediction with toys naive bayes model: 89.37%
Number words in training corpus for aut 41130
Accuracy on aut dev set for binary prediction with toys naive bayes model: 91.57%
Number words in training corpus for hnk 41378
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 91.43%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 92.5% 91.4% 90.9% 91.2%
vid  87.1% 89.4% 86.8% 87.3%
aut  76.7% 80.0% 91.6% 84.6%
hnk  86.1% 86.5% 90.3% 91.4%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid  aut  hnk
toys  0.0%  1.0% 1.6% 1.3%
vid   2.2%  0.0% 2.5% 2.1%
aut  14.9% 11.5% 0.0% 7.0%
hnk   5.3%  4.9% 1.1% 0.0%

 Train data_set size = 100000
Number words in training corpus for toys 64698
Accuracy on toys dev set for binary prediction with toys naive bayes model: 92.17%
Number words in training corpus for vid 98625
Accuracy on vid dev set for binary prediction with toys naive bayes model: 89.22%
Number words in training corpus for aut 59179
Accuracy on aut dev set for binary prediction with toys naive bayes model: 91.56%
Number words in training corpus for hnk 57706
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 91.56%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 92.2% 91.3% 91.3% 91.1%
vid  86.6% 89.2% 87.4% 87.3%
aut  75.4% 78.8% 91.6% 83.6%
hnk  85.8% 86.5% 90.9% 91.6%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid  aut  hnk
toys  0.0%  0.8% 0.9% 1.0%
vid   2.6%  0.0% 1.8% 1.9%
aut  16.2% 12.8% 0.0% 8.0%
hnk   5.8%  5.1% 0.7% 0.0%

 Train data_set size = 250000
Number words in training corpus for toys 104512
Accuracy on toys dev set for binary prediction with toys naive bayes model: 91.92%
Number words in training corpus for vid 165841
Accuracy on vid dev set for binary prediction with toys naive bayes model: 89.01%
Number words in training corpus for aut 97914
Accuracy on aut dev set for binary prediction with toys naive bayes model: 91.87%
Number words in training corpus for hnk 93561
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 91.58%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 91.9% 91.4% 91.8% 91.3%
vid  86.1% 89.0% 88.0% 87.5%
aut  72.7% 78.0% 91.9% 83.6%
hnk  84.9% 86.3% 91.5% 91.6%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid  aut  hnk
toys  0.0%  0.6% 0.1% 0.7%
vid   2.9%  0.0% 1.0% 1.6%
aut  19.2% 13.8% 0.0% 8.3%
hnk   6.7%  5.3% 0.1% 0.0%

 Train data_set size = 500000
Number words in training corpus for toys 151534
Accuracy on toys dev set for binary prediction with toys naive bayes model: 91.81%
Number words in training corpus for vid 249256
Accuracy on vid dev set for binary prediction with toys naive bayes model: 88.99%
Number words in training corpus for aut 144802
Accuracy on aut dev set for binary prediction with toys naive bayes model: 92.06%
Number words in training corpus for hnk 137575
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 91.55%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 91.8% 91.4% 91.9% 91.4%
vid  85.9% 89.0% 88.4% 87.9%
aut  72.8% 78.7% 92.1% 83.6%
hnk  84.5% 86.9% 91.7% 91.6%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut  hnk
toys  0.0%  0.4% -0.1% 0.5%
vid   3.1%  0.0%  0.6% 1.1%
aut  19.3% 13.3%  0.0% 8.5%
hnk   7.1%  4.7% -0.2% 0.0%

 Train data_set size = 1000000
Number words in training corpus for toys 224573
Accuracy on toys dev set for binary prediction with toys naive bayes model: 91.74%
Number words in training corpus for vid 309416
Accuracy on vid dev set for binary prediction with toys naive bayes model: 88.89%
Number words in training corpus for aut 185933
Accuracy on aut dev set for binary prediction with toys naive bayes model: 92.03%
Number words in training corpus for hnk 204991
Accuracy on hnk dev set for binary prediction with toys naive bayes model: 91.50%
Effectiveness of transfer learning with Naive Bayes:
Accuracy of rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut   hnk
toys 91.7% 91.5% 91.9% 91.4%
vid  85.8% 88.9% 88.7% 88.0%
aut  73.0% 78.6% 92.0% 83.3%
hnk  84.4% 86.5% 91.8% 91.5%
Transfer loss on rating predictions
Colums = source domain, Rows = target domain

      toys   vid   aut  hnk
toys  0.0%  0.3% -0.2% 0.3%
vid   3.1%  0.0%  0.2% 0.9%
aut  19.0% 13.5%  0.0% 8.7%
hnk   7.1%  5.0% -0.3% 0.0%

from scipy.stats import entropy
