In [40]:
import json
import pandas as pd
import os

In [41]:
# Load a dataset from a nested ("raveled") JSON file.
def load_data(filepath):
    """Read a JSON file and return its parsed contents.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file does not contain valid JSON.
    """
    # Use a context manager so the handle is closed deterministically,
    # even when parsing fails part-way through.
    with open(filepath) as json_file:
        return json.load(json_file)

In [42]:
# Function to unravel (flatten) the nested JSON data into a pandas DataFrame
def convertDictToJson(data):
    """Flatten the nested per-product records into a pandas DataFrame.

    Despite its name, this function returns a DataFrame, not JSON.

    Parameters
    ----------
    data : dict
        Mapping of ASIN -> product record. For every exported product the
        record is read for ``descriptionARI``, ``reviewARI``,
        ``reviewSentiment`` and ``descriptionSentiment`` (each with the keys
        ``neg``, ``neu``, ``pos``, ``compound``), ``descriptionEmpath``,
        ``reviewEmpath``, ``price``, ``Rating`` and ``salesRank`` (a mapping
        of category -> rank).

    Returns
    -------
    pandas.DataFrame
        One row per exported product, indexed by ASIN. Products that lack
        ``price`` or ``reviewPoliteness`` are left out entirely rather than
        being emitted as rows with no values.

    Raises
    ------
    KeyError
        If a product passes the filter but is missing one of the keys
        listed above.
    """
    finalDict = {}
    for asin, record in data.items():
        # Only products with a price and a politeness score are exported.
        # The politeness values (and the brand) are deliberately not copied
        # into the output; only their presence is used as a filter.
        if 'price' not in record or 'reviewPoliteness' not in record:
            continue

        review_sentiment = record["reviewSentiment"]
        description_sentiment = record["descriptionSentiment"]

        # Insertion order below defines the column order of the result.
        row = {
            # ARI
            'readabilityIndexDescription': record["descriptionARI"],
            'readabilityIndexReview': record["reviewARI"],
            # Sentiment - Review
            'reviewSentimentNeg': review_sentiment['neg'],
            'reviewSentimentNeu': review_sentiment['neu'],
            'reviewSentimentPos': review_sentiment['pos'],
            'reviewSentimentCom': review_sentiment['compound'],
            # Sentiment - Description
            'descriptionSentimentNeg': description_sentiment['neg'],
            'descriptionSentimentNeu': description_sentiment['neu'],
            'descriptionSentimentPos': description_sentiment['pos'],
            'descriptionSentimentCom': description_sentiment['compound'],
        }

        # Empath - Description
        for empath_key, empath_value in record["descriptionEmpath"].items():
            row['empath_description_' + str(empath_key)] = empath_value
        # Empath - Review
        for empath_key, empath_value in record["reviewEmpath"].items():
            row['empath_review_' + str(empath_key)] = empath_value

        # Price
        row['price'] = record["price"]
        # Rating
        row['Rating'] = record["Rating"]

        # Separate columns for product category and sales rank. If a product
        # lists several categories, the last one iterated wins.
        for category, rank in record["salesRank"].items():
            row['category'] = category
            row['salesRank'] = rank

        finalDict[asin] = row

    finalDf = pd.DataFrame.from_dict(finalDict, orient='index')
    return finalDf

In [43]:
# Load the raw nested JSON for each product category.
dataset_health = load_data('./data/reviews_Health_and_Personal_Care_5_Health & Personal Care_clean.json')
dataset_food = load_data('./data/reviews_Grocery_and_Gourmet_Food_5_Grocery & Gourmet Food_clean.json')
dataset_beauty = load_data('./data/reviews_Beauty_5_Beauty_clean.json')
dataset_tools = load_data('./data/reviews_Tools_and_Home_Improvement_5_Sports &amp; Outdoors_clean.json')

# Flatten every category into its own DataFrame (health, food, beauty, tools).
df_health, df_food, df_beauty, df_tools = map(
    convertDictToJson,
    (dataset_health, dataset_food, dataset_beauty, dataset_tools),
)

# Stack the per-category frames into a single frame and inspect it.
frames = [df_health, df_food, df_beauty, df_tools]
result = pd.concat(frames)
print(result.shape)
print(result.head())

(20239, 402)
            readabilityIndexDescription  readabilityIndexReview  \
3812028492                        87.01                   92.93   
B00000J9DU                        51.28                   75.71   
B00000JHQ2                        34.63                   90.60   
B00004YK0Y                        29.21                   68.81   
B00004YK10                        30.36                   70.33   

            reviewSentimentNeg  reviewSentimentNeu  reviewSentimentPos  \
3812028492               0.000               0.712               0.288   
B00000J9DU               0.124               0.767               0.109   
B00000JHQ2               0.114               0.802               0.083   
B00004YK0Y               0.071               0.822               0.107   
B00004YK10               0.000               0.753               0.247   

            reviewSentimentCom  descriptionSentimentNeg  \
3812028492              0.9468                    0.039   
B00000J9DU           

In [44]:
# Export the regression dataset with the ASIN index as a named first column.
# The index is named and reset in memory instead of round-tripping through a
# temporary CSV in the working directory: that re-parsed every value from text
# and left the dummy file behind whenever a later step failed.
test_df = result.rename_axis('ASIN').reset_index()
test_df.to_csv('./data/final_regression_dataset.csv', index=False)

In [46]:
# compile data for the classification task
# add a class column to the dataset
# aggregate all dataframes into one dataframe for classification

def covertRegression2ClassificationDataframe(df, fraction=0.3):
    """Keep the best- and worst-ranked products of ``df`` and label them.

    The ``int(len(df) * fraction)`` rows with the smallest ``salesRank`` get
    ``class`` 1, the same number of rows with the largest ``salesRank`` get
    ``class`` 0, and all remaining rows are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``salesRank`` column. It is not modified.
    fraction : float, optional
        Share of rows to take from each end of the ranking. Defaults to 0.3.
        Must lie in ``[0, 0.5]`` so the two groups cannot overlap.

    Returns
    -------
    pandas.DataFrame
        The smallest-``salesRank`` rows (ascending) followed by the
        largest-``salesRank`` rows (descending), with a ``class`` column added.

    Raises
    ------
    ValueError
        If ``fraction`` is outside ``[0, 0.5]``.
    """
    if not 0 <= fraction <= 0.5:
        raise ValueError("fraction must be between 0 and 0.5, got %r" % (fraction,))

    numProdsInFraction = int(df.shape[0] * fraction)
    # assign() returns labelled copies, so no column is set in place on a
    # frame derived from ``df``. "class" is a keyword, hence the ** form.
    topDf = df.nsmallest(numProdsInFraction, 'salesRank').assign(**{"class": 1})
    botDf = df.nlargest(numProdsInFraction, 'salesRank').assign(**{"class": 0})
    return pd.concat([topDf, botDf])


# Read the regression dataset exported earlier in this notebook. The previous
# path ("data/final_dataset.csv") is not written by any visible cell, so a
# fresh Restart & Run All could not find it.
data = pd.read_csv('./data/final_regression_dataset.csv')
print(data.shape)
# Sort by category, then expose the category as the index while keeping it as
# a regular column too (drop=False). Done without inplace=True so re-running
# the cell always starts from the freshly read frame.
data = data.sort_values(by=['category'])
data = data.set_index(keys=['category'], drop=False)
# get a list of all product categories
categories = data['category'].unique().tolist()
print(categories)

# for each category compute the top and bottom 30% products by sales
allClassificationDFs = []
for key in categories:
    # A single boolean mask selects the rows of this category.
    classificationDFForKey = covertRegression2ClassificationDataframe(data[data['category'] == key])
    allClassificationDFs.append(classificationDFForKey)

# combine all the data into one dataframe and store it
classificationData = pd.concat(allClassificationDFs)
print(classificationData.shape)
print(classificationData.head())
# index=False: the category index is not written; the category column is.
classificationData.to_csv('./data/final_classification_dataset.csv', index=False)

(20239, 403)
['Beauty', 'Grocery & Gourmet Food', 'Health & Personal Care', 'Sports &amp; Outdoors']
(12140, 404)
                ASIN  readabilityIndexDescription  readabilityIndexReview  \
category                                                                    
Beauty    B001MA0QY2                        58.82                   93.68   
Beauty    B00I46E8DC                        50.33                   91.82   
Beauty    B000FS05VG                        43.73                  100.58   
Beauty    B00016XJ4M                      -215.78                   79.40   
Beauty    B002MSN3QQ                        59.23                   63.15   

          reviewSentimentNeg  reviewSentimentNeu  reviewSentimentPos  \
category                                                               
Beauty                 0.066               0.817               0.118   
Beauty                 0.000               0.777               0.223   
Beauty                 0.033               0.618          