In [7]:
import json
import ast
import operator
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textstat.textstat import textstat
from empath import Empath

# Shared analyzer instance, created on first use and then reused by every call.
_sentiment_analyzer = None


def getSentimentForText(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The string to score.

    Returns:
        Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for
        ``text``.

    The analyzer is constructed lazily and kept for later calls, because this
    function is invoked once per description and once per review and the
    analyzer does not need to be rebuilt each time.
    """
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)

def getAutomatedReadabilityIndexForText(text):
    """Return the Automated Readability Index (ARI) of ``text``.

    Args:
        text: The string to score.

    Returns:
        The value produced by ``textstat.automated_readability_index``.

    NOTE(review): this function's name and the ``*ARI`` keys it feeds both
    promise ARI, so the ARI metric is what is computed here. Flesch reading
    ease is a different score on a different scale; any stored results that
    were produced with it are not comparable with values from this function.
    """
    return textstat.automated_readability_index(text)

# Shared Empath lexicon, created on first use and then reused by every call.
_empath_lexicon = None


def getEmpathCategoriesForText(text):
    """Return the normalized Empath category scores for ``text``.

    Args:
        text: The string to analyze.

    Returns:
        Whatever ``Empath.analyze(text, normalize=True)`` returns.

    The lexicon object is constructed lazily and kept for later calls, since
    this function runs twice per product and only ever calls ``analyze`` on it.
    """
    global _empath_lexicon
    if _empath_lexicon is None:
        _empath_lexicon = Empath()
    return _empath_lexicon.analyze(text, normalize=True)

def getMostHelpfulReview(listOfReviews):
    """Pick the review with the highest helpfulness ratio.

    Args:
        listOfReviews: Iterable of sequences laid out as
            ``[denominator, numerator, review_text, rating]``. The ratio is
            ``numerator / denominator`` (presumably helpful votes over total
            votes -- confirm against the data source). Entries whose
            denominator is not greater than zero are ignored.

    Returns:
        ``[review_text, rating]`` of the winning review, or the string
        ``"NA"`` when no entry has a positive denominator.

    Ties on the ratio are resolved in favour of the larger denominator, so a
    review rated 1/1 no longer displaces one rated 500/500 merely because it
    appears later in the list. If ratio and denominator are both equal, the
    later review wins.
    """
    best_key = None
    best_review = None
    for review in listOfReviews:
        denominator = float(review[0])
        if denominator > 0.0:
            # Tuples compare element-wise: ratio first, then vote volume.
            key = (float(review[1]) / denominator, denominator)
            if best_key is None or key >= best_key:
                best_key = key
                best_review = [review[2], review[3]]
    if best_review is None:
        return "NA"
    return best_review

# Optional metadata keys copied onto each product, in this order.
_METADATA_FIELDS = ("salesRank", "title", "description", "price", "brand", "imUrl")


def _collect_best_reviews(furl):
    """Read the review file and return ``{asin: {"Best review", "Rating"}}``."""
    reviews_by_asin = {}
    with open(furl, 'r') as review_file:
        for line in review_file:
            if not line.strip():
                continue  # tolerate blank lines instead of failing in json.loads
            review = json.loads(line)
            reviews_by_asin.setdefault(review["asin"], []).append([
                review["helpful"][1],
                review["helpful"][0],
                review["reviewText"],
                review["overall"],
            ])

    best_reviews = {}
    for asin, review_list in reviews_by_asin.items():
        best = getMostHelpfulReview(review_list)
        if best != "NA":
            best_reviews[asin] = {"Best review": best[0], "Rating": best[1]}
    return best_reviews


def _join_with_metadata(best_reviews, metadata_path):
    """Scan the metadata file and attach its fields to the reviewed products."""
    products = {}
    products_found = 0
    lines_read = 0
    with open(metadata_path) as metadata_file:
        for line in metadata_file:
            lines_read += 1
            record_text = line.strip()
            if record_text:
                # Each line is parsed as a Python literal rather than as JSON.
                record = ast.literal_eval(record_text)
                asin = record['asin']
                if asin in best_reviews:
                    products_found += 1
                    product = {
                        "Best review": best_reviews[asin]["Best review"],
                        "Rating": best_reviews[asin]["Rating"],
                    }
                    for field in _METADATA_FIELDS:
                        if field in record:
                            product[field] = record[field]
                    products[asin] = product
            if lines_read % 1000000 == 0:
                print("lines read:",lines_read,"products found:", products_found)
    return products


def _first_department(product):
    """Return the first key of the product's ``salesRank`` mapping."""
    return next(iter(product['salesRank']))


def make_clean_complete_data(furl, metadata_path='./data/metadata.json'):
    """Build, filter and enrich the per-product data set for one review file.

    Steps, in order:
      1. Read ``furl`` (one JSON object per line) and keep, per ``asin``, the
         review chosen by ``getMostHelpfulReview``.
      2. Scan ``metadata_path`` and copy ``salesRank``, ``title``,
         ``description``, ``price``, ``brand`` and ``imUrl`` (when present)
         onto every product that has a chosen review.
      3. Dump that intermediate result to ``./out/<name>_unclean.json``.
      4. Drop products whose ``salesRank`` is missing or empty.
      5. Keep products whose best review and description are both strictly
         between 20 and 300 whitespace-separated words long.
      6. Keep only products whose first ``salesRank`` key is the most frequent
         one among the remaining products.
      7. Attach sentiment, readability and Empath scores for the description
         and the best review, then dump to
         ``./out/<name>_<department>_clean.json``.

    Args:
        furl: Path to the review file. ``<name>`` above is the file name up to
            its first dot.
        metadata_path: Path to the product metadata file.

    Returns:
        None. Results are written to disk and progress is printed.

    If no product survives step 4 or step 5, a message is printed and the
    function returns without writing the ``_clean`` file (there is no
    department to select in that case).
    """
    min_words, max_words = 20, 300
    base_name = furl.split("/")[-1].split(".")[0]

    best_reviews = _collect_best_reviews(furl)
    products = _join_with_metadata(best_reviews, metadata_path)

    # save the data just in case
    with open('./out/'+base_name+'_unclean.json', 'w') as outfile:
        json.dump(products, outfile)

    # remove entries for which the sales rank is missing or empty
    unranked_asins = [
        asin for asin, product in products.items()
        if 'salesRank' not in product or len(product['salesRank']) == 0
    ]
    print("No sales rank found for: "+str(len(unranked_asins))+" products.")
    for asin in unranked_asins:
        del products[asin]

    if not products:
        # Avoids dividing by zero below when a category has no sales rank data.
        print("No products with sales rank data; nothing to clean.")
        return

    # keep only products whose review and description lengths are inside the word range
    useful_asins = []
    for asin, product in products.items():
        if ("Best review" in product) and ("description" in product):
            review_words = len(product["Best review"].split())
            description_words = len(product["description"].split())
            if (min_words < review_words < max_words
                    and min_words < description_words < max_words):
                useful_asins.append(asin)
    print("Proportion of useful products:",len(useful_asins)/len(products))

    if not useful_asins:
        # Without any product there is no department to pick.
        print("No products passed the length filter; nothing to clean.")
        return

    clean_products = {asin: products[asin] for asin in useful_asins}

    # print all the departments found with the sales rank data for this category
    dept_counts = {}
    for product in clean_products.values():
        dept = _first_department(product)
        dept_counts[dept] = dept_counts.get(dept, 0) + 1
    print("Departments present in the clean products",dept_counts)

    # Ascending stable sort; the last element is the most frequent department.
    sorted_depts = sorted(dept_counts.items(), key=operator.itemgetter(1))
    high_freq_dept = sorted_depts[-1][0]
    print("Selecting all the products in the highest frequency department",sorted_depts[-1])

    cleaner_products = {
        asin: product for asin, product in clean_products.items()
        if _first_department(product) == high_freq_dept
    }

    # attach sentiments, ARI and Empath categories to the usable product data
    for product in cleaner_products.values():
        description = product['description']
        best_review = product['Best review']
        product['descriptionSentiment'] = getSentimentForText(description)
        product['descriptionARI'] = getAutomatedReadabilityIndexForText(description)
        product['descriptionEmpath'] = getEmpathCategoriesForText(description)
        product['reviewSentiment'] = getSentimentForText(best_review)
        product['reviewARI'] = getAutomatedReadabilityIndexForText(best_review)
        product['reviewEmpath'] = getEmpathCategoriesForText(best_review)

    print("No. of clean products returned",len(cleaner_products))
    with open('./out/'+base_name+"_"+high_freq_dept+'_clean.json', 'w') as outfile:
        json.dump(cleaner_products, outfile)
    print("done!")

In [8]:
# Apps_for_Android doesn't have any sales rank data
# make_clean_complete_data("./data/reviews_Beauty_5.json")
# make_clean_complete_data("./data/reviews_Health_and_Personal_Care_5.json")
# make_clean_complete_data("./data/reviews_Grocery_and_Gourmet_Food_5.json")
# make_clean_complete_data("./data/reviews_Tools_and_Home_Improvement_5.json")

lines read: 1000000 products found: 0
lines read: 2000000 products found: 573
lines read: 3000000 products found: 4027
lines read: 4000000 products found: 5931
lines read: 5000000 products found: 7409
lines read: 6000000 products found: 8385
lines read: 7000000 products found: 8986
lines read: 8000000 products found: 9496
lines read: 9000000 products found: 9824
No sales rank found for: 8817 products.
Proportion of useful products: 0.6341681574239714
Departments present in the clean products {'Home Improvement': 162, 'Industrial & Scientific': 120, 'Patio, Lawn & Garden': 82, 'Cell Phones & Accessories': 2, 'Home &amp; Kitchen': 79, 'Kitchen & Dining': 10, 'Sports &amp; Outdoors': 166, 'Health & Personal Care': 22, 'Electronics': 21, 'Automotive': 16, 'Pet Supplies': 2, 'Office Products': 2, 'Arts, Crafts & Sewing': 9, 'Toys & Games': 1, 'Beauty': 4, 'Camera &amp; Photo': 6, 'Computers & Accessories': 1, 'Movies & TV': 1, 'Musical Instruments': 3}
Selecting all the products in the high