In [None]:
# Download the large English spaCy model that init_spacy() loads by name ("en_core_web_lg").
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Importing Packages

In [None]:
import pandas as pd
import gzip 
import numpy as np
import spacy
import json
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
import ast
import spacy
from sklearn import cluster
from collections import defaultdict
from time import time

# Number of k-means clusters built per product by get_word_clusters(); a product
# with this many unique aspects or fewer is not clustered at all.
NUM_CLUSTERS = 5

# Link Drive

In [None]:
# Mount Google Drive at /content/drive; the review dataset is read from there below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data Loading

In [None]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Args:
        path: Path of the gzip file; it is opened in binary mode.

    Yields:
        The object obtained by evaluating each line as a Python expression.
    """
    # The context manager closes the gzip handle once the generator is
    # exhausted or closed, instead of leaving it open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever code the line contains, so this
            # must only be used on trusted files. ast.literal_eval would be a
            # safer choice if every line is a plain literal - TODO confirm.
            yield eval(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their position (0, 1, 2, ...) and that position
    becomes the row index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

In [None]:
# Keep only the first 1000 reviews. Note that getDF() still reads and parses the
# whole file before the slice is applied.
df = getDF('/content/drive/My Drive/reviews_Electronics_5.json.gz')[:1000]
# Array of the review texts of the retained rows.
R = df.reviewText.values

In [None]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5.0,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1.0,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3.0,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2.0,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1.0,"Major issues, only excuses for support",1317254400,"09 29, 2011"


# Entity Extraction Block

## Utils for Entity Extraction

In [None]:

def init_spacy():
    """Announce and load the ``en_core_web_lg`` spaCy pipeline.

    Returns:
        The object returned by ``spacy.load("en_core_web_lg")``.
    """
    print("Loading Spacy")
    return spacy.load("en_core_web_lg")


def init_nltk():
    """Create the VADER ``SentimentIntensityAnalyzer``.

    If construction raises ``LookupError``, the ``vader_lexicon`` resource is
    downloaded through ``nltk.download`` and construction is retried once.

    Returns:
        The ``SentimentIntensityAnalyzer`` instance.
    """
    print("\nLoading NLTK....")
    try:
        analyzer = SentimentIntensityAnalyzer()
    except LookupError:
        # First construction failed: fetch the lexicon, then build again.
        print("Installing SentimentAnalyzer")
        nltk.download("vader_lexicon")
        analyzer = SentimentIntensityAnalyzer()
    print("NLTK successfully loaded")
    return analyzer


def extract_aspects(reviews, nlp, sid):
    """Apply ``apply_extraction`` to every row of ``reviews``.

    Args:
        reviews: DataFrame whose rows are handed to ``apply_extraction``.
        nlp: Forwarded unchanged to ``apply_extraction``.
        sid: Forwarded unchanged to ``apply_extraction``.

    Returns:
        The result of the row-wise ``DataFrame.apply`` call.
    """
    print("Entering Apply function!")
    # axis=1 walks the frame row by row; ``args`` passes the shared model
    # objects as the extra positional arguments of each call.
    return reviews.apply(apply_extraction, axis=1, args=(nlp, sid))


def aspect_extraction(nlp, sid):
    """Print the columns of the global ``df`` and extract aspects from it.

    Note that the frame is not a parameter: the module-level ``df`` is read.

    Args:
        nlp: Forwarded to ``extract_aspects``.
        sid: Forwarded to ``extract_aspects``.

    Returns:
        Whatever ``extract_aspects(df, nlp, sid)`` returns.
    """
    print("=" * 10)
    print(df.columns)
    return extract_aspects(df, nlp, sid)


## Entity Extraction Function

In [None]:
# Aspect words that stand for the product itself; apply_extraction() reports
# an aspect equal to one of these as "product".
prod_pronouns = ["it", "this", "they", "these"]
# Aspects for which a "no" determiner is NOT converted into a "not" prefix on
# the modifier (rule 1).
avoid_neg = ["price", "cost"]


def _compound(sid, text):
    """Return the ``compound`` entry of ``sid.polarity_scores(text)``."""
    return sid.polarity_scores(text)["compound"]


def _with_leading_advmod(token, base_text):
    """Prefix ``base_text`` with the first ``advmod`` child of ``token``, if any."""
    for child in token.children:
        if child.dep_ == "advmod":
            return child.text + " " + base_text
    return base_text


def _scan_children(token, subject_deps, is_modifier, modal_negates=False):
    """Find the aspect, the modifier token and the negation prefix among a token's children.

    The children are visited in order and later matches overwrite earlier
    ones. Returns ``(aspect_text, modifier_token, neg_prefix)``; each entry is
    ``None`` when nothing matched.
    """
    aspect = None
    modifier = None
    neg_prefix = None
    for child in token.children:
        if child.dep_ in subject_deps and not child.is_stop:
            aspect = child.text
        if is_modifier(child) and not child.is_stop:
            modifier = child
        # e.g. 'this could have been better' -> (this, not better)
        if modal_negates and child.dep_ == "aux" and child.tag_ == "MD":
            neg_prefix = "not"
        if child.dep_ == "neg":
            neg_prefix = child.text
    return aspect, modifier, neg_prefix


def _subject_modifier_pairs(
    doc, sid, rule_id, subject_deps, is_modifier,
    modal_negates=False, expand_advmod=False,
):
    """Build the ``(aspect, modifier, polarity, rule_id)`` tuples of one subject/modifier rule."""
    pairs = []
    for token in doc:
        aspect, mod_token, neg_prefix = _scan_children(
            token, subject_deps, is_modifier, modal_negates
        )
        if aspect is None or mod_token is None:
            continue
        modifier = mod_token.text
        if expand_advmod:
            modifier = _with_leading_advmod(mod_token, modifier)
        if neg_prefix is not None:
            modifier = neg_prefix + " " + modifier
        pairs.append((aspect, modifier, _compound(sid, modifier), rule_id))
    return pairs


def apply_extraction(row, nlp, sid):
    """Extract (aspect, modifier, polarity, rule) tuples from one review row.

    Seven dependency-parse rules are evaluated over the parsed review text and
    their results concatenated in rule order. In every tuple, M is the
    sentiment modifier and A the aspect it applies to.

    Args:
        row: Mapping providing ``reviewText``, ``reviewerID``, ``asin`` and
            ``unixReviewTime``.
        nlp: Callable applied to the review text; the result is iterated
            token by token, and each token must expose ``text``, ``dep_``,
            ``pos_``, ``tag_``, ``is_stop``, ``head`` and ``children``.
        sid: Object whose ``polarity_scores(text)`` returns a mapping with a
            ``compound`` entry.

    Returns:
        dict with the four row fields copied over plus ``aspect_pairs``, a
        list of ``(aspect, modifier, compound_polarity, rule_number)`` tuples.
    """
    review_body = row["reviewText"]
    review_id = row["reviewerID"]
    product_id = row["asin"]
    unixReviewTime = row["unixReviewTime"]
    doc = nlp(review_body)

    # FIRST RULE OF DEPENDENCY PARSE -
    # M is a child of A with a relationship of amod.
    rule1_pairs = []
    for token in doc:
        if token.dep_ != "amod" or token.is_stop:
            continue
        aspect = token.head.text
        # add adverbial modifier of adjective
        # (e.g. 'most comfortable headphones')
        modifier = _with_leading_advmod(token, token.text)

        # negation in adjective, the "no" keyword is a
        # 'det' of the noun (e.g. no interesting characters)
        if aspect not in avoid_neg:
            for sibling in token.head.children:
                if sibling.dep_ == "det" and sibling.text == "no":
                    modifier = "not " + modifier
                    break

        # Score the final modifier (not the bare adjective) so that the adverb
        # and the negation prefix influence the polarity, as in rules 2-7.
        rule1_pairs.append((aspect, modifier, _compound(sid, modifier), 1))

    # SECOND RULE OF DEPENDENCY PARSE -
    # Direct object - A is a child of something with relationship of nsubj,
    # while M is an adjective child of the same something with relationship
    # of dobj.
    # Assumption - a verb will have only one NSUBJ and DOBJ.
    # The pair is built for every token, negated or not.
    rule2_pairs = _subject_modifier_pairs(
        doc, sid, 2, ("nsubj",),
        lambda child: child.dep_ == "dobj" and child.pos_ == "ADJ",
    )

    # THIRD RULE OF DEPENDENCY PARSE -
    # Adjectival complement - A is a child of something with relationship of
    # nsubj, while M is a child of the same something with relationship of
    # acomp. A modal auxiliary ("would/could be better") counts as negation.
    rule3_pairs = _subject_modifier_pairs(
        doc, sid, 3, ("nsubj",),
        lambda child: child.dep_ == "acomp",
        modal_negates=True,
    )

    # FOURTH RULE OF DEPENDENCY PARSE -
    # Adverbial modifier to a passive verb - A is a child of something with
    # relationship of nsubjpass (or nsubj), while M is a child of the same
    # something with relationship of advmod; M's own first advmod child is
    # prefixed to it.
    rule4_pairs = _subject_modifier_pairs(
        doc, sid, 4, ("nsubjpass", "nsubj"),
        lambda child: child.dep_ == "advmod",
        expand_advmod=True,
    )

    # FIFTH RULE OF DEPENDENCY PARSE -
    # Complement of a copular verb - A is a child of M with relationship of
    # nsubj, while M has a child with relationship of cop.
    rule5_pairs = []
    for token in doc:
        aspect, copula, _ = _scan_children(
            token, ("nsubj",), lambda child: child.dep_ == "cop"
        )
        if aspect is not None and copula is not None:
            rule5_pairs.append(
                (aspect, token.text, _compound(sid, token.text), 5)
            )

    # SIXTH RULE OF DEPENDENCY PARSE -
    # Example - "It ok", "ok" is INTJ (interjections like bravo, great etc).
    rule6_pairs = []
    for token in doc:
        if token.pos_ != "INTJ" or token.is_stop:
            continue
        aspect = None
        for child in token.children:
            if child.dep_ == "nsubj" and not child.is_stop:
                aspect = child.text
        if aspect is not None:
            rule6_pairs.append(
                (aspect, token.text, _compound(sid, token.text), 6)
            )

    # SEVENTH RULE OF DEPENDENCY PARSE -
    # ATTR - link between a verb like 'be/seem/appear' and its complement.
    # Example: 'this is garbage' -> (this, garbage)
    rule7_pairs = _subject_modifier_pairs(
        doc, sid, 7, ("nsubj",),
        lambda child: child.dep_ == "attr",
    )

    aspects = (
        rule1_pairs
        + rule2_pairs
        + rule3_pairs
        + rule4_pairs
        + rule5_pairs
        + rule6_pairs
        + rule7_pairs
    )

    # replace every aspect listed in prod_pronouns with "product"
    aspects = [
        (A, M, P, r) if A not in prod_pronouns else ("product", M, P, r)
        for A, M, P, r in aspects
    ]

    dic = {
        "reviewText": review_body,
        "reviewerID": review_id,
        "asin": product_id,
        "unixReviewTime": unixReviewTime,
        "aspect_pairs": aspects,
    }

    return dic



## Writing the extracted entities to a JSON file

In [None]:
# Load both models once, then run aspect extraction over the global `df`.
nlp = init_spacy()
sid = init_nltk()
a = aspect_extraction(nlp, sid)
# orient="split" puts the rows under the "data" key, which is what the
# clustering section reads back from Entity.json.
a.to_json("Entity.json", orient="split", compression="infer")


Loading Spacy

Loading NLTK....
Installing SentimentAnalyzer
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
NLTK successfully loaded
Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')
Entering Apply function!


# Clustering Block

## Loading entity data

In [None]:
# Read the extraction output back; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open("Entity.json") as f:
    reviews_data = json.load(f)["data"]

In [None]:
# Inspect a single extracted review record.
reviews_data[100]

{'asin': '0972683275',
 'aspect_pairs': [['installation', 'simple', 0.0, 3]],
 'reviewText': "you can't beat the price and the installation was simple as can be. We installed it in our office and it is perfect! no problems and was completed in a matter of minutes",
 'reviewerID': 'A7EVWY97FPSIA',
 'unixReviewTime': 1355097600}

## Utility functions for Clustering

In [None]:
def get_unique_product_ids(reviews_data):
    """Return the distinct ``asin`` values of ``reviews_data`` as a list.

    The list is built from a set, so its order is not specified.
    """
    return list(set(review['asin'] for review in reviews_data))

def get_aspects(reviews_data):
    """Return the aspect noun of every aspect pair, in review order.

    Each entry of a review's ``aspect_pairs`` must unpack into exactly four
    values; the first one is the noun that is collected.
    """
    return [
        noun
        for review in reviews_data
        for noun, _, _, _ in review["aspect_pairs"]
    ]

def get_aspect_freq_map(aspects):
    """Count how often each aspect occurs.

    Returns:
        A ``defaultdict(int)`` mapping aspect -> occurrence count, with keys
        in order of first appearance.
    """
    counts = defaultdict(int)
    for name in aspects:
        counts[name] = counts.get(name, 0) + 1
    return counts

def get_unique_aspects(aspects):
    """Return the distinct aspects as a list (order not specified)."""
    return list(set(aspects))


def get_word_vectors(unique_aspects, nlp):
    """Return ``nlp(aspect).vector`` for every aspect, in input order."""
    return [nlp(aspect).vector for aspect in unique_aspects]

def get_word_clusters(unique_aspects, nlp, random_state=0):
    """Assign a cluster label to every aspect.

    Args:
        unique_aspects: Sized iterable of aspect strings.
        nlp: Passed to ``get_word_vectors`` to embed the aspects.
        random_state: Seed handed to ``KMeans`` so that repeated runs give
            the same labels; pass ``None`` for an unseeded run.

    Returns:
        ``list(range(n))`` when there are ``NUM_CLUSTERS`` aspects or fewer
        (each aspect is its own cluster); otherwise the fitted ``labels_``
        of a ``KMeans`` model with ``NUM_CLUSTERS`` clusters.
    """
    n_aspects = len(unique_aspects)
    print("Found {} unique aspects for this product".format(n_aspects))
    if n_aspects <= NUM_CLUSTERS:
        print("Too few aspects ({}) found. No clustering required...".format(n_aspects))
        return list(range(n_aspects))

    # Embed only once clustering is known to be needed; the early return
    # above never uses the vectors.
    asp_vectors = get_word_vectors(unique_aspects, nlp)

    print("Running k-means clustering...")
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, random_state=random_state)
    kmeans.fit(asp_vectors)
    labels = kmeans.labels_
    print("Finished running k-means clustering with {} labels".format(len(labels)))
    return labels

def get_cluster_names_map(asp_to_cluster_map, aspect_freq_map):
    """Name every cluster after its most frequent aspect.

    Args:
        asp_to_cluster_map: Mapping aspect -> cluster id.
        aspect_freq_map: Mapping aspect -> frequency.

    Returns:
        A ``defaultdict`` (without default factory) mapping each cluster id
        that occurs in ``asp_to_cluster_map`` to the aspect with the highest
        frequency in that cluster. On a tie the aspect that comes first in
        ``aspect_freq_map`` wins. A cluster none of whose aspects appears in
        ``aspect_freq_map`` gets no entry.
    """
    cluster_id_to_name_map = defaultdict()
    best_freq = {}
    # One pass over the frequencies, keyed by the cluster ids that actually
    # occur: the ids need not be the contiguous range 0..k-1.
    for aspect, freq in aspect_freq_map.items():
        if aspect not in asp_to_cluster_map:
            continue
        cluster_id = asp_to_cluster_map[aspect]
        # Strict '>' keeps the first aspect seen when frequencies are equal.
        if cluster_id not in best_freq or freq > best_freq[cluster_id]:
            best_freq[cluster_id] = freq
            cluster_id_to_name_map[cluster_id] = aspect
    return cluster_id_to_name_map

## Clustering

In [None]:
def add_clusters_to_reviews(reviews_data, nlp):
    """Cluster the aspects of one product and label every aspect pair.

    Args:
        reviews_data: Review records of a single product; each has
            ``reviewerID``, ``asin`` and ``aspect_pairs`` (4-item entries:
            noun, adjective, polarity, rule).
        nlp: Passed to ``get_word_clusters`` to embed the aspects.

    Returns:
        ``(updated_reviews, [summary])``. ``updated_reviews`` has one dict per
        review with ``review_id``, ``product_id`` and the labelled
        ``aspect_pairs``. ``summary`` holds ``product_id`` (taken from the
        first review) plus the mean polarity of every cluster name.
        For empty input ``([], [])`` is returned.
    """
    if not reviews_data:
        # No first review to take the product id from.
        return [], []

    product_aspects = get_aspects(reviews_data)
    print("Total aspects found: {}".format(len(product_aspects)))
    aspect_freq_map = get_aspect_freq_map(product_aspects)
    unique_aspects = list(aspect_freq_map)
    print("Runnig clustering on {} unique aspects".format(len(unique_aspects)))

    aspect_labels = get_word_clusters(unique_aspects, nlp)
    asp_to_cluster_map = dict(zip(unique_aspects, aspect_labels))
    cluster_names_map = get_cluster_names_map(asp_to_cluster_map, aspect_freq_map)
    updated_reviews = []

    # cluster name -> every polarity observed for an aspect of that cluster
    cluster_polarities = {}
    summ = {"product_id": reviews_data[0]['asin']}
    for review in reviews_data:
        result = []
        aspect_pairs = review["aspect_pairs"]
        for noun, adj, polarity, rule in aspect_pairs:
            cluster_label_id = asp_to_cluster_map[noun]
            cluster_label_name = cluster_names_map[cluster_label_id]
            # Accumulate under the cluster name (the dict's key); testing the
            # noun instead would reset the list on almost every pair.
            cluster_polarities.setdefault(cluster_label_name, []).append(polarity)
            result.append({'noun':noun, 'adj':adj, 'rule':rule, 'polarity':polarity, 'cluster':cluster_label_name})

        assert len(result) == len(aspect_pairs)

        updated_reviews.append({'review_id':review['reviewerID'], 'product_id':review['asin'], 'aspect_pairs':result})
    for key in cluster_polarities:
        summ[key] = np.array(cluster_polarities[key]).mean()
    return updated_reviews, [summ]

def update_reviews_data(reviews_data, nlp):
    """Cluster aspects product by product and write the results to JSON files.

    Args:
        reviews_data: Review records with ``asin`` and ``aspect_pairs``
            (plus whatever ``add_clusters_to_reviews`` reads).
        nlp: Passed through to ``add_clusters_to_reviews``.

    Side effects:
        Overwrites ``results_file.json`` with the labelled reviews and
        ``summ_file.json`` with the per-product summaries, and prints
        progress information.
    """
    updated_reviews = []
    summ_reviews = []

    # Group the reviews by product in a single pass instead of re-scanning
    # the whole list for every product id.
    reviews_by_product = {}
    for r in reviews_data:
        reviews_by_product.setdefault(r['asin'], []).append(r)
    print("Total number of unique products in this category: {}".format(len(reviews_by_product)))

    no_asp_count = sum(1 for r in reviews_data if len(r['aspect_pairs']) == 0)
    print("Total reviews found with no aspect pairs: {}".format(no_asp_count))

    for prod_id, this_product_reviews in reviews_by_product.items():
        print("\nRunning clustering for product ID - {}".format(prod_id))

        this_product_upd_reviews, s = add_clusters_to_reviews(this_product_reviews, nlp)
        updated_reviews.extend(this_product_upd_reviews)
        summ_reviews.extend(s)
    print("\n----------------***----------------")
    print("Updating final results")
    # 'w', not 'a': appending a second JSON document on a re-run would leave
    # a file that json.load() can no longer parse.
    with open('results_file.json', 'w') as f:
        json.dump(updated_reviews,f)
    with open('summ_file.json', 'w') as f:
        json.dump(summ_reviews,f)

    print("Finished writing results to json!!")
    print("----------------***----------------")

## Calling Clustering Function

In [None]:
# Cluster the aspects of every product; this writes results_file.json and summ_file.json.
print("Running clustering...")
update_reviews_data(reviews_data, nlp)

Running clustering...
Total number of unique products in this category: 40
Total reviews found with no aspect pairs: 109

Running clustering for product ID - 1400532655
Total aspects found: 1480
Runnig clustering on 617 unique aspects
Found 617 unique aspects for this product
Running k-means clustering...
Finished running k-means clustering with 617 labels

Running clustering for product ID - 9966338926
Total aspects found: 34
Runnig clustering on 22 unique aspects
Found 22 unique aspects for this product
Running k-means clustering...
Finished running k-means clustering with 22 labels

Running clustering for product ID - 9573212900
Total aspects found: 26
Runnig clustering on 17 unique aspects
Found 17 unique aspects for this product
Running k-means clustering...
Finished running k-means clustering with 17 labels

Running clustering for product ID - 3744295508
Total aspects found: 34
Runnig clustering on 26 unique aspects
Found 26 unique aspects for this product
Running k-means cluster

In [None]:
# List the working directory to check that the output JSON files were written.
ls

[0m[01;34mdrive[0m/  Entity.json  results_file.json  [01;34msample_data[0m/  summ_file.json


In [None]:
# Load the labelled reviews; the context manager closes the file handle.
with open("results_file.json") as f:
    data = json.load(f)

In [None]:
# Inspect one labelled review from results_file.json.
data[100]

{'aspect_pairs': [{'adj': 'very bad',
   'cluster': 'life',
   'noun': 'way',
   'polarity': -0.5423,
   'rule': 1},
  {'adj': 'loaded',
   'cluster': 'books',
   'noun': 'covers',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'corrupted',
   'cluster': 'tablet',
   'noun': 'freezes',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'hardline',
   'cluster': 'price',
   'noun': 'marketing',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'oriented',
   'cluster': 'price',
   'noun': 'marketing',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'different',
   'cluster': 'books',
   'noun': 'book',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'phoney',
   'cluster': 'books',
   'noun': 'cover',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'commercial',
   'cluster': 'books',
   'noun': 'cover',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'compromised',
   'cluster': 'books',
   'noun': 'reader',
   'polarity': 0.0,
   'rule': 1},
  {'adj': 'compromised',
   'cluster': 'tablet',
   'noun': 'tablet'

In [None]:
# Load the per-product summaries; the context manager closes the file handle.
# Note that `data` is rebound here and no longer holds the labelled reviews.
with open("summ_file.json") as f:
    data = json.load(f)

In [None]:
# Inspect one product summary (mean polarity per cluster name) from summ_file.json.
data[12]

{'B&N': 0.5719,
 'books': 0.0,
 'for': 0.0,
 'library': -0.28595,
 'product_id': '1400599997',
 'screen': -0.22925}