In [2]:
import json
import nltk


import torch
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity



In [3]:
# Read the per-product aspect clusters from disk.
with open('raw_aspect_cluster.json', 'r') as aspect_file:
    aspect_json = json.load(aspect_file)

In [4]:
# Read the per-product sentences (each paired with its aspects) from disk.
with open('raw_product_sentence_aspects.json', 'r') as sentence_file:
    sentence_json = json.load(sentence_file)

In [5]:
# Sentence-embedding model used by every `model.encode(...)` call below.
EMBEDDING_MODEL_NAME = 'distilbert-base-nli-mean-tokens'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [15]:
#generating cluster centers
# For each product, embed the aspects of every non-empty cluster and keep the
# mean embedding as that cluster's centre. Clusters with no aspects get no
# entry, so a product's dict may hold fewer than 8 centres.
aspect_centres = {}
for prod in tqdm(aspect_json):
    clusters_of_product = aspect_json[prod]
    centres_by_cluster = {}
    for cluster_id in range(8):
        # The JSON stores cluster ids as string keys ("0" .. "7").
        cluster_aspects = clusters_of_product[str(cluster_id)]
        if cluster_aspects != []:
            aspect_embeddings = model.encode(cluster_aspects)
            centres_by_cluster[cluster_id] = np.mean(aspect_embeddings, axis=0)
    aspect_centres[prod] = centres_by_cluster


100%|██████████| 2041/2041 [05:56<00:00,  5.73it/s]


In [24]:
#assigning sentence to cluster
# Every sentence that carries at least one aspect is filed under the cluster
# whose centre is most cosine-similar to the mean embedding of its aspects.

sent_clust={}
for prod in tqdm(sentence_json):
    prod_cats={0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[]}
    sent_clust[prod]=prod_cats

    # The centres depend only on the product, so build the matrix once per
    # product instead of once per sentence.
    cluster_centres=aspect_centres[prod]
    # Clusters without aspects have no centre, so a row position in the
    # similarity matrix is NOT necessarily the cluster id; keep the ids
    # alongside the rows and map back after argmax.
    cluster_ids=list(cluster_centres.keys())
    if not cluster_ids:
        # No centre to compare against: leave every bucket empty rather than
        # passing an empty (1-D) array to cosine_similarity.
        continue
    centre_matrix=np.array([cluster_centres[cid] for cid in cluster_ids])

    for sent in sentence_json[prod]:
        # sent[0] is what gets stored; sent[1] is the aspect list that gets embedded.
        sent_aspects=sent[1]
        if sent_aspects==[]:
            continue

        sent_aspects_vec=model.encode(sent_aspects)
        sent_aspects_centre=np.mean(sent_aspects_vec,0)
        # One similarity row per centre (n_centres x 1), in cluster_ids order.
        similarities=cosine_similarity(centre_matrix, [sent_aspects_centre])
        best_row=int(np.argmax(similarities))
        prod_cats[cluster_ids[best_row]].append(sent[0])

                
                
        


100%|██████████| 2041/2041 [44:22<00:00,  1.30s/it]


In [25]:
# Persist the sentence-to-cluster assignment.
# Note: json writes the integer cluster keys as strings.
with open('sent_clust_raw_exc.json', 'w') as clusters_out:
    json.dump(sent_clust, clusters_out, indent=4)

In [26]:
#retrieving
# For each cluster of each product, keep the single sentence whose embedding
# is most cosine-similar to the mean embedding of that cluster's sentences.
# Clusters with no sentences keep their initial empty list.

salient_sents = {}
for prod in tqdm(sent_clust):
    representative_by_cluster = {cluster_id: [] for cluster_id in range(8)}
    for cluster_id, cluster_sentences in sent_clust[prod].items():
        if cluster_sentences != []:
            sentence_embeddings = model.encode(cluster_sentences)
            cluster_centroid = np.mean(sentence_embeddings, axis=0)
            centroid_similarity = cosine_similarity(sentence_embeddings, [cluster_centroid])
            closest_index = np.argmax(centroid_similarity)
            representative_by_cluster[cluster_id] = cluster_sentences[closest_index]
    salient_sents[prod] = representative_by_cluster
        


#sent_clust

100%|██████████| 2041/2041 [13:13<00:00,  2.57it/s]


In [27]:
#demo
# Show the selected sentence of every cluster for one example product.
demo_product_id = 'B002TR0LUG'
salient_sents[demo_product_id]

{0: ' and is really light .\nOne of my favorite things is ',
 1: " and gives extra room for the charger and a portable mouse .\nIt 's constructed well ",
 2: ' and also has a sturdy handle and shoulder strap .\nLooks great too .',
 3: " left over in the computer compartment .\nI 've been able to fit in about three extra notebooks in there , ",
 4: " I 've been able to stop carrying around paper and books ",
 5: ' case is not durable .\nThe inside fabric lining ',
 6: 'If you have seen this at a retail store and then order from Amazon ',
 7: ' so I wanted a smaller bag .\nThis is the perfect size '}

In [28]:
# Persist the selected sentence of every cluster for every product.
# Opened write-only ('w'): nothing is read back from the handle, and this
# matches how the sentence-cluster file is written earlier in the notebook.
with open('salient_sents_raw_exc.json','w') as jfile:
    json.dump(salient_sents,jfile,indent=4)