In [0]:
import tensorflow as tf
import tensorflow_hub as hub

In [0]:
import json
import spacy
import numpy as np
import pandas as pd
import gzip
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
def parse(path):
    """Yield one parsed record per line of a gzip-compressed file.

    Args:
        path: Location of the gzip file; it is opened in binary mode.

    Yields:
        The object obtained by evaluating each line as a Python expression.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed; previously the handle was never closed explicitly.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only run this on files from a trusted source.
            yield eval(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Each record is keyed by its 0-based position in the stream; that
    position becomes the row label of the returned frame.
    """
    # enumerate() supplies the running row number that keys each record.
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')

In [9]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored there can be read through ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# The reviews archive is read from the Google Drive folder mounted at
# /content/drive and loaded into a DataFrame with getDF.
df = getDF('/content/drive/My Drive/reviews_CDs_and_Vinyl_5.json.gz')

# Baseline Experiments
## Review Level

### Get the review text of the product with the most reviews

In [0]:
# Group the reviews by product id (the `asin` column).
products = df.groupby('asin')
# size() counts the rows of each group without building a DataFrame per group,
# and idxmax() returns the asin with the largest count (the first one in group
# order when several products tie, as the earlier max()-based scan did).
popular_product = products.get_group(products.size().idxmax())
# Plain Python list holding the review text of that product.
text = popular_product.reviewText.values.tolist()

### Get the embeddings

In [21]:
# TF-Hub handle of the Universal Sentence Encoder module (version 2).
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
# Instantiate the module; later cells call `embed` on lists of text.
embed = hub.Module(module_url)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
INFO:tensorflow:Downloading https://tfhub.dev/google/universal-sentence-encoder/2: 908.00MB
INFO:tensorflow:Downloaded https://tfhub.dev/google/universal-sentence-encoder/2, Total size: 993.27MB
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
Instructions for updating:
Colocations handled automatically by placer.


In [22]:
# Evaluate the encoder on the review texts inside a TensorFlow session.
with tf.Session() as session:
  # Variables and lookup tables must be initialised before the module is run.
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # One embedding row per entry of `text` (displayed in the next cell).
  message_embeddings = session.run(embed(text))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [23]:
# Inspect the review embeddings computed in the previous cell.
message_embeddings

array([[-0.00498599,  0.05155576, -0.05256446, ..., -0.01323828,
        -0.0153173 , -0.02197886],
       [ 0.03496033,  0.00148476, -0.03304245, ...,  0.02228946,
        -0.05228505, -0.04375076],
       [ 0.01388276,  0.0456822 , -0.04915727, ...,  0.01008371,
        -0.0022106 , -0.00903768],
       ...,
       [-0.00988192,  0.04619836, -0.05654455, ..., -0.01586402,
        -0.01399377, -0.05667837],
       [-0.00270829,  0.03226892, -0.03778088, ...,  0.00980518,
        -0.04636449, -0.01610075],
       [ 0.03276125,  0.03607516, -0.01833413, ...,  0.05066476,
        -0.01203018, -0.02806881]], dtype=float32)

### Cluster

In [0]:
# Fit a 5-component Gaussian mixture with full covariance matrices to the
# review embeddings. random_state is fixed so the initialisation, and hence the
# fitted clusters and the reviews extracted from them, are reproducible across
# kernel restarts.
gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=0).fit(message_embeddings)

In [25]:
# Inspect the means of the fitted mixture components.
gmm.means_

array([[ 0.02396342,  0.01456812, -0.00786535, ...,  0.00964799,
        -0.02902046, -0.0208292 ],
       [ 0.00492165,  0.04102638, -0.04064649, ..., -0.00031299,
        -0.00538904, -0.02168997],
       [-0.02148044,  0.0215514 , -0.03355961, ..., -0.01495544,
        -0.02352742, -0.03041917],
       [ 0.03200406,  0.03405391, -0.01268487, ...,  0.01913169,
        -0.00155534, -0.01288623],
       [ 0.00500178,  0.03959957, -0.02983416, ...,  0.00997102,
        -0.01566399, -0.00643891]])

### Extract

In [0]:
# Pairwise cosine similarity between every review embedding (rows) and every
# mixture component mean (columns).
sim = cosine_similarity(message_embeddings, gmm.means_)

In [0]:
# For each column of `sim` (one per component mean), take the row index of the
# largest similarity, i.e. the review closest to that mean.
extractive_means = sim.argmax(axis=0)

In [28]:
# Print the review selected for each component, separated by a blank line.
for i in extractive_means:
  print(text[i] + '\n')

I.v been a Metallica fan since Ride the Lightning, and this is by far the worst Metallica recording I,v ever heard. The entire album sounds like it was recorded inside of a tin can. The vocals are faint, the music itself is muddy and rushed. I got not more than four tracks into the CD when I wanted to return it to the store and get my money back. Sounds like theyve been listening to too much System of a Down.(which to me was another disappointment along with the latest Godsmack)But I digress. I have not talked to one single person who tinks this album is anything but a peice of ... . Try again guys. Do it yurself and let Bob Rock go back to Bon Jovi. This album [is not good]!

Let's start with the fact that this album sounds bad, before we even touch on the fact that it is, in fact, the worst Metallica record ever produced. Metallica latest effort tries to reclaim the glory years, unfortunately the group forgot those years. The one thing on the album that annoyed me the most was the sn

## Sentence Level

In [0]:
# Ask spaCy to use a GPU when one is available, then load the small English
# pipeline that is used below for sentence segmentation.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')

### Perform sentence segmentation

In [0]:
# Split every review into sentences with spaCy and collect the sentence texts
# in review order. nlp.pipe() streams the reviews through the pipeline in
# batches instead of invoking nlp() once per review; the resulting sentences
# and their order are unchanged.
sentences = [sent.text for doc in nlp.pipe(text) for sent in doc.sents]

In [31]:
# Preview the first ten segmented sentences.
sentences[:10]

['Without a doubt, Metallica are the godfathers of thrash metal.',
 "More so than any modern band, Metallica has been influential in shaping the course and growth of metal from their early days back in the 1980's to the present.",
 'Perhaps the best compliment that can be paid to Metallica is the fact that, at 40+ years-of-age, they have been able to remain relevant in an industry (music) that is dominated and defined by the interests of the youth.',
 'Metallica\'s early career was defined by four albums which could stand as the defining elements of the thrash metal genre (i.e., "Kill em All" through "...',
 'And Justice for All").',
 'Following "...',
 'And Justice for All," Metallica entered a period of producing music that, although well written, seemed to abandon their thrash metal roots.',
 'As any true fan of Metallica and thrash metal know, Metallica endured some fairly harsh criticism for "softening up."',
 'However, with "St. Anger" Metallica reposition themselves firmly withi

### Proceed as before

In [32]:
# Start from an empty placeholder; it is overwritten below once the session
# run succeeds.
review_sentence_embeddings = []
# Evaluate the encoder on the individual sentences inside a TensorFlow session.
with tf.Session() as session:
  # Variables and lookup tables must be initialised before the module is run.
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # One embedding row per entry of `sentences`.
  review_sentence_embeddings = session.run(embed(sentences))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# Fit a 10-component Gaussian mixture with full covariance matrices to the
# sentence embeddings (this rebinds `gmm`). random_state is fixed so the
# initialisation, and hence the fitted clusters and the sentences extracted
# from them, are reproducible across kernel restarts.
gmm = GaussianMixture(n_components=10, covariance_type='full', random_state=0).fit(review_sentence_embeddings)

In [0]:
# Pairwise cosine similarity between every sentence embedding (rows) and every
# component mean of the sentence-level mixture (columns).
sim = cosine_similarity(review_sentence_embeddings, gmm.means_)
# For each column (component mean), the row index of the most similar sentence.
extractive_means = sim.argmax(axis=0)

In [38]:
# Print the sentence selected for each component, separated by a blank line.
for i in extractive_means:
  print(sentences[i] + '\n')

THIS ALBUM IS AT BEST:

Still, a lot of fans voiced their displeasure about the Load/Reload albums, so Metallica made an album that "went back to their roots".

And thats just what you get with St. Anger.

St. Anger is better than all of Metallica's albums combined.

It's not like anything they've done before.

The lyrics are horrendous, and all the songs are unbelievably repetetive.

I bought this CD the day it came out thinking, this is Metallica-

The new Metallica isn't as awesome as the old Metallica

The snare sounds like Lars is banging on a garbage can, and there's NOT ONE GUITAR SOLO on the record.  

That is all.

