In [0]:
import tensorflow as tf
import tensorflow_hub as hub

In [0]:
import json
import spacy
import numpy as np
import pandas as pd
import gzip
from sklearn.mixture import GaussianMixture
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Args:
        path: Location of the gzip file; it is opened in binary mode.

    Yields:
        The object obtained by evaluating each line as a Python expression.
    """
    # The context manager closes the file handle once the generator is
    # exhausted or closed; previously the handle was never closed explicitly.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. Only use this on trusted dumps; if the file is
            # strict JSON, json.loads would be the safe alternative.
            yield eval(l)

def getDF(path):
    """Build a DataFrame holding one row per record produced by ``parse(path)``.

    Rows are labelled 0, 1, 2, ... in the order the records are yielded.
    """
    # Key every record by its position; orient='index' turns each key into a
    # row label and each record's fields into columns.
    records_by_row = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_row, orient='index')

In [0]:
# Load the review dump into a DataFrame (one row per record yielded by parse).
df = getDF('reviews_CDs_and_Vinyl_5.json.gz')

# Baseline Experiments
## Review level

### Get the review text for the product with the most reviews

In [0]:
# Group the reviews by product id (the `asin` column).
products = df.groupby('asin')
# size() counts the rows of every group without materialising each group's
# DataFrame in a Python list. idxmax() returns the first key holding the
# largest count, which is the same tie-breaking as max() over the groups.
popular_product = products.get_group(products.size().idxmax())
# Review texts of that product as a plain Python list.
text = popular_product['reviewText'].tolist()

### Get the embeddings

In [10]:
# TF-Hub URL of the Universal Sentence Encoder, version 2.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
# Instantiate the hub module; the first run downloads it into the TF-Hub cache.
embed = hub.Module(module_url)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
INFO:tensorflow:Downloaded https://tfhub.dev/google/universal-sentence-encoder/2, Total size: 993.27MB
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/universal-sentence-encoder/2'.
Instructions for updating:
Colocations handled automatically by placer.


In [11]:
# Embed every review in `text` with the hub module.
with tf.Session() as session:
  # Run the variable and lookup-table initializers before evaluating the module.
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # Evaluate the module on the whole list at once; one embedding row per review.
  message_embeddings = session.run(embed(text))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [12]:
# Display the review embedding matrix.
message_embeddings

array([[-0.00498599,  0.05155576, -0.05256446, ..., -0.01323828,
        -0.0153173 , -0.02197886],
       [ 0.03496033,  0.00148476, -0.03304245, ...,  0.02228946,
        -0.05228505, -0.04375076],
       [ 0.01388276,  0.0456822 , -0.04915727, ...,  0.01008371,
        -0.0022106 , -0.00903768],
       ...,
       [-0.00988192,  0.04619836, -0.05654455, ..., -0.01586402,
        -0.01399377, -0.05667837],
       [-0.00270829,  0.03226892, -0.03778088, ...,  0.00980518,
        -0.04636449, -0.01610075],
       [ 0.03276125,  0.03607516, -0.01833413, ...,  0.05066476,
        -0.01203018, -0.02806881]], dtype=float32)

### Cluster

In [0]:
# Fit a 5-component Gaussian mixture with full covariance matrices to the
# review embeddings. random_state is pinned because the mixture's
# initialisation is randomised: without a seed every run produces different
# clusters and therefore different extracted reviews.
gmm = GaussianMixture(n_components=5, covariance_type='full', random_state=0).fit(message_embeddings)

In [17]:
# Display the fitted component means (one row per mixture component).
gmm.means_

array([[ 0.0254046 ,  0.01624447, -0.01004937, ...,  0.02196262,
        -0.02832562, -0.01827429],
       [ 0.01906499,  0.04080193, -0.02449171, ...,  0.01928638,
        -0.0036682 , -0.00419977],
       [ 0.02626117,  0.02270613, -0.00379469, ..., -0.00520202,
        -0.01269296, -0.0205925 ],
       [ 0.00029728,  0.03857949, -0.04102213, ..., -0.00283053,
        -0.00977061, -0.02468293],
       [-0.0184448 ,  0.02326936, -0.03310357, ..., -0.01294109,
        -0.02420212, -0.02919825]])

### Extract

In [0]:
# Cosine similarity between every review embedding and every mixture mean.
sim = cosine_similarity(X=message_embeddings, Y=gmm.means_)

In [0]:
# For each mixture component (column of `sim`), the index of the most similar review.
extractive_means = np.argmax(sim, axis=0)

In [26]:
# Print the representative review of each cluster, followed by a blank line.
for review_idx in extractive_means:
  print(text[review_idx], end='\n\n')

For as long as it's been since the last album, you would think this album would just blow you over. It doesn't even come close. I love Metallica, old and new, but this album is just a blistering mess. First of all, I think the hearing loss must be getting to the guys because this album is mixed horribly. The drums sound like Lars recorded them while playing in a tin can, and the guitars are so muddy and indistinct that you can discern what the hell is going on. The lyrics are horrendous, and all the songs are unbelievably repetetive. Yeah, they're all 10 minutes long, but after 3, you've heard everything, and it just repeats, and repeats, and repeats. I guess the truly die hard fans might love this new sound, but I couldn't be more disappointed. There are no epics, no great songs like Metallica of old, nor anything catchy or intricate like their newer stuff. This is just garbage....beware.

I am a long time Metallica fan. Even though Load and Reload were disppointing, I respected the e

## Sentence Level

In [0]:
# Request GPU execution from spaCy; this is called before the pipeline is loaded.
spacy.prefer_gpu()
# Load the 'en_core_web_sm' pipeline, used below for sentence segmentation.
nlp = spacy.load('en_core_web_sm')

### Perform sentence segmentation

In [0]:
# Split every review into sentences and collect them in one flat list.
# nlp.pipe streams the reviews through the pipeline in batches instead of
# invoking nlp(review) once per review; docs are yielded in input order, so
# `sentences` keeps the same ordering as the per-review loop it replaces.
sentences = [sent.text for doc in nlp.pipe(text) for sent in doc.sents]

In [32]:
# Preview the first ten extracted sentences.
sentences[:10]

['Without a doubt, Metallica are the godfathers of thrash metal.',
 "More so than any modern band, Metallica has been influential in shaping the course and growth of metal from their early days back in the 1980's to the present.",
 'Perhaps the best compliment that can be paid to Metallica is the fact that, at 40+ years-of-age, they have been able to remain relevant in an industry (music) that is dominated and defined by the interests of the youth.',
 'Metallica\'s early career was defined by four albums which could stand as the defining elements of the thrash metal genre (i.e., "Kill em All" through "...',
 'And Justice for All").',
 'Following "...',
 'And Justice for All," Metallica entered a period of producing music that, although well written, seemed to abandon their thrash metal roots.',
 'As any true fan of Metallica and thrash metal know, Metallica endured some fairly harsh criticism for "softening up."',
 'However, with "St. Anger" Metallica reposition themselves firmly withi

### Embed, cluster, and extract as at the review level

In [33]:
# Embed every sentence in `sentences` with the hub module.
# NOTE(review): this empty-list placeholder is overwritten inside the session
# block below; it only matters if that block fails before the assignment.
review_sentence_embeddings = []
with tf.Session() as session:
  # Run the variable and lookup-table initializers before evaluating the module.
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  # Evaluate the module on the whole list at once; one embedding row per sentence.
  review_sentence_embeddings = session.run(embed(sentences))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [0]:
# Fit a 20-component Gaussian mixture with full covariance matrices to the
# sentence embeddings. random_state is pinned because the mixture's
# initialisation is randomised: without a seed every run produces different
# clusters and therefore different extracted sentences.
gmm = GaussianMixture(n_components=20, covariance_type='full', random_state=0).fit(review_sentence_embeddings)

In [0]:
# Cosine similarity between every sentence embedding and every mixture mean.
sim = cosine_similarity(X=review_sentence_embeddings, Y=gmm.means_)
# For each mixture component (column of `sim`), the index of the closest sentence.
extractive_means = np.argmax(sim, axis=0)

In [40]:
# Print the representative sentence of each cluster, followed by a blank line.
for sentence_idx in extractive_means:
  print(sentences[sentence_idx], end='\n\n')

It's not like anything they've done before.

Backing up the awful drums are a whole lot of awful guitar noises.

The new Metallica isn't as awesome as the old Metallica

I mean it is terrible!

, then it goes into a nice little riff.

However, "St. Anger" is just awful.

death the grunde riffs and the bess

SO

I applaud Metallica's approach of including a bonus DVD for the price of one CD.  

I mean, really.

The lyrics are horrendous, and all the songs are unbelievably repetetive.

I have always been a huge Metallica fan and while I found parts of this album to be refreshing and almost true to the old school Metallica albums, it was lacking something.

In addition, Lars Ulrich sounds like he's playing the EXACT same beat on every song.

THIS ALBUM IS AT BEST:

"St.

For those fans who have shyed away from Metallica because of recent disagreements about their creative process, or the belief that the guys were in some way &quot;selling-out&quot; in the industry, this album should be a 