In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

fpath = '../example/data/data_processed_1prod_full.json'
df = pd.read_json(fpath, lines=True)
docs = df['reviewText'].astype(str)
docs

0      I always get a half size up in my tennis shoes...
1      Put them on and walked 3 hours with no problem...
2                                              excelente
3      The shoes fit well in the arch area. They are ...
4      Tried them on in a store before buying online ...
5                                      I recommend that!
6      My son likes these, and this is the 2nd pair h...
7                                            Comfortable
8                Fit fine...did not like color in person
9      The shoe is too large. When you do lunges it h...
10     Really great for walking I'm very glad I got t...
11     Love these shoes. My feet feel so much better....
12                                        ok but too big
13           Love these shoes.. they are so comfortable.
14     In really like these. I wear between a 9-9.5 w...
15     Love these shoes!\nSo stylish and comfortable....
16     This shoe is JUST OK. Its not as comfortable a...
17     Best tennis shoes I've h

In [6]:
# split reviews into sentences
import spacy

nlp = spacy.load('en_core_web_md')

temp = []
for doc in docs:
    doc = nlp(doc)
    for sent in doc.sents:
        temp.append(str(sent))
docs = temp
docs

['I always get a half size up in my tennis shoes.',
 'For some reason these feel to big in the heel area and wide.',
 'Put them on and walked 3 hours with no problem!',
 'Love them!',
 'So light feeling',
 'excelente',
 'The shoes fit well in the arch area.',
 'They are a little wider in the toe area of the shoe, you feel like you have a lot of room.',
 'This does not make the shoe uncomfortable, just had to get used to it.',
 'Love the shoe.',
 "Tried them on in a store before buying online so I knew they'd fit good.",
 'Overall I was looking for a durable cross training shoe that would hold up to my rigorous training and these have been great so far.',
 'They are really light and comfortable.',
 "Most importantly for me they have grips on the bottoms so my feet don't slide out from under me while doing planks, push-ups, etc.",
 'Highly satisfied with this purchase.',
 'I recommend that!',
 "My son likes these, and this is the 2nd pair he's worn.",
 'Comfortable',
 'Fit fine...did not

## Constituenty parser

Extract meaningful phrases from sentences.

In [9]:

from constituent_treelib import ConstituentTree, Language

language = Language.English
spacy_model_size = ConstituentTree.SpacyModelSize.Medium
nlp = ConstituentTree.create_pipeline(language, spacy_model_size)

[nltk_data] Downloading package benepar_en3 to /Volumes/Databank/Works
[nltk_data]     pace/orange3-argument/venv/share/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [36]:
i = 664

sentence = docs[i]
sentence

'Mesh was very comfortable but no support for lateral movrment.'

In [122]:
from constituent_treelib import ConstituentTree, Language, LanguageError

language = Language.English
spacy_model_size = ConstituentTree.SpacyModelSize.Medium
nlp = ConstituentTree.create_pipeline(language, spacy_model_size)

def constituent_parser(sentence):
    try:
        tree = ConstituentTree(sentence, nlp)
    except LanguageError:
        return [sentence]
    all_phrases = tree.extract_all_phrases(avoid_nested_phrases=True)
    tags = {'NP', 'VP', 'ADJP', 'ADVP'}.intersection(all_phrases.keys())
    # result = [p for t in all_phrases.keys() for p in all_phrases[t]]
    result = [p for t in tags for p in all_phrases[t]]
    
    def has_child(i, strings):
        conditions = [s in strings[i] for s in strings] 
        conditions.pop(i)
        return any(conditions)
    
    def is_child(i, strings):
        conditions = [strings[i] in s for s in strings] 
        conditions.pop(i)
        return any(conditions)
   
    to_remove = [] 
    for i, p in enumerate(result):
        if is_child(i, result):
            to_remove.append(p)
    for i in to_remove:
        result.remove(i)
    
    return result

constituent_parser(sentence)

[nltk_data] Downloading package benepar_en3 to /Volumes/Databank/Works
[nltk_data]     pace/orange3-argument/venv/share/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


['was very comfortable but no support for lateral movrment']

## Dependency parser

The main idea is to split sentence by conjunctions.

In [59]:
import spacy

def dependency_parser(sentence):
    nlp = spacy.load('en_core_web_md')
    doc = nlp(sentence)    
    seen = set() # keep track of covered words
    chunks = []
    for sent in doc.sents:
        heads = [cc for cc in sent.root.children if cc.dep_ == 'conj']

        for head in heads:
            words = [ww for ww in head.subtree]
            for word in words:
                seen.add(word)
            chunk = (' '.join([ww.text for ww in words]))
            chunks.append( (head.i, chunk) )

        unseen = [ww for ww in sent if ww not in seen]
        chunk = ' '.join([ww.text for ww in unseen])
        chunks.append( (sent.root.i, chunk) )

    chunks = sorted(chunks, key=lambda x: x[0])
    return [c[1] for c in chunks]

dependency_parser(sentence)

['Mesh was very comfortable but .', 'no support for lateral movrment']

## Compare on some cases

With the two parsers, we want to see how they work on different cases.

In [62]:
def compare(sentence):
    result_c = constituent_parser(sentence)
    result_d = dependency_parser(sentence)
    print("Sent: %s" % sentence)
    print("=========constituent_parser=================")
    for i, p in enumerate(result_c):
        print("%i: %s" % (i, p))
    print("=========dependency_parser=================")
    for i, p in enumerate(result_d):
        print("%i: %s" % (i, p))
    

In [124]:
compare(docs[664])

Sent: Mesh was very comfortable but no support for lateral movrment.
0: was very comfortable but no support for lateral movrment
0: Mesh was very comfortable but .
1: no support for lateral movrment


In [125]:
compare(docs[7])

Sent: They are a little wider in the toe area of the shoe, you feel like you have a lot of room.
0: are a little wider in the toe area of the shoe
1: feel like you have a lot of room
0: They are a little wider in the toe area of the shoe , you feel like you have a lot of room .


In [126]:
compare(docs[12])

Sent: They are really light and comfortable.
0: are really light and comfortable
0: They are really light and comfortable .


In [127]:
compare(docs[18])

Sent: Fit fine...did not like color in person
0: Fit fine ... did not like color in person
0: Fit fine ... did not like color in person


In [128]:
compare(docs[22])

Sent: Really great for walking I'm very glad I got these and the color is fun
0: 'm very glad I got these
1: is fun
2: the color
3: Really great for walking
0: Really great for walking I 'm very glad I got these and
1: the color is fun


In [129]:
compare(docs[42])

Sent: The color pattern and fit is what I liked the most what I liked the least is that they are not easy to clean and stains do not come out very easy or at all
0: do not come out very easy or at all
1: are not easy to clean
2: liked the least
3: liked the most
4: The color pattern and fit
0: The color pattern and fit is what I liked the most what I liked the least is that they are not easy to clean and stains do not come out very easy or at all


In [133]:
compare(docs[52])

Sent: The built-in arch support is great and I've had no discomfort after 2 weeks of use.
0: 've had no discomfort after 2 weeks of use
1: is great
2: The built - in arch support
0: The built - in arch support is great and
1: I 've had no discomfort after 2 weeks of use .


In [136]:
compare(docs[129])

Sent: Lightweight, decent arch support and comfortably to wear all day doing regular activity and sprinting.
0: Lightweight , decent arch support and comfortably to wear all day doing regular activity and sprinting .
0: Lightweight , decent arch support and comfortably to wear all day doing regular activity and sprinting .
