In [2]:
import pandas as pd

dpath = '../data/sample_input.json'

df = pd.read_json(dpath, lines=True)
df

Unnamed: 0,reviewText
0,I always get a half size up in my tennis shoes...
1,Put them on and walked 3 hours with no problem...
2,excelente
3,The shoes fit well in the arch area. They are ...
4,Tried them on in a store before buying online ...
...,...
364,Favorite Nike shoe ever! The flex sole is exce...
365,"I wear these everyday to work, the gym, etc."
366,"Love these shoes! Great fit, very light weight."
367,Super comfortable and fit my small feet perfec...


In [3]:
df_reviews = df.loc[df.astype('str').drop_duplicates().index]
df_reviews

Unnamed: 0,reviewText
0,I always get a half size up in my tennis shoes...
1,Put them on and walked 3 hours with no problem...
2,excelente
3,The shoes fit well in the arch area. They are ...
4,Tried them on in a store before buying online ...
...,...
364,Favorite Nike shoe ever! The flex sole is exce...
365,"I wear these everyday to work, the gym, etc."
366,"Love these shoes! Great fit, very light weight."
367,Super comfortable and fit my small feet perfec...


In [4]:
import spacy
import pytextrank
from spacy.language import Language
from spacy_readability import Readability
from importlib.util import find_spec

@Language.component("readability")
def readability(doc):
    read = Readability()
    doc = read(doc)
    return doc

pipe_name = 'en_core_web_md'

if find_spec(pipe_name) is None:
    spacy.cli.download(pipe_name)

nlp = spacy.load(pipe_name)
nlp.add_pipe("textrank", last=True)
nlp.add_pipe("readability", last=True)

<function __main__.readability(doc)>

In [5]:
def apply_ranking(doc, trt):
    results = []
    for phrase in doc._.phrases:
        if phrase.rank >= trt:
            results.append((phrase.text, phrase.rank))
    return results

def apply_readability(doc):
    return doc._.flesch_kincaid_reading_ease

In [7]:
scores = []
docs = nlp.pipe(texts=df_reviews['reviewText'].astype('str'))
for doc in docs:
    scores.append([apply_ranking(doc, 0), apply_readability(doc)])
print('Size:', len(scores))
print('Sample:', scores[0])

Size: 369
Sample: [[('my tennis shoes', 0.12971597081854963), ('the heel area', 0.11278192619619307), ('a half', 0.053635551595810474), ('some reason', 0.04734609155372741), ('I', 0.0), ('these', 0.0)], 92.43000000000002]


In [8]:
df_reviews['ranks'] = [p[0] for p in scores]
df_reviews['n_tokens'] = [len(p[0]) for p in scores]
df_reviews['readability'] = [p[1] for p in scores]
df_reviews.sort_values(by='n_tokens', ascending=False)

Unnamed: 0,reviewText,ranks,n_tokens,readability
313,Favorite training and walking-around sneakers....,"[(gym use, 0.09606191299609641), (flex supreme...",52,70.526224
347,I am a recess aide and on my feet all day long...,"[(hip pain, 0.13045310873361876), (time, 0.111...",43,87.037802
318,I LOVE the look and comfort of these shoes for...,"[(major back issues, 0.16276363627717572), (ot...",37,90.683676
64,"Super light-weight, decent arch support (mine ...","[(Shoes, 0.15014668465326747), (shoe, 0.150146...",35,65.562910
57,Very good overall.\nNow Ive been a huge Sketch...,"[(Walt Disney World, 0.12001095576359547), (So...",35,78.244199
...,...,...,...,...
249,Cute and comfortable,[],0,34.590000
198,Very comfortable.,[],0,-48.995000
101,Super comfy!,[],0,35.605000
346,Nice looking and fit nice,[],0,100.240000


In [9]:
import gensim.downloader as api

stopwords = list(nlp.Defaults.stop_words)

model_name = 'word2vec-google-news-300' 

# HACK: Temporary fix -@jiqi at 11/18/2022, 10:23:14 AM
# This line checks if the model file is ready and then load it.
# Should check why it takes so long (~32 secs)
model = api.load(model_name) 

In [10]:
tokens = []
for phrases_rank in list(df_reviews['ranks']):
    for phrase in phrases_rank:
        phrase = phrase[0].lower().split()
        phrase = filter(lambda t: t not in stopwords, phrase)
        phrase = ' '.join(phrase)
        if phrase: tokens.append(phrase)
tokens

['tennis shoes',
 'heel area',
 'half',
 'reason',
 'problem',
 'light feeling',
 '3 hours',
 'room',
 'arch area',
 'toe area',
 'shoes',
 'shoe',
 'lot',
 'planks',
 'durable cross training shoe',
 'good',
 'push-ups',
 'rigorous training',
 'grips',
 'feet',
 'purchase',
 'bottoms',
 'store',
 '2nd',
 '2nd pair',
 'son',
 'person',
 'color',
 'fit fine',
 'fitting',
 'amazon',
 'shoe size',
 'lunges',
 'size',
 'local store',
 'heel',
 'shoe',
 'color',
 'light',
 'padding',
 'lots',
 'shoes',
 'feet',
 'shoes',
 'good support',
 'light weight',
 'pair',
 '9-9.5 womens',
 '9',
 '9.5',
 'high fashion',
 'yoga pants',
 'nikes',
 'nyc',
 'shoes',
 'trip',
 'nike shoes',
 'nike',
 'pair',
 'shoe',
 'best tennis shoes',
 'money',
 'shoes',
 'box',
 'life',
 '10 pairs',
 '10',
 'stains',
 'fit',
 'color pattern',
 'weight lifting',
 'cardio classes',
 'stairmaster',
 'lightweight versatile shoe',
 'treadmill',
 'workout',
 'shoes',
 'gym',
 '3-4',
 'consideration',
 'heavy running',
 'qua

In [11]:
import itertools

combis = list(itertools.combinations(tokens, 2))
# dists = list(itertools.starmap(model.wmdistance, combis))
# dists

In [48]:
# to test if itertools.starmap is faster than for-loop in this case
# result shows that the for-loop is even faster
dists_for_loop = []
for combi in combis:
    dists_for_loop.append(model.wmdistance(combi[0], combi[1]))