In [1]:
import pandas as pd

# Location of the sample input, given relative to the notebook's working directory.
dpath = '../data/sample_input.json'

# lines=True makes pandas read the file as JSON Lines (one JSON record per line).
df = pd.read_json(dpath, lines=True)
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,reviewText
0,I always get a half size up in my tennis shoes...
1,Put them on and walked 3 hours with no problem...
2,excelente
3,The shoes fit well in the arch area. They are ...
4,Tried them on in a store before buying online ...
...,...
364,Favorite Nike shoe ever! The flex sole is exce...
365,"I wear these everyday to work, the gym, etc."
366,"Love these shoes! Great fit, very light weight."
367,Super comfortable and fit my small feet perfec...


In [2]:
# Remove duplicate rows. Duplicates are detected on a string-cast copy of the
# frame; the index labels that survive are then used to select rows from the
# original `df`, so the kept rows retain their original (un-cast) values.
# NOTE(review): presumably the cast to 'str' is there so that every column can
# be compared uniformly regardless of its dtype — confirm.
df_reviews = df.loc[df.astype('str').drop_duplicates().index]
df_reviews

Unnamed: 0,reviewText
0,I always get a half size up in my tennis shoes...
1,Put them on and walked 3 hours with no problem...
2,excelente
3,The shoes fit well in the arch area. They are ...
4,Tried them on in a store before buying online ...
...,...
364,Favorite Nike shoe ever! The flex sole is exce...
365,"I wear these everyday to work, the gym, etc."
366,"Love these shoes! Great fit, very light weight."
367,Super comfortable and fit my small feet perfec...


In [3]:
import spacy
import pytextrank
from spacy.language import Language
from spacy_readability import Readability
from importlib.util import find_spec

@Language.component("readability")
def readability(doc):
    """Pipeline component registered under the name "readability".

    Builds a fresh ``Readability`` object on every call, applies it to ``doc``
    and hands the result back to the pipeline.
    """
    return Readability()(doc)

# Name of the spaCy pipeline package to load.
pipe_name = 'en_core_web_md'

# find_spec returns None when no importable module of that name can be found;
# in that case the pipeline package is downloaded before loading.
# NOTE(review): a package installed into an already-running kernel may require
# a kernel restart before it can be loaded — confirm.
if find_spec(pipe_name) is None:
    spacy.cli.download(pipe_name)

nlp = spacy.load(pipe_name)
# Append the two extra components at the end of the pipeline: "textrank"
# (presumably registered by the `pytextrank` import — confirm) followed by the
# "readability" component defined earlier in this cell.
nlp.add_pipe("textrank", last=True)
# This call is the cell's last expression, so its return value is displayed.
nlp.add_pipe("readability", last=True)

<function __main__.readability(doc)>

In [4]:
def apply_ranking(doc, trt):
    """Collect the phrases of ``doc`` whose rank reaches the threshold ``trt``.

    Args:
        doc: Object exposing ``doc._.phrases``; every phrase must provide
            ``text`` and ``rank`` attributes.
        trt: Inclusive lower bound on ``phrase.rank``.

    Returns:
        A list of ``(text, rank)`` tuples, in the order of ``doc._.phrases``.
    """
    return [
        (candidate.text, candidate.rank)
        for candidate in doc._.phrases
        if candidate.rank >= trt
    ]

def apply_readability(doc):
    """Return the value stored in ``doc._.flesch_kincaid_reading_ease``."""
    extensions = doc._
    return extensions.flesch_kincaid_reading_ease

In [5]:
# Stream every review text through the pipeline and keep, per document, the
# pair [ranked phrases, readability score]. A threshold of 0 keeps every
# phrase whose rank is >= 0.
docs = nlp.pipe(texts=df_reviews['reviewText'].astype('str'))
scores = [
    [apply_ranking(parsed, 0), apply_readability(parsed)]
    for parsed in docs
]
print('Size:', len(scores))
print('Sample:', scores[0])

Size: 369
Sample: [[('my tennis shoes', 0.12971597081854963), ('the heel area', 0.11278192619619307), ('a half', 0.053635551595810474), ('some reason', 0.04734609155372741), ('I', 0.0), ('these', 0.0)], 92.43000000000002]


In [6]:
# Attach the per-review results as columns; each entry of `scores` is a
# [ranked phrases, readability] pair.
# NOTE(review): despite its name, 'n_tokens' holds the number of ranked
# phrases per review, not a token count.
df_reviews['ranks'] = [ranked for ranked, _ in scores]
df_reviews['n_tokens'] = [len(ranked) for ranked, _ in scores]
df_reviews['readability'] = [ease for _, ease in scores]
# Show the reviews with the most ranked phrases first.
df_reviews.sort_values(by='n_tokens', ascending=False)

Unnamed: 0,reviewText,ranks,n_tokens,readability
313,Favorite training and walking-around sneakers....,"[(gym use, 0.09606191299609641), (flex supreme...",52,70.526224
347,I am a recess aide and on my feet all day long...,"[(hip pain, 0.13045310873361876), (time, 0.111...",43,87.037802
318,I LOVE the look and comfort of these shoes for...,"[(major back issues, 0.16276363627717572), (ot...",37,90.683676
64,"Super light-weight, decent arch support (mine ...","[(Shoes, 0.15014668465326747), (shoe, 0.150146...",35,65.562910
57,Very good overall.\nNow Ive been a huge Sketch...,"[(Walt Disney World, 0.12001095576359547), (So...",35,78.244199
...,...,...,...,...
249,Cute and comfortable,[],0,34.590000
198,Very comfortable.,[],0,-48.995000
101,Super comfy!,[],0,35.605000
346,Nice looking and fit nice,[],0,100.240000


In [7]:
import gensim.downloader as api

# Materialise the pipeline's default stop words as a list; it is used later in
# the notebook to filter words out of the ranked phrases.
stopwords = list(nlp.Defaults.stop_words)

# Name of the pretrained embedding model requested from gensim's downloader.
model_name = 'word2vec-google-news-300' 

# HACK: Temporary fix -@jiqi at 11/18/2022, 10:23:14 AM
# This line checks whether the model file is ready and then loads it.
# TODO: find out why loading takes so long (~32 secs).
model = api.load(model_name) 

In [31]:
# Turn every ranked phrase into a list of lower-cased words with stop words
# removed. Phrases made up solely of stop words are dropped.
# A set gives constant-time membership tests instead of scanning the
# stop-word list once per word.
stopword_set = set(stopwords)

tokens = []
for ranked_phrases in df_reviews['ranks']:
    for text, _rank in ranked_phrases:
        # str.split() never yields empty or whitespace-containing items, so the
        # filtered list can be stored directly (no join/split round-trip needed).
        words = [word for word in text.lower().split() if word not in stopword_set]
        if words:
            tokens.append(words)
len(tokens)

1580

In [32]:
from itertools import starmap, combinations

# Materialise every unordered pair of token lists. For n token lists this
# produces n * (n - 1) / 2 pairs, so the list grows quadratically with n.
combis = list(combinations(tokens, 2))
# Peek at the first pair to check its structure.
combis[0]
# dists = np.array(list(starmap(model.wmdistance, combis)))
# dists = np.nan_to_num(dists, nan=0, posinf=100)
# dists[:10]

(['tennis', 'shoes'], ['heel', 'area'])

In [37]:
from multiprocessing import Pool, cpu_count
from joblib import Parallel, delayed
from functools import partial
from itertools import starmap

# FIXME: passing strings as input seems incorrect; each document should be a list of
# strings (words). In the source code of the wmdistance function, the words of the input
# documents are checked against the KeyedVectors model; if they are not found, an inf
# distance is given to that pair. But if a document is passed as a single string rather
# than a list of words, the distance is always computed over its English characters,
# which always yields a value in [0, 1].
# NOTE(review): `tokens`, as built earlier in this notebook, already holds lists of
# words, so this FIXME may be stale — confirm.
dists = list(starmap(model.wmdistance, combis))

In [11]:
# data = np.zeros((len(tokens), len(tokens)))
# data[np.triu_indices(len(tokens), 1)] = dists
# data = data + data.T
# df_matrix = pd.DataFrame(data, index=tokens, columns=tokens)

## Exploring ways to speed up the pairwise wmdistance computation

#### Simple for-loop

In [35]:
# A plain comprehension over the pairs performs about the same as itertools.starmap.
dists_for = [model.wmdistance(first, second) for first, second in combis]

#### Multiprocessing

In [38]:
from multiprocessing import Pool, cpu_count
from functools import partial

def calc_wmdistance(model, doc1, doc2):
    """Module-level wrapper that forwards to ``model.wmdistance``.

    Args:
        model: Object providing a ``wmdistance(doc1, doc2)`` method.
        doc1: First document, passed through unchanged.
        doc2: Second document, passed through unchanged.

    Returns:
        Whatever ``model.wmdistance(doc1, doc2)`` returns.
    """
    distance = model.wmdistance(doc1, doc2)
    return distance

# Bind `model` as the first argument so that each work item only has to
# supply the (doc1, doc2) pair.
# NOTE(review): `job` carries `model` with it; the notebook's prose further
# down attributes the poor speedup to pickling the model for the child
# processes — confirm.
job = partial(calc_wmdistance, model)

# One worker process per CPU reported by cpu_count().
with Pool(processes=cpu_count()) as pool:
    # # async function
    # result = pool.starmap_async(job, combis)
    # data = result.get()

    # sync function
    # data = pool.starmap(job, combis, chunksize=10)
    # starmap unpacks each pair in `combis` into the two remaining arguments of `job`.
    data = pool.starmap(job, combis)
    

: 

: 

#### joblib

It is hard to get the desired speedup by applying multiprocessing or joblib directly to the static version of the function: the model is likely quite large, so pickling it and sending it to the child processes for each calculation dominates the runtime.

In [3]:
from joblib import Parallel, delayed


# The instance-bound method can't be used directly here, so the module-level
# wrapper is used instead. Each task must be wrapped in `delayed(...)`:
# without it the function is called eagerly in the parent process and Parallel
# only receives its return values. Each pair is also unpacked so the wrapper
# gets its (model, doc1, doc2) arguments.
result = Parallel(n_jobs=-2)(
    delayed(calc_wmdistance)(model, doc1, doc2) for doc1, doc2 in combis
)

#### Rewrite the wmdistance function as static

To avoid the pickling overhead described above, one possible solution is to define the model data globally (only the token index and one very large array of vectors are actually needed) and store it in shared memory, so that the child processes can access it directly.

However, this does not work either: the kernel crashed before the computation finished.

In [12]:
# static version of the wmdistance function, with shared memory of storing the
# word index and vectors generated by the gensim model. 

from pyemd import emd
from multiprocessing import Manager, RawArray
import numpy as np
from gensim.corpora.dictionary import Dictionary
from scipy.spatial.distance import cdist

# Shared memory that stores the token index and the vectors.
# Manager-backed list proxy holding the model's vocabulary in index order.
# NOTE(review): membership tests and .index() on this proxy are served by the
# manager process, one call at a time — likely slow for a large vocabulary; confirm.
TOKEN_INDEX = Manager().list(model.index_to_key)
v_shape = model.vectors.shape
# Flat buffer of C floats ('f') with one slot per vector component.
vectors = RawArray('f', v_shape[0] * v_shape[1])
# NumPy view over that buffer, reshaped to the shape of the model's matrix.
# assumes model.vectors is float32 — TODO confirm
TOKEN_VECTORS = np.frombuffer(vectors, dtype=np.float32).reshape(v_shape)
# Copy the model's vectors into the buffer; the matrix now exists twice in memory.
np.copyto(TOKEN_VECTORS, model.vectors)


def init_pool(shared_t_index, shared_t_vectors):
    """Publish the shared lookup structures as module-level globals.

    Args:
        shared_t_index: Token index, stored in the global ``t_index``.
        shared_t_vectors: Token vectors, stored in the global ``t_vectors``.
    """
    global t_index, t_vectors
    t_index, t_vectors = shared_t_index, shared_t_vectors

def calc_wmdistance(doc1, doc2):
    """Compute the distance between two tokenised documents via ``emd``.

    Standalone counterpart of ``model.wmdistance`` that reads the vocabulary
    and the vectors from the ``t_index`` / ``t_vectors`` globals set by
    ``init_pool``.

    NOTE(review): this reuses the name of the three-argument wrapper defined
    earlier in this notebook; whichever definition ran last is the active one.

    Args:
        doc1: Iterable of words.
        doc2: Iterable of words.

    Returns:
        ``float('inf')`` when either document has no word found in
        ``t_index`` or when the distance matrix sums to (almost) zero;
        ``0.0`` when both documents consist of one and the same unique token;
        otherwise the value returned by ``emd``.
    """
    # Keep only the words that are present in the token index.
    known1 = [word for word in doc1 if word in t_index]
    known2 = [word for word in doc2 if word in t_index]

    # Nothing left in at least one document: distance is defined as inf.
    if not (known1 and known2):
        return float('inf')

    dictionary = Dictionary(documents=[known1, known2])
    vocab_len = len(dictionary)

    # Both documents contain the same single unique token.
    if vocab_len == 1:
        return 0.0

    def unit_vectors(words):
        # One length-normalised vector per word, stacked in the order of `words`.
        rows = []
        for word in words:
            vec = t_vectors[t_index.index(word)]
            rows.append(vec / np.linalg.norm(vec))
        return np.array(rows)

    def nbow(words):
        # Relative frequency of every dictionary id occurring in `words`.
        weights = np.zeros(vocab_len, dtype=np.double)
        total = float(len(words))
        for token_id, count in dictionary.doc2bow(words):
            weights[token_id] = count / total
        return weights

    unique1 = list(set(known1))
    unique2 = list(set(known2))
    vecs1 = unit_vectors(unique1)
    vecs2 = unit_vectors(unique2)

    # Fill only the (doc1 ids x doc2 ids) sub-block with pairwise distances.
    rows = dictionary.doc2idx(unique1)
    cols = dictionary.doc2idx(unique2)
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    distance_matrix[np.ix_(rows, cols)] = cdist(vecs1, vecs2)

    # An all-zero distance matrix is reported as inf.
    if abs(np.sum(distance_matrix)) < 1e-8:
        return float('inf')

    return emd(nbow(known1), nbow(known2), distance_matrix)

# # test if the output is the same as the original function
# doc1 = tokens[0]
# doc2 = tokens[1]
# calc_wmdistance(doc1, doc2) == model.wmdistance(doc1.split(' '), doc2.split(' '))

In [15]:
from multiprocessing import Pool, cpu_count

# Leave one CPU free for the parent process, but never request fewer than one
# worker: Pool raises ValueError when `processes` is below 1, which is what
# `cpu_count() - 1` evaluates to on a single-core machine.
n_workers = max(1, cpu_count() - 1)

# Every worker runs init_pool once, which stores the shared token index and
# vectors in the worker's globals read by calc_wmdistance.
with Pool(processes=n_workers, initializer=init_pool, initargs=(TOKEN_INDEX, TOKEN_VECTORS, )) as pool:
    # Pairs are dispatched to the workers in chunks of 10.
    data = pool.starmap_async(calc_wmdistance, combis, chunksize=10)
    # Block until every pair has been processed.
    result = data.get()