In [1]:
import pandas as pd

# Display settings: never truncate columns or rows when a frame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# The dataset is read as line-delimited JSON (lines=True).
fpath = "../example/data/data_processed_1prod_full.json"
df = pd.read_json(fpath, lines=True)
# Keep only the review text column, cast to str.
docs = df["reviewText"].astype(str)

Arguments are divided into chunks by dependency parsing.

The current rule is to split sentences by conjunctions connecting two verb phrases.

In [14]:
from orangecontrib.argument.miner.topic import ArguTopic, chunker

# Split the reviews into chunks. The displayed frame has one row per chunk,
# with a doc_id column alongside the chunk text.
chunks = chunker(docs)
chunks.head()

Unnamed: 0,doc_id,chunk
0,0,I always get a half size up in my tennis shoes .
1,0,For some reason these feel to big in the heel ...
2,1,walked 3 hours with no problem
3,1,Put them on and !
4,1,Love them !


In [34]:
# Worked example: print one review, then show the chunks extracted from it.
print(docs.loc[282])
chunks.loc[chunks["doc_id"].eq(282)]

Good for non-aerobic workout. Mesh was very comfortable but no support for lateral movrment.


Unnamed: 0,doc_id,chunk,topic
840,282,Good for non - aerobic workout .,8
841,282,no support for lateral movrment,4
842,282,Mesh was very comfortable but .,6


In [3]:
# Build the topic model without passing any arguments.
topic_model = ArguTopic()

In [4]:
# Fit on the chunk texts. `topics` is later stored as a per-chunk column and
# `probs` is later indexed per chunk, so both are presumably aligned with the
# rows of `chunks` — TODO confirm against ArguTopic.
topics, probs = topic_model.fit_transform_reduce_outliers(chunks['chunk'])

Decisions made:
- outliers are reduced as much as possible without affecting the topic modeling result too much
- lemmatization is absent but will be added
- part-of-speech (POS) patterns are used as the representation model; see the patterns in the source code.

In [7]:
# Show the model's topic summary table.
topic_model.get_topic_table()

Unnamed: 0,Topic,Count,Name,keyword,keyword_scores
0,-1,27,-1_excelente_happened_crosstrainers_intended,"[excelente, happened, crosstrainers, intended,...","[1.1161912014785589, 0.9126471656932069, 0.912..."
1,0,115,0_true_expected_fit_fits,"[true, expected, fit, fits, glove, perfect, qu...","[0.4748237829513729, 0.44316708407464783, 0.43..."
2,1,98,1_sneakers_insole_breathable_training,"[sneakers, insole, breathable, training, runni...","[0.3167925914515192, 0.282263789047336, 0.2822..."
3,2,89,2_half_ordered_order_size,"[half, ordered, order, size, big, large, usual...","[0.46720440366916216, 0.44428569020814185, 0.4..."
4,3,73,3_favorite_turned_liked_absolute,"[favorite, turned, liked, absolute, love, easy...","[0.6941245718589287, 0.6189241039805948, 0.593..."
5,4,63,4_issues_roomy_cushion_bad,"[issues, roomy, cushion, bad, attractive, styl...","[0.5020137385990797, 0.4666332521585905, 0.460..."
6,5,43,5_lightweight_paper_minimalist_held,"[lightweight, paper, minimalist, held, light, ...","[0.6236381038603368, 0.6123515611372359, 0.612..."
7,6,43,6_mesh_felt_summer_comfy,"[mesh, felt, summer, comfy, tight, requires, s...","[0.6627948941118658, 0.5411697648986926, 0.541..."
8,7,46,7_hurt_pain_problems_hip,"[hurt, pain, problems, hip, weird, discomfort,...","[0.5751447174832408, 0.5556388510582352, 0.479..."
9,8,49,8_everyday_use_class_zumba,"[everyday, use, class, zumba, aerobics, assume...","[0.49385417778843194, 0.43626748281915195, 0.4..."


In [15]:
# Attach each chunk's assigned topic, then inspect the chunks that landed in topic 25.
chunks["topic"] = topics
chunks[chunks["topic"] == 25]

Unnamed: 0,doc_id,chunk,topic
148,47,I want to return for a refund .,25
149,47,Need instructions for returning as no paperwor...,25
273,79,Returned and and,25
274,80,I had to return the first pair and buy a 1/2 s...,25
288,84,Ca n't return them because I wore them several...,25
377,122,I returned them ... found a Ryka pair I liked ...,25
473,155,"So i had to return it , but",25
478,155,I was refunded instantly upon returning so .,25
600,206,Returned them for the second time .,25
762,255,Returning these .,25


In [6]:
from orangecontrib.argument.miner.topic import merger

# Combine the chunk-level topics back to document level.
# NOTE(review): the trailing 10 is presumably a per-document keyword limit —
# confirm against merger's signature.
docs_topics = merger(
    docs,
    chunks["doc_id"],
    topics,
    topic_model.get_topics(),
    10,
)
docs_topics.head()

Unnamed: 0,doc,topic,keyword,keyword_scores
0,I always get a half size up in my tennis shoes...,"[2, 10]","[half, ordered, wide, hard, narrow, order, doe...","[0.46720440366916216, 0.44428569020814185, 0.4..."
1,Put them on and walked 3 hours with no problem...,"[12, 19, 3, 5]","[week, favorite, runner, trail, miles, worn, l...","[0.7408391348156694, 0.6941245718589287, 0.666..."
2,excelente,[-1],[],[]
3,The shoes fit well in the arch area. They are ...,"[21, 10, 10, 24]","[wide, hard, narrow, does, wider, arch, toe, i...","[0.860751990580811, 0.8606794904768273, 0.8594..."
4,Tried them on in a store before buying online ...,"[13, 1, 4, 1, 9]","[loves, daughter, purchase, satisfied, couple,...","[0.9457426496928387, 0.8959407264399921, 0.890..."


In [None]:
import copy
import numpy as np

# Work on a deep copy so that dropping a topic never touches the dict
# returned by the model.
full_topics = copy.deepcopy(topic_model.get_topics())
# Drop the -1 (outlier) topic. A default is passed to pop() because the key
# may be absent when no chunk is left as an outlier; a bare pop(-1) would
# then raise KeyError.
full_topics.pop(-1, None)

size = topic_model.top_n_words

# Each topic maps to a sequence of (word, score) pairs.
# `keywords` is the flat list of words over all topics, in topic order.
keywords = [word for pairs in full_topics.values() for word, _ in pairs]
# `keyword_scores` has one row of scores per topic, in the same topic order.
# NOTE(review): assumes every topic has the same number of words, otherwise
# the array is ragged — confirm.
keyword_scores = np.array(
    [[score for _, score in pairs] for pairs in full_topics.values()]
)

In [None]:
# Merging topics by distribution may decrease the performance though
i = 7
# Scale each topic's keyword scores by that topic's probability for chunk i.
# NOTE(review): assumes probs[i] has one entry per row of keyword_scores
# (same topics, same order) — confirm.
result = keyword_scores * probs[i][:, np.newaxis]
df_doc_keyword = pd.DataFrame({
    "keyword": keywords,
    "score": result.flatten()
})
# Keep the maximum weighted score per keyword, then rank descending.
df_doc_keyword = df_doc_keyword.groupby('keyword', as_index=False).max().sort_values(
    by="score", ascending=False).reset_index(drop=True)
# Look the chunk up in `chunks`, not `docs`: `docs` is a Series of review
# strings, so docs.loc[i]['doc_id'] fails on a fresh run, while `chunks`
# carries the doc_id and chunk columns for row i.
print(chunks.loc[i, 'doc_id'])
print(chunks.loc[i, 'chunk'])
print("Topic %i" % topics[i])
for w in topic_model.get_topic(topics[i]):
    print(w)
# Top ten keywords after re-weighting.
df_doc_keyword.loc[0:9]

3
The shoes fit well in the arch area .
Topic 22
('arch', 0.8003347630850949)
('support', 0.6964831108214038)
('outside', 0.6054996839248983)
('decent', 0.549647492998692)
('kind', 0.5102821650788569)
('twinge', 0.4961268225201613)
('ripped', 0.4961268225201613)
('flimsy', 0.4961268225201613)
('higher', 0.4961268225201613)
('provided', 0.4961268225201613)


Unnamed: 0,keyword,score
0,grommets,0.016436
1,true,0.014916
2,laces,0.014784
3,photo,0.014784
4,dark,0.01342
5,left,0.012459
6,coming,0.012459
7,shown,0.012459
8,lighter,0.012113
9,squeaking,0.012113
