In [1]:
import time
import numpy as np
from numpy.linalg import norm
from sklearn.datasets import fetch_20newsgroups
import preprocessor
from preprocessor import read_stopwords, preprocessor
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle,os.path
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from ISLDA import ISLDA
from palmettopy.palmetto import Palmetto
import re
from collections import Counter
from numpy import dot
from numpy.linalg import norm
palmetto = Palmetto("http://localhost:7777/palmetto-webapp/service/")

In [2]:
from sklearn import metrics

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

def cluster_acc(Y_pred, Y):
    from scipy.optimize import linear_sum_assignment
    assert Y_pred.size == Y.size
    D = max(Y_pred.max(), Y.max())+1
    w = np.zeros((D,D), dtype=np.int64)
    for i in range(Y_pred.size):
        w[Y_pred[i], Y[i]] += 1
    ind = linear_sum_assignment(w.max() - w)
    ind = np.asarray(ind)
    ind = np.transpose(ind)
    return sum([w[i,j] for i,j in ind])*1.0/Y_pred.size, w

In [40]:
### SearchSnippets dataset###
data = []
targets = []
with open("data/SearchSnippets.txt", "r") as file:
    for line in tqdm(file.readlines()):
        data.append(line.strip())
with open("data/SearchSnippets_label.txt", "r") as file:
    for line in tqdm(file.readlines()):
        targets.append(line.strip())
print (len(data))
print (len(targets))

100%|██████████| 12295/12295 [00:00<00:00, 1558116.07it/s]
100%|██████████| 12295/12295 [00:00<00:00, 2089927.77it/s]

12295
12295





In [41]:
print ("Loading stopwords...")
stopwords = read_stopwords("./data/stopwords.en.txt")

print ("preprocessing dataset...")
processor = preprocessor(stopwords)

processed_texts = []
labels = []
posts = []
for i in tqdm(range(len(data))):
    temp_text = processor.preprocess(data[i])
    if temp_text == '' or len(temp_text.split())>300 or len(temp_text.split())<3:
        continue
    processed_texts.append(temp_text)
    labels.append(int(targets[i]) - 1)
    posts.append(data[i])
    
print (len(processed_texts))
print (len(labels))

n_features = 5000
vectorizer = CountVectorizer(min_df=3, max_features=n_features)
vectors = vectorizer.fit_transform(processed_texts).toarray()
feature_names = vectorizer.get_feature_names()
    

  0%|          | 0/12295 [00:00<?, ?it/s]

Loading stopwords...
preprocessing dataset...


100%|██████████| 12295/12295 [00:06<00:00, 2044.18it/s]


12276
12276


In [3]:
#### tagmynews dataset ####
data = []
targets = []
with open("data/TagMyNews.txt", "r") as file:
    c = 0
    temp = ''
    for line in tqdm(file.readlines()):
        if c % 8 == 0 or c % 8 == 1:
            temp += line.strip() + ' '
            c += 1
            continue
        elif c % 8 == 2:
            temp = re.sub("u\.s\.", "usa", temp)
            temp = re.sub("\\bu\.s\\b", "usa", temp)
            temp = re.sub("\\bus\\b", "usa", temp)
            data.append(temp.strip())
            temp = ''
            c += 1
        elif c % 8 == 6:
            targets.append(line.strip())
            c += 1
        else:
            c += 1
            continue
            
n_samples = 40000
n_features = 5000
n_top_words = 50

print ("Loading stopwords...")
stopwords = read_stopwords("./data/stopwords.en.txt")

print ("preprocessing dataset...")
processor = preprocessor(stopwords)
processed_texts = []
labels = []
posts = []
for i in tqdm(range(len(data[:n_samples]))):
    temp_text = processor.preprocess(data[i])
    if temp_text == '' or len(temp_text.split())>300 or len(temp_text.split())<3:
        continue
    if (targets[i] != 'us' and targets[i] != 'world'):
        processed_texts.append(temp_text)
        labels.append(["business","entertainment","health","sci_tech","sport"].index(targets[i]))
        posts.append(data[i])

vectorizer = CountVectorizer(min_df=3, max_features=n_features)
vectors = vectorizer.fit_transform(processed_texts[:n_samples]).toarray()
feature_names = vectorizer.get_feature_names()

100%|██████████| 260832/260832 [00:00<00:00, 550068.23it/s]
  0%|          | 0/32604 [00:00<?, ?it/s]

Loading stopwords...
preprocessing dataset...


100%|██████████| 32604/32604 [00:11<00:00, 2909.50it/s]


In [3]:
##################### data part ########################
n_samples = 20000
n_features = 5000
n_components = 16

print ("Loading stopwords...")
stopwords = read_stopwords("./data/stopwords.en.txt")
processor = preprocessor(stopwords)

if not os.path.exists("./data/20news.txt"):
    categories=['alt.atheism', 'comp.graphics',
                'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
                'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 
                'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 
                'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast']

    print ("Loading dataset...")
    corpus = fetch_20newsgroups(shuffle=True, 
                                subset='all',
                                random_state=1,
                                categories=categories, 
                                remove=('headers', 'footers', 'quotes'))
    texts = corpus.data

    print ("preprocessing dataset...")
    labels = []
    processed_texts = []
    posts = []
    for i in tqdm(range(len(texts))):
        temp_text = processor.preprocess(texts[i])
        if temp_text == '' or len(temp_text.split())>300 or len(temp_text.split())<3:
            continue
        processed_texts.append(temp_text)
        posts.append(texts[i])
        labels.append(corpus.target[i])
else:
    processed_texts = []
    labels = []
    with open("./data/20news.txt", 'r') as file:
        for line in file.readlines():
            temp = line.split('\t')
            processed_texts.append(temp[0])
            labels.append(int(temp[1]))  
labels = np.array(labels)
## if SCLDA
# vobs = []
# with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/vobs_20news_expert.txt", "r") as file:
#     for line in file.readlines():
#         vobs.append(line.strip())
# vectorizer = CountVectorizer(vocabulary=vobs)

vectorizer = CountVectorizer(min_df=3, max_features=n_features)
vectors = vectorizer.fit_transform(processed_texts[:n_samples]).toarray()
feature_names = vectorizer.get_feature_names()

Loading stopwords...


In [37]:
import tomotopy as tp
vobs = []
with open('model/searchsnippets/vobs_searchsnippets_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
vobs = set(vobs)   

data = []
for text in processed_texts:
    temp = " ".join([w for w in text.split() if w in vobs])
    if len(temp) != 0:
        data.append(temp)
    
mdl = tp.HLDAModel(depth=3)
for line in data:
    mdl.add_doc(line.strip().split())

for i in range(0, 200, 10):
    mdl.train(16)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

print(mdl.live_k)
for k in range(mdl.k):
    if not mdl.is_live_topic(k): continue
    print('Top 10 words of topic #{}'.format(k))
    print(mdl.level(k), mdl.get_topic_words(k, top_n=10))
# mdl.summary()

# save into file
mdl.save('searchsnippets_HLDAModel.bin')
# mdl = tp.HLDAModel.load('20news_HLDAModel.bin')
# for k in range(mdl.k):
#     if not mdl.is_live_topic(k): continue
#     print('Top 10 words of topic #{}'.format(k))
#     print(mdl.get_topic_words(k, top_n=10))

Iteration: 0	Log-likelihood: -6.659774356422769
Iteration: 10	Log-likelihood: -6.5225018679403535
Iteration: 20	Log-likelihood: -6.470582645460273
Iteration: 30	Log-likelihood: -6.4480222409646
Iteration: 40	Log-likelihood: -6.430928857678623
Iteration: 50	Log-likelihood: -6.417558879261808
Iteration: 60	Log-likelihood: -6.407168300844266
Iteration: 70	Log-likelihood: -6.398663716000969
Iteration: 80	Log-likelihood: -6.391373100349221
Iteration: 90	Log-likelihood: -6.386004912289556
Iteration: 100	Log-likelihood: -6.377747196884279
Iteration: 110	Log-likelihood: -6.373690018609303
Iteration: 120	Log-likelihood: -6.369393586523704
Iteration: 130	Log-likelihood: -6.367079564915631
Iteration: 140	Log-likelihood: -6.361377101544111
Iteration: 150	Log-likelihood: -6.362200303599677
Iteration: 160	Log-likelihood: -6.360234972093361
Iteration: 170	Log-likelihood: -6.354708471668609
Iteration: 180	Log-likelihood: -6.356469936612426
Iteration: 190	Log-likelihood: -6.352964383942973
981
Top 10 w

2 [('paper', 0.05182228237390518), ('formal', 0.044429659843444824), ('url', 0.044429659843444824), ('publication', 0.044429659843444824), ('national', 0.03703703731298447), ('formulation', 0.02964441478252411), ('social', 0.02225179225206375), ('grid', 0.02225179225206375), ('bayesian', 0.02225179225206375), ('alliance', 0.02225179225206375)]
Top 10 words of topic #592
1 [('msnbc', 0.0878901556134224), ('christian', 0.055959127843379974), ('talk', 0.04797637090086937), ('host', 0.039993613958358765), ('msn', 0.039993613958358765), ('buddhist', 0.03201085701584816), ('iraq', 0.03201085701584816), ('heaven', 0.024028098210692406), ('newsweek', 0.024028098210692406), ('jewish', 0.024028098210692406)]
Top 10 words of topic #593
1 [('product', 0.08681044727563858), ('software', 0.0651349276304245), ('drive', 0.0651349276304245), ('mini', 0.05429717153310776), ('tape', 0.04345941171050072), ('development', 0.04345941171050072), ('hardware', 0.03262165188789368), ('google', 0.032621651887893

In [38]:
import tomotopy as tp
mdl = tp.HLDAModel.load('searchsnippets_HLDAModel.bin')

In [39]:
# for k in range(mdl.k):
#     if not mdl.is_live_topic(k): continue
#     print('Top 10 words of topic #{}'.format(k))
#     print(mdl.level(k), mdl.get_topic_words(k, top_n=10))

# mdl.get_topic_word_dist(0) * mdl.get_count_by_topics()[0]
# for k in range(mdl.k):
#     mdl.get_topic_word_dist(k) * mdl.get_count_by_topics()[k] / 

cohesion = []
diversity = []
for q_index, query in enumerate(processed_query):
    sim = []
    K = []
    for k in range(mdl.k):
        if not mdl.level(k) == 1: continue
        topic_embedding = np.zeros(300)
        for word, weight in mdl.get_topic_words(k, top_n=10):
            topic_embedding += gloveModel[word] * weight

        ave = []
        for term in query.split():
            if (norm(topic_embedding)*norm(gloveModel[term])!=0):
                cos_sim = dot(topic_embedding, gloveModel[term])/(norm(topic_embedding)*norm(gloveModel[term]))
            else:
                cos_sim = 0
            ave.append(cos_sim)
        K.append(k)
        sim.append(np.mean(ave))
    print (K[np.argmax(sim)])
    
    k = K[np.argmax(sim)]
    topic_embedding = np.zeros(300)
    for word, weight in mdl.get_topic_words(k, top_n=10):
        topic_embedding += gloveModel[word] * weight
    
    ave_sim = []
    topic_words = []
    for sk in range(mdl.k):
        if not mdl.level(sk) == 2: continue
        if (mdl.parent_topic(sk) == k):
            subtopic_embedding = np.zeros(300)
            for word, weight in mdl.get_topic_words(sk, top_n=10):
                subtopic_embedding += gloveModel[word] * weight
            cos_sim = dot(topic_embedding, subtopic_embedding)/(norm(topic_embedding)*norm(subtopic_embedding))
            ave_sim.append(cos_sim)
            
            temp = []
            for word, weight in mdl.get_topic_words(sk, top_n=25):
                temp.append(word)
            topic_words.append(temp)
    
    
    ave_diff = []
    for topic in topic_words:
        other_topic = [i for i in topic_words if i != topic]
        other_topic = [w for i in other_topic for w in i]
        diff = len(set(topic)-set(other_topic))/25
        ave_diff.append(diff)   
    cohesion.append(np.mean(ave_sim))    
    diversity.append(np.mean(ave_diff))
print (np.mean(cohesion))
print (np.mean(diversity))

356
545
691
15
11
9
409
291
660
763
551
264
182
356
764
692
0.4870348051848284
0.8684444444444445


In [11]:
from numpy import dot
from numpy.linalg import norm

for k in range(mdl.k):
    if not mdl.level(k) == 1: continue
    topic_embedding = np.zeros(300)
    for word, weight in mdl.get_topic_words(k, top_n=10):
        topic_embedding += gloveModel[word] * weight
    
    for sk in range(mdl.k):
        if not mdl.level(sk) == 2: continue
        if (mdl.parent_topic(sk) == k):
            subtopic_embedding = np.zeros(300)
            for word, weight in mdl.get_topic_words(sk, top_n=10):
                subtopic_embedding += gloveModel[word] * weight
            cos_sim = dot(topic_embedding, subtopic_embedding)/(norm(topic_embedding)*norm(subtopic_embedding))
            print (cos_sim)

0.49065932330592915
0.6563466696580466
0.41401363500243893
0.37569441106303597
0.3970092166480613
0.4145107182313185
0.7262963850341775
0.2385140843526004
0.5756882342728038
0.635624480806533
0.33777256810803147
0.5045267816775235
0.3843976528449889
0.4509093464350139
0.38548527682953987
0.49407338755824953
0.6765299671240679
0.5231789941364666
0.5965952763858524
0.3220547522953086
0.38015615707231654
0.5022312176985002
0.613045652104605
0.5956870642521903
0.4651877355779918
0.6795081274858615
0.45958932718885503
0.38939305600508745
0.33664558048925974
0.6137392276704224
0.34811643922173885
0.32416333169734407
0.41886611280570357
0.32460669995872055
0.33416273555122605
0.2015500894481295
0.35605227162379754
0.30882212398334025
0.4355495477168354
0.7291633408321336
0.29460744819930423
0.41882300254477073
0.5929420601428959
0.4712389265462472
0.4468289848620646
0.705970067170414
0.6511670843548915
0.5136668134103185
0.5544738139983262
0.3316263756884725
0.3816924195665175
0.4793631209386

0.5395361645131493
0.41909413498655296
0.6363908803798838
0.602851285103152
0.4317549518519833
0.5975401639243056
0.28893291577127683
0.40884700217351627
0.43175755955405576
0.6332993105674093
0.5916852507161812
0.5863085733684372
0.5206824529052594
0.19978289431195254
0.6145157056621742
0.5084692757608756
0.7126427095837424
0.7669393956531543
0.6862977246661975
0.6426988207851744
0.4089343879067172
0.6312045198187569
0.28190112672153717
0.24003749651174175
0.22059958407600183
0.7443677488384971
0.7253258632003808
0.5570121382334451
0.7220769229938601
0.3859431470941714
0.6818580772907525
0.07942303767406933
0.5508947070109319
0.6217598833225961
0.4971091795507617
0.5962681270077519
0.6868154176331186
0.5954667793521952
0.6165032210915949
0.5616546720862684
0.5788627353350777
0.3273170949109267
0.3546885355317692
0.3583457341341412
0.5392825086527511
0.3524789457831908
0.46681713400428887
0.24917179239033402
0.5033803995294505
0.5543409478567007
0.6616300490751956
0.6010887670733421
0.

0.5755209764063604
0.7196490097012389
0.6783399978680122
0.7395263410953127
0.5886390426159349
0.6894099379110536
0.6151352747013473
0.4769829864666389
0.622063256712797
0.39848917009276885
0.7237405808363127
0.43460339873998594
0.4666298793542369
0.41670631635389555
0.6985863234483918
0.6361560619255695
0.5317289107191512
0.7339437395716065
0.3690426107561798
0.5369172488842284
0.4866072234380292
0.48917696770521374
0.5677968694666167
0.318336485167467
0.4829878819396575
0.4718839804123634
0.6676375204820776
0.28809505177160855
0.44233841993936746
0.5252098018343682
0.5546503022233455
0.7518998498023911
0.657782313471214
0.6185992195347936
0.6077793331274
0.5918282293125529
0.3410378569622916
0.3137965842641358
0.48257183669245785
0.7168390935808122
0.3243873945776547
0.7220636248997594
0.5774876274489946
0.4387609989554418
0.4068457010045367
0.5163810275194485
0.45545791532677204
0.5825894535190644
0.7097114255234142
0.6190244890122738
0.5076505191165419
0.5192538718204195
0.42286984

In [4]:
from collections import defaultdict
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile, 'r', encoding='utf-8')
    model = defaultdict(lambda: np.zeros(300))
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        model[word] = np.array([float(val) for val in splitLine[1:]])
    print ('Done.', len(model), 'words loaded!')
    return model
gloveModel = loadGloveModel('./embedding/glove.6B.300d.txt')

935it [00:00, 9342.46it/s]

Loading Glove Model


400000it [00:36, 11024.51it/s]

Done. 400000 words loaded!





In [45]:
from scipy import spatial

for w in vectorizer.get_feature_names():
    if w in gloveModel.keys() and  1 - spatial.distance.cosine(gloveModel['opportunity'], gloveModel[w]) > 0.5:
        print (w)
# 1 - spatial.distance.cosine(gloveModel['entertainment'], gloveModel['mobile'])
# gloveModel['materi']

advantage
bring
chance
create
enjoy
experience
find
give
giving
good
great
help
hope
hoped
hoping
let
make
moment
offer
opportunity
provide
seek
take
think
wanted
wonderful


In [16]:
##################### Target model part ########################
# feature = 4704
feature = len(feature_names)
target = 5

acc = []
nmi = []
pui = []
clu_acc = []
p_k = []
type_tracker = {}
for i in range(2): 
    print (rf"#######i = {i}#######")
    if i == 0:
        path = 'model/tagmynews/QDMHDP/QDTM-test'
    else:
        path = 'model/tagmynews/QDMHDP/QDTM-test' + str(i+1)

    doc_word = np.zeros((vectors.shape[0], feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
    #         results[int(temp[0])-target, int(temp[-1])] += 1
            doc_word[int(temp[0])-target, int(temp[1])] += 1
            type_tracker[int(temp[-2])] = int(temp[-1])

    topic_word = np.zeros((20, feature))
    with open(path + '_nzw.txt', 'r') as file:
        z = 0
        for line in file.readlines():
            w = 0
            for n in line.strip().split():
                topic_word[z,w] = n
                w += 1
            z += 1
            
    pzw = topic_word/topic_word.sum(0)
    results = (doc_word @ pzw.T)

    temp = results + 1e-20
    X = temp / temp.sum(1).reshape((temp.shape[0], 1))
    Y = labels
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
    print ("acc:", np.mean(scores))
    acc.append(np.mean(scores))
    
    pre = np.argmax(results[:,:target], 1)
    tru = np.asarray(labels)
    print ("numi:", normalized_mutual_info_score(tru, pre))
    print ("pui:", purity_score(tru, pre))
    print ("clu_acc:", cluster_acc(pre, tru)[0])
    nmi.append(normalized_mutual_info_score(tru, pre))
    pui.append(purity_score(tru, pre))
    clu_acc.append(cluster_acc(pre, tru)[0])
    label_count = Counter(labels)
    temp = results + 1e-20
    temp = temp / temp.sum(1).reshape((-1,1))
    tru = np.asarray(labels)
    p_results = []
    for topic_index in range(target):
        top_index = np.argsort(temp[:, topic_index])[::-1][:label_count[topic_index]]
        precision = (tru[top_index] == topic_index).sum()/label_count[topic_index]
        print (precision)
        p_results.append(precision)
    print ("p@k:", np.mean(p_results))
    p_k.append(np.mean(p_results))
    print()

print (rf"#######average#######")
print ("acc:", np.mean(acc))
print ("numi:", np.mean(nmi))
print ("pui:", np.mean(pui))
print ("clu_acc:", np.mean(clu_acc))
print ("p@k:", np.mean(p_k))

#######i = 0#######
acc: 0.8371898910271274
numi: 0.4536120316740999
pui: 0.7265940180848597
clu_acc: 0.7265940180848597
0.7728712502329048
0.819841752891053
0.7412209616423555
0.6464646464646465
0.7645909645909645
p@k: 0.7489979151643849

#######i = 1#######
acc: 0.8279619754231394
numi: 0.500372893193638
pui: 0.7712497101785301
clu_acc: 0.7712497101785301
0.7754797838643562
0.7909312233718807
0.7266342517558076
0.6847788227098572
0.7893772893772893
p@k: 0.7534402742158381

#######average#######
acc: 0.8325759332251335
numi: 0.47699246243386895
pui: 0.7489218641316948
clu_acc: 0.7489218641316948
p@k: 0.7512190946901115


In [17]:
ave = []
for i in range(1,2):
    zw = np.zeros((200, 5000))
    if i == 1:
        path = 'model/tagmynews/QDMHDP/QDTM-test_nzw.txt'
    else:
        path = 'model/tagmynews/QDMHDP/QDTM-test' + str(i) + '_nzw.txt'
    with open(path, 'r') as file:
        z = 0
        for line in file.readlines():
            w = 0
            for n in line.strip().split():
                zw[z,w] = n
                w += 1
            z += 1
    vobs = []
    with open('model/tagmynews/vobs_tagmynews_test.txt', 'r') as file:
        for line in file.readlines():
            vobs.append(line.strip())
    local_coherence = []
    for z in zw[:5]:
        top_words = [re.sub("\$\d*", "", vobs[i]) for i in z.argsort()[:-10 - 1:-1]]
        print (top_words)
        local_coherence.append(palmetto.get_coherence(top_words))
    ave.append(np.mean(local_coherence))
    print ('ave coherence', np.mean(local_coherence))
print (np.mean(ave))

['profit', 'business', 'bank', 'usa', 'sell', 'sale', 'credit', 'price', 'billion', 'stock']
['week', 'show', 'star', 'theater', 'pop', 'film', 'time', 'year', 'sony', 'tv']
['health', 'disease', 'study', 'care', 'risk', 'drug', 'cancer', 'insurance', 'usa', 'people']
['google', 'apple', 'technology', 'ipad', 'china', 'company', 'online', 'intel', 'service', 'network']
['game', 'play', 'final', 'playoff', 'series', 'scored', 'boston', 'sport', 'win', 'team']
ave coherence 0.3987468968550131
0.3987468968550131


In [21]:
searchsnippetsVOBS = 3827
zw = np.zeros((200, 5000))
path = 'model/20news/QDMHDP/QDTM-test2_nzw.txt'
with open(path, 'r') as file:
    z = 0
    for line in file.readlines():
        w = 0
        for n in line.strip().split():
            zw[z,w] = n
            w += 1
        z += 1
vobs = []
with open('model/20news/vobs_20news_test.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
local_coherence = []

count = 0;
tp_index = []
diversity = []
for t in range(target):
    topic_words = []
    for i, z in enumerate(zw):
        percent = (zw.sum(1) / zw.sum())[i]*100
        top_words = [re.sub("\$\d*", "", vobs[i]) for i in z.argsort()[:-10 - 1:-1]]
        if (percent > 0.5 and type_tracker[i] == t):
            print ("{0:.2f}%".format(percent) , type_tracker[i], top_words)
            count += 1
    #         if (type_tracker[i] == 1):
            print (i)
            topic_words.append([re.sub("\$\d*", "", vobs[i]) for i in z.argsort()[:-25 - 1:-1]])
            tp_index.append(i)
            
    ave_diff = []
    for topic in topic_words:
        other_topic = [i for i in topic_words if i != topic]
        other_topic = [w for i in other_topic for w in i]
        diff = len(set(topic)-set(other_topic))/25
        ave_diff.append(diff)
    diversity.append(np.mean(ave_diff))
print (np.mean(diversity))

1.39% 0 ['company', 'phone', 'do', 'technology', 'business', 'number', 'system', 'japanese', 'computer', 'make']
50
3.12% 0 ['government', 'clipper', 'encryption', 'chip', 'system', 'nsa', 'phone', 'people', 'security', 'key']
51
1.81% 0 ['key', 'chip', 'algorithm', 'number', 'clipper', 'encryption', 'de', 'escrow', 'system', 'secret']
52
0.6133333333333333


In [87]:
from numpy import dot
from numpy.linalg import norm


topic_word_QDTM = np.zeros((200, 3827))
path = 'model/searchsnippets/QDMHDP/QDTM_nzw.txt'
with open(path, 'r') as file:
    z = 0
    for line in file.readlines():
        w = 0
        for n in line.strip().split():
            topic_word_QDTM[z,w] = n
            w += 1
        z += 1
topic_word_sub = np.zeros((200, 3827))
path = 'model/searchsnippets/QDMHDP/QDTM-sub_nzw.txt'
with open(path, 'r') as file:
    z = 0
    for line in file.readlines():
        w = 0
        for n in line.strip().split():
            topic_word_sub[z,w] = n
            w += 1
        z += 1
vobs = []
with open('model/searchsnippets/vobs_searchsnippets_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())

overall_topic = topic_word_QDTM[:8] + 0.5
overall_topic = overall_topic / overall_topic.sum(1).reshape((overall_topic.shape[0], 1))
# topic_word_sub
subtopics = topic_word_sub + 0.5
subtopics = subtopics / subtopics.sum(1).reshape((subtopics.shape[0], 1))

query = ['Business', 'Computers', 'Culture Art Entertainment', 'Education Science', 'Car Engineering', 'Health', 'Politics Society', 'Sports']

for i_topic, topic in enumerate(overall_topic):
    topic_embedding = np.zeros(300)
    for index in topic.argsort()[: -10 - 1: -1]:
        topic_embedding += gloveModel[vobs[index]] * topic[index]
    ave = []
    for i_subtopic, subtopic in enumerate(subtopics):
        percent = (topic_word_sub.sum(1) / topic_word_sub.sum())[i_subtopic]*100
        if (percent > 0.5 and type_tracker[i_subtopic] == i_topic):
            subtopic_embedding = np.zeros(300)
            for index in subtopic.argsort()[: -10 - 1: -1]:
                subtopic_embedding += gloveModel[vobs[index]] * subtopic[index]
            
            cos_sim = dot(topic_embedding, subtopic_embedding)/(norm(topic_embedding)*norm(subtopic_embedding))
            ave.append(cos_sim)
#             print (i_topic, i_subtopic, cos_sim)
    print ('& ' + query[i_topic] + ' & %.2f' % np.mean(ave) + ' \\\\')

& Business & 0.73 \\
& Computers & 0.70 \\
& Culture Art Entertainment & 0.55 \\
& Education Science & 0.59 \\
& Car Engineering & 0.68 \\
& Health & 0.66 \\
& Politics Society & 0.51 \\
& Sports & 0.70 \\


In [95]:
for n in tp_index:
    for i in np.argsort(temp[:,n])[::-1][:5]:
    # print ([y for y in np.array(labels)[np.argsort(temp[:, 39])[::-1][:20]]])
    # z[z.argsort()[:-10 - 1:-1]]
        print (re.sub('\n', ' ', posts[i]))

cricket cricket cricket
topix sports horse racing horse racing news topix news horse racing continually updated net
latimes sports horseracing horse racing coverage news horse racing news racing tips horse racing stats horse racing
espn com las espn
topix racing racing news topix news racing continually updated net
maradona diego armando maradona diego armando maradona born buenos argentina family
olympic games index international olympic committee olympic games games games games olympic games ancient olympic games
beijing beijing olympics website games
miniclip games sport sports games miniclip games play games play online games sports games games action games puzzle games flash games games
miniclip miniclip games play games amazing snowboarding game color amazing snowboarding game style
indoor football indoor football
football indoor indoor arena football leagues arena football league national indoor football league arena football league
arena football team
football stadium digest fo

In [58]:
# test1 has GPU, test2 withouout GPU, test3 without GPU, test4 has GPU

[10, 22, 25, 28, 35, 39]

In [12]:
atheism = "Agnosticism theism deism islam paganism moral atheist religions argument exist"
graphics = "image digital visual 3d 2d visualization print geometry synthesizing processing"
pchardware = "cpu monitor keyboard memory card sound speakers motherboard power pc"
machardware = "touchpad touchbar drive apple mac ram gpu system sensors physical"
forsale = "product mail discount bargain shopping price sale propertise rent summer"
automobile = "car vehicle transportation wheel tire road parking gasoline energy driver"
motorcycles = "bike scooters mopads motorbikes trowel commute helmet ride speed harley"
baseball = "player ball small hit team fielding batting runs nbl baseball"
hockey = "puck nhl hockey ice rink canada rubber curve skater guard"
encrypt = "encoding decryption cryptographic secure plaintext ciphertext key algorithm pseudo private"
electronics = "equipment science electricity wire console computer outlet engineering power voltage"
medicine = "medicine surgery hospital climic doctor nurse healthcare symtoms prescription pharmacy"
space = "rocket nasa astronomy explore moon outerspace spaceship telescope satellite orbit"
christian = "belief faith church christianity ethics culture ritual Jesus bible truth"
guns = "law regulation usa victim murder violence litigation debate firearms legal"
middleeast = "israel Iran Iraq war territory turkey attack soldier turkey government"
query = [atheism, graphics, pchardware, machardware,
        forsale, automobile, motorcycles, baseball,
        hockey, encrypt, electronics, medicine,
        space, christian, guns, middleeast]
processed_query = []
for q in query:
    processed_query.append(processor.preprocess(q))

In [63]:
business = "bank stock market business economy financial investor profit price deal consumer"
entertainment = "film movie music tv theater festival actor show book hollywood director comedy"
health = "drug health cancer patient disease medical hospital healthcare doctor treatment blood care"
sci_tech = "apple google sony facebook internet mobile ipad technology microsoft phone tablet playstation computer"
sport = "league win player team tournament game playoff sport championship point nfl coach game baseball nba football"
# us = "american state president wisconsin california usa obama texas york union republican arizona"
# world = "international Europe Asia country global japan china libya pakistan syria aboard foreign africa egypt yemen afghan"
query = [business, entertainment, health, sci_tech, sport]
processed_query = []
for q in query:
    processed_query.append(processor.preprocess(q))

In [76]:
business = "bank stock market business economy financial investor profit price deal consumer"
computers = "computer software programming parallel computing memory hardware driver cpu processor"
culture_arts = "movie music art film artist museum fashion culture imdb actor comedy romantic"
education_science = "research science journal university student education scientific mathematics theory school library"
engineering = "engine electrical car wheel model automobile industrial vehicle cylinder jet transmission"
health = "drug health cancer patient disease medical hospital healthcare doctor treatment blood care"
politics_society = "political party democracy government republic parliamentary representative president communist congress"
sports = "league football player team tournament game basketball sport hockey championship nfl coach soccer nba"
query = [business, computers, culture_arts, education_science, engineering, health, politics_society, sports]
processed_query = []
for q in query:
    processed_query.append(processor.preprocess(q))  

In [9]:
##################### LDA model part ########################
processed_query = ["food"]
feature = 5000
target = 16

acc = []
nmi = []
pui = []
clu_acc = []
p_k = []
vobs = []
with open('model/20news/vobs_20news_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
            
for i in range(1): 
    print (rf"#######i = {i}#######")
    if i == 0:
        path = 'model/20news/LDA/LDA-GPU'
    else:
        path = 'model/20news/LDA/LDA' + str(i+1)


    doc_word = np.zeros((vectors.shape[0], feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            doc_word[int(temp[0])-target, int(temp[1])] += 1

    topic_word = np.zeros((50, feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            topic_word[int(temp[-1]), int(temp[1])] += 1
    
    pzw = topic_word/topic_word.sum(0)
    results = (doc_word @ pzw.T)

    temp = results + 1e-20
    X = temp / temp.sum(1).reshape((temp.shape[0], 1))
    Y = labels
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
    print ("acc:", np.mean(scores))
    acc.append(np.mean(scores))
    
    pre = np.argmax(results[:,:], 1)
    tru = np.asarray(labels)
    print ("numi:", normalized_mutual_info_score(tru, pre))
    print ("pui:", purity_score(tru, pre))
    print ("clu_acc:", cluster_acc(pre, tru)[0])
    nmi.append(normalized_mutual_info_score(tru, pre))
    pui.append(purity_score(tru, pre))
    clu_acc.append(cluster_acc(pre, tru)[0])
    
    label_count = Counter(labels)
    temp = results + 1e-20
    temp = temp / temp.sum(1).reshape((-1,1))
    tru = np.asarray(labels)
    p_results = []
    pzw = (topic_word+0.5)/(topic_word+0.5).sum(1).reshape((topic_word.shape[0], 1))
    for q_index, query in enumerate(processed_query):
        sim = []
        for i_topic, topic in enumerate(pzw):
            topic_embedding = np.zeros(300)
            for index in topic.argsort()[: -10 - 1: -1]:
                topic_embedding += gloveModel[vobs[index]] * topic[index]

            ave = []
            for term in query.split():
                if (norm(topic_embedding)*norm(gloveModel[term])!=0):
                    cos_sim = dot(topic_embedding, gloveModel[term])/(norm(topic_embedding)*norm(gloveModel[term]))
                else:
                    cos_sim = 0
                ave.append(cos_sim)
            sim.append(np.mean(ave))
#         print (np.argmax(sim))

        topic_index = np.argmax(sim)
        top_index = np.argsort(temp[:,topic_index])[::-1][:label_count[q_index]]
        precision = (tru[top_index] == q_index).sum()/label_count[q_index]
#         print (precision)
        p_results.append(precision)
    print ("p@k:", np.mean(p_results))
    p_k.append(np.mean(p_results))
    print()

print (rf"#######average#######")
print ("acc:", np.mean(acc))
print ("numi:", np.mean(nmi))
print ("pui:", np.mean(pui))
print ("clu_acc:", np.mean(clu_acc))
print ("p@k::", np.mean(p_k))

#######i = 0#######
acc: 0.6301072820481078
numi: 0.10206476058980617
pui: 0.11542504730534725
clu_acc: 0.11430373537038335
p@k: 0.013568521031207599

#######average#######
acc: 0.6301072820481078
numi: 0.10206476058980617
pui: 0.11542504730534725
clu_acc: 0.11430373537038335
p@k:: 0.013568521031207599


In [27]:
processed_query

['agnosticism theism deism islam paganism moral atheist religion argument exist',
 'image digital visual 3d 2d visualization print geometry synthesizing processing',
 'cpu monitor keyboard memory card sound speaker motherboard power pc',
 'touchpad touchbar drive apple mac ram gpu system sensor physical',
 'product mail discount bargain shopping price sale propertise rent summer',
 'car vehicle transportation wheel tire road parking gasoline energy driver',
 'bike scooter mopads motorbike trowel commute helmet ride speed harley',
 'player ball small hit team fielding batting run nbl baseball',
 'puck nhl hockey ice rink canada rubber curve skater guard',
 'encoding decryption cryptographic secure plaintext ciphertext key algorithm pseudo private',
 'equipment science electricity wire console computer outlet engineering power voltage',
 'medicine surgery hospital climic doctor nurse healthcare symtoms prescription pharmacy',
 'rocket nasa astronomy explore moon outerspace spaceship tele

In [19]:
coh = []
for i in range(1,6):
    zw = np.zeros((50, 5000))
    if i == 1:
        path = 'model/20news/LDA/LDA-test_nzw.txt'
    else:
        path = 'model/searchsnippets/LDA/LDA' + str(i) + '_nzw.txt'
    with open(path, 'r') as file:
        z = 0
        for line in file.readlines():
            w = 0
            for n in line.strip().split():
                zw[z,w] = n
                w += 1
            z += 1
    vobs = []
    with open('model/20news/vobs_20news_QDTM.txt', 'r') as file:
        for line in file.readlines():
            vobs.append(line.strip())
    
    local_coherence = []
    pzw = (zw+0.5)/(zw+0.5).sum(1).reshape((zw.shape[0], 1))
    for q_index, query in enumerate(processed_query):
        sim = []
        for i_topic, topic in enumerate(pzw):
            topic_embedding = np.zeros(300)
            for index in topic.argsort()[: -10 - 1: -1]:
                topic_embedding += gloveModel[vobs[index]] * topic[index]

            ave = []
            for term in query.split():
                if (norm(topic_embedding)*norm(gloveModel[term])!=0):
                    cos_sim = dot(topic_embedding, gloveModel[term])/(norm(topic_embedding)*norm(gloveModel[term]))
                else:
                    cos_sim = 0
                ave.append(cos_sim)
            sim.append(np.mean(ave))
        topic_index = np.argmax(sim)
    
        top_words = [vobs[i] for i in pzw[topic_index].argsort()[:-10 - 1:-1]]
        print (top_words)
        local_coherence.append(palmetto.get_coherence(top_words))
    print ('ave coherence', np.mean(local_coherence))
    coh.append(np.mean(local_coherence))
print (np.mean(coh))

['price', 'ca', 'count', 'monday', 'good', 'comparing', 'pay', 'yeast', 'marked', 'seizure']
ave coherence 0.37959844951323374
['change', 'palestinian', 'pollution', 'relevant', 'marvel', 'dave', 'idle', 'mariner', 'invite', 'foam']
ave coherence 0.48476894733727216
['change', 'palestinian', 'pollution', 'relevant', 'marvel', 'invite', 'commercial', 'idle', 'foam', 'confusing']
ave coherence 0.4312083795525715
['change', 'palestinian', 'relevant', 'pollution', 'marvel', 'forced', 'idle', 'invite', 'foam', 'commercial']
ave coherence 0.4312083795525715
['change', 'palestinian', 'relevant', 'pollution', 'marvel', 'forced', 'idle', 'invite', 'commercial', 'foam']
ave coherence 0.4312083795525715
0.43159850710164405


In [45]:
##################### ISLDA model part ########################
feature  = len(feature_names)
target = 8

acc = []
nmi = []
pui = []
clu_acc = []
for i in range(5): 
    print (rf"#######i = {i}#######")
    if i == 0:
        path = 'model/searchsnippets/ISLDA/ISLDA'
    else:
        path = 'model/searchsnippets/ISLDA/ISLDA' + str(i+1)

    # doc_word = vectors
    doc_word = np.zeros((vectors.shape[0], feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            doc_word[int(temp[0])-target, int(temp[1])] += 1

    topic_word = np.zeros((target+1, feature))
    with open(path + '_nzw.txt', 'r') as file:
        z = 0
        for line in file.readlines():
            w = 0
            for n in line.strip().split():
                topic_word[z,w] = n
                w += 1
            z += 1

    pzw = topic_word/topic_word.sum(0)
    results = (doc_word @ pzw.T)
    
    temp = results + 1e-20
    X = temp / temp.sum(1).reshape((temp.shape[0], 1))
    Y = labels
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
    print ("acc:", np.mean(scores))
    acc.append(np.mean(scores))
    
    pre = np.argmax(results[:,:target], 1)
    tru = np.asarray(labels)
    print ("numi:", normalized_mutual_info_score(tru, pre))
    print ("pui:", purity_score(tru, pre))
    print ("clu_acc:", cluster_acc(pre, tru)[0])
    nmi.append(normalized_mutual_info_score(tru, pre))
    pui.append(purity_score(tru, pre))
    clu_acc.append(cluster_acc(pre, tru)[0])
    print()

print (rf"#######average#######")
print ("acc:", np.mean(acc))
print ("numi:", np.mean(nmi))
print ("pui:", np.mean(pui))
print ("clu_acc:", np.mean(clu_acc))

#######i = 0#######
acc: 0.8437569077267029
numi: 0.6419389692682327
pui: 0.8274682306940372
clu_acc: 0.817367220593027

#######i = 1#######
acc: 0.8567096996755937
numi: 0.6307265179178058
pui: 0.8189964157706093
clu_acc: 0.8189964157706093

#######i = 2#######
acc: 0.8466887028400458
numi: 0.6302092661867547
pui: 0.8170413815575106
clu_acc: 0.8159824046920822

#######i = 3#######
acc: 0.8460383316637587
numi: 0.6137235583097852
pui: 0.8113391984359726
clu_acc: 0.8113391984359726

#######i = 4#######
acc: 0.8320272726669629
numi: 0.6159644786784296
pui: 0.8154121863799283
clu_acc: 0.8154121863799283

#######average#######
acc: 0.8450441829146127
numi: 0.6265125580722016
pui: 0.8180514825676116
clu_acc: 0.8158194851743239


In [60]:
label_count = Counter(labels)
temp = results + 1e-20
temp = temp / temp.sum(1).reshape((-1,1))
tru = np.asarray(labels)
p_results = []
for topic_index in range(target):
    top_index = np.argsort(temp[:,topic_index])[::-1][:label_count[topic_index]]
    precision = (tru[top_index] == topic_index).sum()/label_count[topic_index]
    print (precision)
    p_results.append(precision)
print (np.mean(p_results))

0.4762550881953867
0.6622073578595318
0.3538135593220339
0.32893289328932895
0.6373390557939914
0.6733111849390919
0.6071044133476857
0.743421052631579
0.8476605005440696
0.7377777777777778
0.5410010649627263
0.7864184008762322
0.7338530066815144
0.7324561403508771
0.6599045346062052
0.7458703939008895
0.6417079015674325


In [63]:
for i in range(1,6):
    zw = np.zeros((target+1, 5000))
    if i == 1:
        path = 'model/20news/ISLDA/ISLDA_nzw.txt'
    else:
        path = 'model/20news/ISLDA/ISLDA' + str(i) + '_nzw.txt'
    with open(path, 'r') as file:
        z = 0
        for line in file.readlines():
            w = 0
            for n in line.strip().split():
                zw[z,w] = n
                w += 1
            z += 1
    vobs = []
    with open('model/20news/vobs_20news_expert.txt', 'r') as file:
        for line in file.readlines():
            vobs.append(line.strip())
    local_coherence = []
    for z in zw[:target]:
        top_words = [vobs[i] for i in z.argsort()[:-10 - 1:-1]]
        print (top_words)
        local_coherence.append(palmetto.get_coherence(top_words))
    print ('ave coherence', np.mean(local_coherence))

['people', 'question', 'point', 'religion', 'thing', 'make', 'argument', 'evidence', 'claim', 'true']
['file', 'image', 'program', 'list', 'graphic', 'information', 'book', 'send', 'address', 'article']
['card', 'sound', 'monitor', 'pc', 'memory', 'video', 'port', 'board', 'chip', 'modem']
['system', 'drive', 'disk', 'mac', 'problem', 'scsi', 'apple', 'hard', 'work', 'ram']
['price', 'sale', 'offer', 'mail', 'product', 'sell', 'shipping', 'interested', 'condition', 'cd']
['car', 'driver', 'road', 'engine', 'good', 'mile', 'problem', 'dealer', 'vehicle', 'energy']
['bike', 'speed', 'time', 'back', 'thing', 'ride', 'make', 'dog', 'water', 'work']
['team', 'run', 'player', 'game', 'year', 'small', 'hit', 'good', 'baseball', 'play']
['game', 'hockey', 'playoff', 'goal', 'play', 'canada', 'nhl', 'period', 'fan', 'leaf']
['key', 'chip', 'phone', 'clipper', 'encryption', 'algorithm', 'number', 'private', 'nsa', 'security']
['power', 'computer', 'science', 'line', 'equipment', 'work', 'radio',

In [393]:
##################### DFLDA model part ########################
feature = len(feature_names)
target = 8

acc = []
nmi = []
pui = []
clu_acc = []
p_k = []
vobs = []
with open('model/searchsnippets/vobs_searchsnippets_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
        
for i in range(5): 
    print (rf"#######i = {i}#######")
    if i == 0:
        path = 'model/searchsnippets/DFLDA/DFLDA'
    else:
        path = 'model/searchsnippets/DFLDA/DFLDA' + str(i+1)

    doc_word = np.zeros((vectors.shape[0], feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            doc_word[int(temp[0])-target, int(temp[1])] += 1


    topic_word = np.zeros((target+1, feature))
    with open(path+'.txt') as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            topic_word[int(temp[-1]), int(temp[1])] += 1
    pzw = topic_word/topic_word.sum(0)
    results = (doc_word @ pzw.T)

    temp = results + 1e-20
    X = temp / temp.sum(1).reshape((temp.shape[0], 1))
    Y = labels
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
    print ("acc:", np.mean(scores))
    acc.append(np.mean(scores))
    
    pre = np.argmax(results[:,:], 1)
    tru = np.asarray(labels)
    print ("numi:", normalized_mutual_info_score(tru, pre))
    print ("pui:", purity_score(tru, pre))
    print ("clu_acc:", cluster_acc(pre, tru)[0])
    nmi.append(normalized_mutual_info_score(tru, pre))
    pui.append(purity_score(tru, pre))
    clu_acc.append(cluster_acc(pre, tru)[0])
    
    label_count = Counter(labels)
    temp = results + 1e-20
    temp = temp / temp.sum(1).reshape((-1,1))
    tru = np.asarray(labels)
    p_results = []
    pzw = (topic_word+0.5)/(topic_word+0.5).sum(1).reshape((topic_word.shape[0], 1))
    for q_index, query in enumerate(processed_query):
        sim = []
        for i_topic, topic in enumerate(pzw):
            topic_embedding = np.zeros(300)
            for index in topic.argsort()[: -10 - 1: -1]:
                topic_embedding += gloveModel[vobs[index]] * topic[index]

            ave = []
            for term in query.split():
                if (norm(topic_embedding)*norm(gloveModel[term])!=0):
                    cos_sim = dot(topic_embedding, gloveModel[term])/(norm(topic_embedding)*norm(gloveModel[term]))
                else:
                    cos_sim = 0
                ave.append(cos_sim)
            sim.append(np.mean(ave))
#         print (np.argmax(sim))

        topic_index = np.argmax(sim)
        top_index = np.argsort(temp[:,topic_index])[::-1][:label_count[q_index]]
        precision = (tru[top_index] == q_index).sum()/label_count[q_index]
#         print (precision)
        p_results.append(precision)
    print ("p@k:", np.mean(p_results))
    p_k.append(np.mean(p_results))
    print()

print (rf"#######average#######")
print ("acc:", np.mean(acc))
print ("numi:", np.mean(nmi))
print ("pui:", np.mean(pui))
print ("clu_acc:", np.mean(clu_acc))
print ("p@k:", np.mean(p_k))

#######i = 0#######


  pzw = topic_word/topic_word.sum(0)


acc: nan
numi: 4.876943271555342e-16
pui: 0.37978205425457917
clu_acc: 0.37978205425457917


Traceback (most recent call last):
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1342, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/base.py", line 432, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/utils/validation.py", line 795, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/opt/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/opt/a

NameError: name 'processed_query' is not defined

In [81]:
coh = []
for i in range(1,6):
    zw = np.zeros((target+1, 5000))
    if i == 1:
        path = 'model/searchsnippets/DFLDA/DFLDA.txt'
    else:
        path = 'model/searchsnippets/DFLDA/DFLDA' + str(i) + '.txt'
    with open(path) as file:
        for line in file.readlines():
            temp = line.strip().split()
            if temp[0] == 'd' or int(temp[0]) <= target-1:
                continue
            zw[int(temp[-1]), int(temp[1])] += 1

    vobs = []
    with open('model/searchsnippets/vobs_searchsnippets_QDTM.txt', 'r') as file:
        for line in file.readlines():
            vobs.append(line.strip())
            
    local_coherence = []
    pzw = (zw+0.5)/(zw+0.5).sum(1).reshape((zw.shape[0], 1))
    for q_index, query in enumerate(processed_query):
        sim = []
        for i_topic, topic in enumerate(pzw):
            topic_embedding = np.zeros(300)
            for index in topic.argsort()[: -10 - 1: -1]:
                topic_embedding += gloveModel[vobs[index]] * topic[index]

            ave = []
            for term in query.split():
                if (norm(topic_embedding)*norm(gloveModel[term])!=0):
                    cos_sim = dot(topic_embedding, gloveModel[term])/(norm(topic_embedding)*norm(gloveModel[term]))
                else:
                    cos_sim = 0
                ave.append(cos_sim)
            sim.append(np.mean(ave))
        topic_index = np.argmax(sim)
    
        top_words = [vobs[i] for i in pzw[topic_index].argsort()[:-10 - 1:-1]]
        print (top_words)
        local_coherence.append(palmetto.get_coherence(top_words))
    print ('ave coherence', np.mean(local_coherence))
    coh.append(np.mean(local_coherence))
print (np.mean(coh))

['business', 'market', 'server', 'producer', 'model', 'stm', 'track', 'car', 'engine', 'economic']
['computer', 'softball', 'synthesis', 'network', 'programmer', 'weapon', 'data', 'memory', 'intel', 'device']
['movie', 'music', 'culture', 'art', 'film', 'victim', 'fashion', 'phone', 'reverse', 'history']
['requirement', 'sciam', 'journal', 'scholarship', 'theoretical', 'universal', 'library', 'profit', 'department', 'structure']
['business', 'market', 'server', 'producer', 'model', 'stm', 'track', 'car', 'engine', 'economic']
['health', 'information', 'gov', 'cancer', 'medical', 'disease', 'drug', 'calorie', 'food', 'national']
['health', 'information', 'gov', 'cancer', 'medical', 'disease', 'drug', 'calorie', 'food', 'national']
['news', 'game', 'splash', 'football', 'worksheet', 'soc', 'teacher', 'played', 'temple', 'match']
ave coherence 0.39283867927582794
['policy', 'partnership', 'synthesis', 'democracy', 'gov', 'government', 'uniquely', 'house', 'information', 'reprint']
['compu

In [48]:
##################### SCLDA model part ########################        
feature = len(feature_names)
target = 8

acc = []
nmi = []
pui = []
clu_acc = []
for i in range(5): 
    print (rf"#######i = {i}#######")
    if i == 0:
        path = 'model/searchsnippets/SCLDA/SCLDA'
    else:
        path = 'model/searchsnippets/SCLDA/SCLDA' + str(i+1)

    doc_word = vectors
    topic_word = np.zeros((target+1, feature))
    with open(path+'_zw.txt') as file:
        z = 0
        for line in file.readlines():
            temp = np.asarray(line.strip().split()[1:]).astype(int)
            topic_word[z] = temp
            z += 1
            
    topic_word += 1e-20
    pzw = topic_word/topic_word.sum(0)
    results = (doc_word @ pzw.T)

    temp = results + 1e-20
    X = temp / temp.sum(1).reshape((temp.shape[0], 1))
    Y = labels
    clf = LogisticRegression(max_iter=500)
    scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
    print ("acc:", np.mean(scores))
    acc.append(np.mean(scores))
    
    pre = np.argmax(results[:,:], 1)
    tru = np.asarray(labels)
    print ("numi:", normalized_mutual_info_score(tru, pre))
    print ("pui:", purity_score(tru, pre))
    print ("clu_acc:", cluster_acc(pre, tru)[0])
    nmi.append(normalized_mutual_info_score(tru, pre))
    pui.append(purity_score(tru, pre))
    clu_acc.append(cluster_acc(pre, tru)[0])
    print()

print (rf"#######average#######")
print ("acc:", np.mean(acc))
print ("numi:", np.mean(nmi))
print ("pui:", np.mean(pui))
print ("clu_acc:", np.mean(clu_acc))

#######i = 0#######
acc: 0.8180996702866581
numi: 0.5881865115461187
pui: 0.7945584881068752
clu_acc: 0.7945584881068752

#######i = 1#######
acc: 0.8145156796274307
numi: 0.5809824756014201
pui: 0.7926849136526556
clu_acc: 0.7926849136526556

#######i = 2#######
acc: 0.8176917744150408
numi: 0.5813066214662225
pui: 0.7880417073965461
clu_acc: 0.7880417073965461

#######i = 3#######
acc: 0.8218442386408116
numi: 0.5956203813838349
pui: 0.7996089931573802
clu_acc: 0.7996089931573802

#######i = 4#######
acc: 0.8101175225724274
numi: 0.5740821442276792
pui: 0.7820136852394917
clu_acc: 0.7820136852394917

#######average#######
acc: 0.8164537771084737
numi: 0.5840356268450552
pui: 0.7913815575105898
clu_acc: 0.7913815575105898


In [109]:
temp = results + 1e-20
X = temp / temp.sum(1).reshape((temp.shape[0], 1))
Y = labels
clf = LogisticRegression(max_iter=500)
scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
np.mean(scores)

0.8040806862972408

In [110]:
pre = np.argmax(results[:,:target], 1)
tru = np.asarray(labels)
print (normalized_mutual_info_score(tru, pre))
print (purity_score(tru, pre))
print (cluster_acc(pre, tru)[0])

0.5119823571393645
0.7894273127753304
0.7894273127753304


In [111]:
label_count = Counter(labels)
temp = results + 1e-20
temp = temp / temp.sum(1).reshape((-1,1))
tru = np.asarray(labels)
p_results = []
for topic_index in range(target):
    top_index = np.argsort(temp[:,topic_index])[::-1][:label_count[topic_index]]
    precision = (tru[top_index] == topic_index).sum()/label_count[topic_index]
    print (precision)
    p_results.append(precision)
print (np.mean(p_results))

0.7894540711757033
0.7586731588557517
0.6866558616963804
0.587251828631139
0.8960927960927961
0.743625543290354


In [89]:
for i in range(1,6):
    zw = np.zeros((target+1, 5000))
    if i == 1:
        file = 'model/20news/SCLDA/SCLDA_zw.txt'
    else:
        file = 'model/20news/SCLDA/SCLDA' + str(i) + '_zw.txt'
    with open(file, 'r') as file:
        z = 0
        for line in file.readlines():
            zw[z] = np.asarray(line.strip().split()[1:]).astype(int)
            z += 1
    local_coherence = []
    for z in zw[:target]:
        top_words = [feature_names[i] for i in z.argsort()[:-10 - 1:-1]]
        print (top_words)
        local_coherence.append(palmetto.get_coherence(top_words))
    print ('ave coherence', np.mean(local_coherence))

['people', 'time', 'make', 'good', 'thing', 'point', 'argument', 'question', 'ca', 'year']
['image', 'program', 'file', 'graphic', 'bit', 'information', 'line', 'time', 'find', 'version']
['card', 'sound', 'monitor', 'pc', 'memory', 'video', 'chip', 'board', 'work', 'bus']
['system', 'drive', 'disk', 'mac', 'problem', 'scsi', 'hard', 'apple', 'work', 'ram']
['price', 'sale', 'offer', 'list', 'mail', 'sell', 'send', 'product', 'interested', 'shipping']
['car', 'driver', 'engine', 'problem', 'good', 'road', 'year', 'dealer', 'time', 'mile']
['bike', 'speed', 'time', 'thing', 'back', 'make', 'ride', 'dog', 'turn', 'put']
['game', 'team', 'player', 'year', 'hit', 'small', 'good', 'play', 'run', 'baseball']
['game', 'hockey', 'time', 'nhl', 'fan', 'playoff', 'canada', 'ca', 'espn', 'season']
['key', 'chip', 'phone', 'encryption', 'clipper', 'algorithm', 'number', 'message', 'nsa', 'private']
['power', 'computer', 'science', 'supply', 'gm', 'equipment', 'battery', 'input', 'circuit', 'line']

In [541]:
from collections import Counter
Counter(labels)

Counter({4: 932,
         15: 787,
         5: 903,
         13: 912,
         3: 909,
         11: 913,
         2: 943,
         14: 838,
         1: 898,
         7: 912,
         9: 900,
         0: 737,
         6: 929,
         12: 898,
         10: 939,
         8: 919})

In [113]:
Counter(labels)

Counter({4: 932,
         15: 787,
         5: 903,
         13: 912,
         3: 909,
         11: 913,
         2: 943,
         14: 838,
         1: 898,
         7: 912,
         9: 900,
         0: 737,
         6: 929,
         12: 898,
         10: 939,
         8: 919})

In [90]:
atheism = "agnosticism theism deism islam paganism moral atheist religions argument exist".split()
graphics = "image digital visual 3d 2d visualization print geometry synthesizing processing".split()
pchardware = "cpu monitor keyboard memory card sound speakers motherboard power pc".split()
machardware = "touchpad touchbar drive apple mac ram gpu system sensors physical".split()
forsale = "product mail discount bargain shopping price sale propertise rent summer".split()
automobile = "car vehicle transportation wheel tire road parking gasoline energy driver".split()
motorcycles = "bike scooters mopads motorbikes trowel commute helmet ride speed harley".split()
baseball = "player ball small hit team fielding batting runs nbl baseball".split()
hockey = "puck nhl hockey ice rink canada rubber curve skater guard".split()
encrypt = "encoding decryption cryptographic secure plaintext ciphertext key algorithm pseudo private".split()
electronics = "equipment science electricity wire console computer outlet engineering power voltage".split()
medicine = "medicine surgery hospital climic doctor nurse healthcare symtoms prescription pharmacy".split()
space = "rocket nasa astronomy explore moon outerspace spaceship telescope satellite orbit".split()
christian = "belief faith church christianity ethics culture ritual Jesus bible truth".split()
guns = "law regulation usa victim murder violence litigation debate firearms legal".split()
middleeast = "israel iran iraq war territory turkey attack soldier turkey government".split()
query = [atheism, graphics, pchardware, machardware,
        forsale, automobile, motorcycles, baseball,
        hockey, encrypt, electronics, medicine,
        space, christian, guns, middleeast]

In [5]:
business = "bank stock market business economy financial investor profit price deal consumer".split()
entertainment = "film movie music tv theater festival actor show book hollywood director comedy".split()
health = "drug health cancer patient disease medical hospital healthcare doctor treatment blood care".split()
sci_tech = "apple google sony facebook internet mobile ipad technology microsoft phone tablet playstation computer".split()
sport = "league win player team tournament game playoff sport championship point nfl coach game baseball nba football".split()
# us = "american state president wisconsin california usa obama texas york union republican arizona".split()
# world = "international Europe Asia country global japan china libya pakistan syria aboard foreign africa egypt yemen afghan".split()
query = [business, entertainment, health, sci_tech, sport]

In [37]:
business = "bank stock market business economy financial investor profit price deal consumer".split()
computers = "computer software programming parallel computing memory hardware driver cpu processor".split()
culture_arts = "movie music art film artist museum fashion culture imdb actor comedy romantic".split()
education_science = "research science journal university student education scientific mathematics theory school library".split()
engineering = "engine electrical car wheel model automobile industrial vehicle cylinder jet transmission".split()
health = "drug health cancer patient disease medical hospital healthcare doctor treatment blood care".split()
politics_society = "political party democracy government republic parliamentary representative president communist congress".split()
sports = "league football player team tournament game basketball sport hockey championship nfl coach soccer nba".split()
query = [business, computers, culture_arts, education_science, engineering, health, politics_society, sports]

[['bank',
  'stock',
  'market',
  'business',
  'economy',
  'financial',
  'investor',
  'profit',
  'price',
  'deal',
  'consumer'],
 ['computer',
  'software',
  'programming',
  'parallel',
  'computing',
  'memory',
  'hardware',
  'driver',
  'cpu',
  'processor'],
 ['movie',
  'music',
  'art',
  'film',
  'artist',
  'museum',
  'fashion',
  'culture',
  'imdb',
  'actor',
  'comedy',
  'romantic'],
 ['research',
  'science',
  'journal',
  'university',
  'student',
  'education',
  'scientific',
  'mathematics',
  'theory',
  'school',
  'library'],
 ['engine',
  'electrical',
  'car',
  'wheel',
  'model',
  'automobile',
  'industrial',
  'vehicle',
  'cylinder',
  'jet',
  'transmission'],
 ['drug',
  'health',
  'cancer',
  'patient',
  'disease',
  'medical',
  'hospital',
  'healthcare',
  'doctor',
  'treatment',
  'blood',
  'care'],
 ['political',
  'party',
  'democracy',
  'government',
  'republic',
  'parliamentary',
  'representative',
  'president',
  'commun

In [36]:
# expanded concept words
business = 'business management debt service marketing export trade operation corporation finance business'.split()
computers = 'memory virus software personal graphic parallel application chip computing program computer'.split()
culture_arts = 'art chinese ancient asian heritage literature japanese india greece travel culture art'.split()
education_science = 'balance energy curriculum columbia teacher mathematics discipline teaching nsf newspaper education science'.split()
engineering = 'automobile advancing automotive membership premier dedicated car detection thoroughbred pack car engineering'.split()
health = 'health nutrition care cholesterol fitness condition vitamin public cancer disease health'.split()
politics_society = 'politics regime transition military csmonitor conflict usa brief authoritative series politics society'.split()
sports = 'sport espn football hockey racing score horse tennis ncaa soccer sport'.split()
query = [business, computers, culture_arts, education_science, engineering, health, politics_society, sports]

In [25]:
business = 'small profit bank credit nikkei sale card sell daily product business'.split()
entertainment = 'quiz pop sony remember celebrity test week mattel culture weekly entertainment'.split()
health = 'health recipe care insurance radiation drug risk disease mental fda health'.split()
sci_tech = 'apple chip tech patent foxconn google ipad china intel rim technology'.split()
sport = 'replay photo day time soccer college cycling tennis team player sport'.split()
query = [business, entertainment, health, sci_tech, sport]

In [21]:
atheism = 'lack theist allah nature existence theism weak atheism strong universe bobby'.split()
graphics ='3d sgi ati vesa graphic support screen manchester version animation'.split()
pchardware = 'joystick norton connect dsp protocol dtr pc utility pirate int ct'.split()
machardware = 'lc mono lab channel centris mac quicktime write adaptor developer quadra'.split()
forsale = 'sleeve excellent box sold original sell interested cd sale included'.split()
automobile = 'screw mustang automobile v6 regulation camaro substitute v8 auto ford'.split()
motorcycles = 'bike riding battery rider countersteering motorcycle buying dod jacket helmet ride'.split()
baseball = 'mpc umpire run dcon baseball stats gant pitcher hit average morris'.split()
hockey = 'ice cup coach ulf roger arena night goal don nhl hockey'.split()
encrypt = 'hash encrypted cryptography encryption de encrypt algorithm message function key'.split()
electronics = 'bell circuit electronics decoder connector amp electronic resistor led company'.split()
medicine = 'clinical doctor yeast jb medical study disease treatment medicine patient'.split()
space = 'space station shuttle launch nasa hulk wolverine satellite moon 1st'.split()
christian = 'truth love christian christ word bible protestant church sin homosexual homosexuality'.split()
guns = 'control amendment gun child revolver police bm fire defend weapon handgun'.split()
middleeast = 'culture east israel yassin arab middle kk zionism deir bc'.split()
query = [atheism, graphics, pchardware, machardware,
        forsale, automobile, motorcycles, baseball,
        hockey, encrypt, electronics, medicine,
        space, christian, guns, middleeast]

In [22]:
for i in range(len(query)):
    for w in query[i]:
#         print ("wordseed " + preprocessor.preprocess(w) + " " + str(i) )
        print ("wordseed " + w + " " + str(i) )

wordseed lack 0
wordseed theist 0
wordseed allah 0
wordseed nature 0
wordseed existence 0
wordseed theism 0
wordseed weak 0
wordseed atheism 0
wordseed strong 0
wordseed universe 0
wordseed bobby 0
wordseed 3d 1
wordseed sgi 1
wordseed ati 1
wordseed vesa 1
wordseed graphic 1
wordseed support 1
wordseed screen 1
wordseed manchester 1
wordseed version 1
wordseed animation 1
wordseed joystick 2
wordseed norton 2
wordseed connect 2
wordseed dsp 2
wordseed protocol 2
wordseed dtr 2
wordseed pc 2
wordseed utility 2
wordseed pirate 2
wordseed int 2
wordseed ct 2
wordseed lc 3
wordseed mono 3
wordseed lab 3
wordseed channel 3
wordseed centris 3
wordseed mac 3
wordseed quicktime 3
wordseed write 3
wordseed adaptor 3
wordseed developer 3
wordseed quadra 3
wordseed sleeve 4
wordseed excellent 4
wordseed box 4
wordseed sold 4
wordseed original 4
wordseed sell 4
wordseed interested 4
wordseed cd 4
wordseed sale 4
wordseed included 4
wordseed screw 5
wordseed mustang 5
wordseed automobile 5
wordsee

In [None]:
for pair in label_docs:
    print ("docseed " + str(pair[0]) + " " + str(pair[1]))

In [4]:
import itertools
for q in query:
    for p in list(itertools.combinations(q, 2)):
        if preprocessor.preprocess(p[0]) in feature_names and preprocessor.preprocess(p[1]) in feature_names:
            print ("wordpair " + preprocessor.preprocess(p[0]) + " " + preprocessor.preprocess(p[1]))

SyntaxError: unexpected EOF while parsing (<ipython-input-4-2d8778b0e069>, line 5)

In [5]:
preprocessor.preprocess("gasoline") in feature_names
//

SyntaxError: invalid syntax (<ipython-input-5-fe1e534b112d>, line 2)

In [6]:
feature_names[-10:]
//

SyntaxError: invalid syntax (<ipython-input-6-2be39b7f137a>, line 2)

In [7]:
feature_names[1892:2000]
//

SyntaxError: invalid syntax (<ipython-input-7-a42431687085>, line 2)

In [4]:
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/20news.txt", "w") as file:
    for line in processed_texts:
        file.write(line + '\n')
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/20news_vobs.txt", "w") as file:
    for line in feature_names:
        file.write(line + '\n')

In [4]:
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/tagmynews.txt", "w") as file:
    for line in processed_texts:
        file.write(line + '\n')
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/tagmynews_vobs.txt", "w") as file:
    for line in feature_names:
        file.write(line + '\n')

In [69]:
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/searchsnippets.txt", "w") as file:
    for line in processed_texts:
        file.write(line + '\n')
with open("/Users/zhengfang/Git Repository/sparse-constrained-lda/searchsnippets_vobs.txt", "w") as file:
    for line in feature_names:
        file.write(line + '\n')

In [10]:
feature_names[2200]

'illegal'

In [71]:
processed_texts[0]

'software reference bank transaction accounting software business'

In [59]:
from collections import defaultdict
def loadGloveModel(gloveFile):
    print ("Loading Glove Model")
    f = open(gloveFile, 'r', encoding='utf-8')
    model = defaultdict(lambda: np.zeros(300))
    for line in tqdm(f):
        splitLine = line.split()
        word = splitLine[0]
        model[word] = np.array([float(val) for val in splitLine[1:]])
    print ('Done.', len(model), 'words loaded!')
    return model

In [62]:
gloveModel = loadGloveModel('./embedding/glove.6B.300d.txt')

1006it [00:00, 10055.10it/s]

Loading Glove Model


400000it [00:30, 13001.19it/s]

Done. 400000 words loaded!





In [176]:
from scipy import spatial
test = "doctor medicine clinical jb medical patient study yeast treatment disease"
for i in test.split():
    print (1 - spatial.distance.cosine(gloveModel['christian'], gloveModel[i]))
    

0.1826453004845371
0.1264531005110211
0.05888853134659344
-0.11955256381547086
0.17532838132485917
0.027488436700016994
0.18413169457314793
-0.06206512102275541
0.05231813420496212
0.004415522580539344


In [58]:
gloveModel['drive']

NameError: name 'gloveModel' is not defined

In [36]:
with open('model/20news/vobs_20news_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
            
path = 'model/20news/LDA/LDA-GPUPLUS'

doc_word = np.zeros((vectors.shape[0], feature))
with open(path+'.txt') as file:
    for line in file.readlines():
        temp = line.strip().split()
        if temp[0] == 'd' or int(temp[0]) <= target-1:
            continue
        doc_word[int(temp[0])-target, int(temp[1])] += 1

topic_word = np.zeros((50, feature))
with open(path+'.txt') as file:
    for line in file.readlines():
        temp = line.strip().split()
        if temp[0] == 'd' or int(temp[0]) <= target-1:
            continue
        topic_word[int(temp[-1]), int(temp[1])] += 1

pzw = topic_word/topic_word.sum(0)
results = (doc_word @ pzw.T)

temp = results + 1e-20
X = temp / temp.sum(1).reshape((temp.shape[0], 1))
Y = labels
clf = LogisticRegression(max_iter=500)
scores = cross_val_score(clf, X, Y, scoring="accuracy", cv=5)
print ("acc:", np.mean(scores))

acc: 0.5942959512737056


In [37]:
ave = []
zw = np.zeros((16, 5000))
path = 'model/20news/LDA/LDA-GPUPLUS_nzw.txt'
with open(path, 'r') as file:
    z = 0
    for line in file.readlines():
        w = 0
        for n in line.strip().split():
            zw[z,w] = n
            w += 1
        z += 1
vobs = []
with open('model/20news/vobs_20news_QDTM.txt', 'r') as file:
    for line in file.readlines():
        vobs.append(line.strip())
local_coherence = []
for z in zw[:]:
    top_words = [re.sub("\$\d*", "", vobs[i]) for i in z.argsort()[:-10 - 1:-1]]
    print (top_words)
    local_coherence.append(palmetto.get_coherence(top_words))
print ('ave coherence', np.mean(local_coherence))

['buy', 'sell', 'buying', 'sale', 'price', 'bought', 'sold', 'offer', 'selling', 'cost']
['drive', 'card', 'disk', 'system', 'monitor', 'scsi', 'mac', 'problem', 'floppy', 'board']
['government', 'law', 'state', 'federal', 'criminal', 'public', 'court', 'enforcement', 'legal', 'country']
['fire', 'fbi', 'gun', 'time', 'child', 'fired', 'back', 'dog', 'atf', 'people']
['god', 'faith', 'christ', 'belief', 'church', 'christian', 'jesus', 'worship', 'holy', 'religion']
['armenian', 'turkish', 'muslim', 'armenia', 'turkey', 'azerbaijani', 'karabakh', 'azeri', 'moslem', 'greek']
['software', 'user', 'internet', 'file', 'database', 'program', 'information', 'image', 'server', 'application']
['car', 'bike', 'motorcycle', 'vehicle', 'driving', 'driver', 'wheel', 'truck', 'auto', 'passenger']
['patient', 'disease', 'treatment', 'cancer', 'infection', 'medication', 'diagnosis', 'treat', 'diagnosed', 'chronic']
['fact', 'kind', 'lot', 'thought', 'understand', 'find', 'put', 'mind', 'feel', 'guess'