### PubMed10

In [1]:
import numpy as np
from scipy.io import loadmat
import pandas as pd
from DCC_data_utils import make_tfidf, make_PPMI
import random

n_top_terms = 50
n_top_frequent = 5000
n_runs = 1000


In [2]:
# Load PubMed original data
PubMed10_dw = loadmat("/home/saffeldt/Projects/Projects_clustering/DCC/DataDCC/PubMed10/docWordMat.mat")
doc_term_counts = PubMed10_dw["docWordMat"]
PubMed10_lb = loadmat("/home/saffeldt/Projects/Projects_clustering/DCC/DataDCC/PubMed10/label.mat")
labels = PubMed10_lb["label"]
labels = [item for sublist in labels for item in sublist]

# Apply the same permutations of rows as done before using RDCC
tmp_perm = np.random.RandomState(seed=42).permutation(doc_term_counts.shape[0])
doc_term_counts = doc_term_counts[tmp_perm,:]
labels = [labels[i] for i in tmp_perm.tolist()]

# Load the words and get the freq
pm10_terms = loadmat("/home/saffeldt/Projects/Projects_clustering/DCC/DataDCC/PubMed10/wordList.mat")
##pm10_terms
pm10_terms = pm10_terms["wordList"]
pm10_terms_list = list()

for i in range(pm10_terms.shape[0]):
    pm10_terms_list.append(pm10_terms[i,0].tolist()[0])

pm10_terms = pm10_terms_list

In [3]:
# Make the TF-IDF matrix
my_norm = 'l2'
s_idf = True
s_tf = False
mat_tfidf = make_tfidf(doc_term_counts, data_name = "PubMed10", 
                       norm = my_norm,
                       smooth_idf = s_idf, sublinear_tf = s_tf, 
                       verbose = True)

# Make the word PPMI matrix
isCol = True
mat_ppmi = make_PPMI(mat_tfidf, isCol = isCol, verbose = True)


# ---------------
# TF-IDF transform
# ---------------
# X_tfidf shape: (15565, 22437)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

# X_tfidf min/max: 0.0, 0.9381604331034942
# X_tfidf sparsity: 0.9972514825070178
# ---------------
(22437, 15565)
(15565, 22437)
(22437, 1)
(1, 22437)
--1-- 12373080
--2-- 8947114
# ---------------
# SPPMI transform from X_TFIDF
# ---------------
# SPPMI shape: (22437, 22437)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]

# SPPMI min/max: 0.0, 11.7435
# SPPMI sparsity: 0.9822
# ---------------


### Compute a reference PPMI mean value (random)

In [4]:
all_rand_mean_ppmi = list()

# Get randomly n_top_terms words among the n_top_frequent terms
terms_freq = (doc_term_counts.sum(axis=0))
# Sort by decreasing count sum
terms_freq_order = np.array((-terms_freq).argsort()).flatten().tolist()

for iRun in range(n_runs):
    rand_idx = random.sample(range(0, n_top_frequent), n_top_terms)
    terms_freq_ref = [terms_freq_order[i] for i in rand_idx]
    #print(terms_freq_ref)

    # Get the corresponding TF-IDF vector
    #mat_tfidf_mostFreq = mat_tfidf[:,terms_freq_order[:n_terms]]
    mat_ppmi_mostFreq = mat_ppmi[:,terms_freq_ref]
    mat_ppmi_mostFreq = mat_ppmi_mostFreq[terms_freq_ref, :]
    #print(mat_ppmi_mostFreq.shape)

    # Compute the mean PPMI
    #print(mat_ppmi_mostFreq)
    tmp_list = []
    for i in range(mat_ppmi_mostFreq.shape[0]):
        for j in range(i+1, mat_ppmi_mostFreq.shape[1]):
            tmp_list.append(mat_ppmi_mostFreq[i,j])
    tmp_mean = np.mean(tmp_list)
    all_rand_mean_ppmi.append(tmp_mean)
    
print("Median PPMI", np.median(all_rand_mean_ppmi))
print("Max PPMI", np.max(all_rand_mean_ppmi))
print("Min PPMI", np.min(all_rand_mean_ppmi))
print("\nMean PPMI", np.mean(all_rand_mean_ppmi))
print("--> PPMI std", np.std(all_rand_mean_ppmi))

Median PPMI 0.18169191795372383
Max PPMI 0.2504488839806242
Min PPMI 0.11680503330235176

Mean PPMI 0.18171580133203658
--> PPMI std 0.020300961455491134


In [15]:
import os
import numpy as np
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score

all_top_terms = list()
all_top_terms_idx = list()

final_k = 7

# nClust = 10
if final_k == 10:
    col_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_10_064_054/W_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
    row_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_10_064_054/Z_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
# nClust = 9
elif final_k == 9:
    col_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_9_063_053/W_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
    row_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_9_063_053/Z_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
# nClust = 8
elif final_k == 8:
    col_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_8_063_053/W_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
    row_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_8_063_053/Z_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
# nClust = 7
elif final_k == 7:
    col_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_7_063_056/W_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")
    row_part = np.load("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed10/EBCO_1init_24runs_p0_id_nClust_diverse_exploreWords/final_k_given_7_063_056/Z_0.25mod_l2_sidfTrue_stfFalse_k10_stoch70_init1_iter100.npy")

col_part = col_part.flatten()
row_part = row_part.flatten()

ari = adjusted_rand_score(labels, row_part)
nmi = normalized_mutual_info_score(labels, row_part, average_method="arithmetic")

print("NMI {0:0.4f}".format(nmi))
print("ARI {0:0.4f}".format(ari))

# All series of words
all_series = list()
nbr_clust = len(np.unique(col_part))

for n_clust in range(nbr_clust):
    # Get the col and row indices corresponding to the current cluster
    index_clust_col = np.where(col_part == n_clust)
    index_clust_col = index_clust_col[0].flatten().tolist()
    index_clust_row = np.where(row_part == n_clust)
    index_clust_row = index_clust_row[0].flatten().tolist()
        
    # Extract the co-cluster and compute the count sum for all words of the co-cluster
    curr_doc_term = doc_term_counts[index_clust_row,:]
    curr_doc_term = curr_doc_term[:,index_clust_col]
    tmp_terms_freq = (curr_doc_term.sum(axis=0))
    print("n_clust", n_clust, "( row", len(index_clust_row), ")")
    #print(curr_doc_term)

    # Sort by decreasing count sum
    tmp_order = (-tmp_terms_freq).argsort()
    tmp_order = np.array(tmp_order[0]).flatten().tolist()
    # Convert the ordered col index of the co-cluster into the original matrix index
    tmp_order_org_idx = [index_clust_col[i] for i in tmp_order]
        
    curr_ppmi = mat_ppmi[:,tmp_order_org_idx[:n_top_terms]]
    curr_ppmi = curr_ppmi[tmp_order_org_idx[:n_top_terms],:]
    all_top_terms_idx.append(tmp_order_org_idx[:n_top_terms])
    tmp_list = []
    for i in range(curr_ppmi.shape[0]):
        for j in range(i+1, curr_ppmi.shape[1]):
            tmp_list.append(curr_ppmi[i,j])
    tmp_mean = np.mean(tmp_list)
    print("-->> PPMI:", tmp_mean)
      
    # Convert to original index to get the words
    tmp_org_idx = index_clust_col
    tmp_terms = [pm10_terms[i] for i in tmp_org_idx]
    
    for str_w in ["gout", "otiti", "jaundic", "triptan", 
                  "calculi", "raynaud", "chickenpox", "hepat",
                 "macular", "allerg"]:
        res = [i for i in tmp_terms if str_w in i] 
        if len(res)>0:
            print(res)
    top_terms = [tmp_terms[i] for i in tmp_order[:n_top_terms]]
    print(len(tmp_terms))
    
    # See the top terms
    top_terms = np.stack(top_terms, axis = 0 )
    print("-->", top_terms, "\n")
    all_series.append(pd.Series(top_terms))

if final_k == 10:
    print(pd.concat([all_series[0], all_series[1], all_series[2], all_series[3], all_series[4],
                     all_series[5], all_series[6], all_series[7], all_series[8], all_series[9]], axis=1))
elif final_k == 9:
    print(pd.concat([all_series[0], all_series[1], all_series[2], all_series[3], all_series[4],
                     all_series[5], all_series[6], all_series[7], all_series[8]], axis=1))
elif final_k == 8:
    print(pd.concat([all_series[0], all_series[1], all_series[2], all_series[3], all_series[4],
                     all_series[5], all_series[6], all_series[7]], axis=1))
elif final_k == 7:
    print(pd.concat([all_series[0], all_series[1], all_series[2], all_series[3], all_series[4],
                    all_series[5], all_series[6]], axis=1))

print("\n")
    

NMI 0.6343
ARI 0.5625
n_clust 0 ( row 2559 )
-->> PPMI: 0.6418274436799918
['antigout', 'gout', 'gouti', 'pseudogout']
['jaundic']
['calculi', 'microcalculi']
['extrahepat', 'hepatectomi', 'hepati', 'hepatica', 'hepaticojejunostomi', 'hepatobiliari', 'hepatocellular', 'hepatocyt', 'hepatoduoden', 'hepatolog', 'hepatomegali', 'hepatopet', 'hepatoportoenterostomi', 'hepatoprotect', 'hepatoren', 'hepatotox', 'hepatotoxicti', 'intrahepat', 'posthepat', 'posthepatectomi', 'prehepat', 'steatohepat', 'subhepat', 'transhepat', 'transhepatich']
4538
--> ['wa' 'patient' 'stone' 'renal' 'group' 'calcium' 'urinari' 'kidnei'
 'oxal' 'gout' 'level' 'rate' 'crystal' 'present' 'percutan' 'urin'
 'obstruct' 'acid' 'jaundic' 'lower' 'calculi' 'serum' 'report' 'perform'
 'uric' 'cau' 'shock' 'wave' 'lithotripsi' 'requir'] 

n_clust 1 ( row 2401 )
-->> PPMI: 0.5254552715034845
['goutlik', 'nongout']
['almotriptan', 'eletriptan', 'frovatriptan', 'naratriptan', 'nontriptan', 'notriptan', 'rizatriptan', 'sum

In [11]:
for i in range(nbr_clust):
    print("\n# Clust", i)
        
    # PPMI
    # ----
    list_mean = list()
    for j in range(nbr_clust):
        curr_ppmi = mat_ppmi[:,all_top_terms_idx[i]]
        curr_ppmi = curr_ppmi[all_top_terms_idx[j],:]
        #tmp_mean = (np.triu(curr_ppmi.todense(), 1).sum())/(curr_ppmi.shape[0]*(curr_ppmi.shape[0]-1)/2)
        tmp_list = []
        for k in range(curr_ppmi.shape[0]):
            for l in range(k+1, curr_ppmi.shape[1]):
                tmp_list.append(curr_ppmi[k,l])
        tmp_mean = np.mean(tmp_list)
        if j != i: 
            list_mean.append(tmp_mean)
        else:
            print("-->> self PPMI:", tmp_mean)
    print("-->> versus all PPMI:", np.mean(list_mean))


# Clust 0
-->> self PPMI: 1.2240339061614822
-->> versus all PPMI: 0.13624255374752395

# Clust 1
-->> self PPMI: 0.896955059822908
-->> versus all PPMI: 0.11312488786504223

# Clust 2
-->> self PPMI: 0.9764043279587383
-->> versus all PPMI: 0.14901031574638657

# Clust 3
-->> self PPMI: 0.547273258256797
-->> versus all PPMI: 0.09845708982568195

# Clust 4
-->> self PPMI: 1.46302530915902
-->> versus all PPMI: 0.11897814335701283

# Clust 5
-->> self PPMI: 0.5935877510106102
-->> versus all PPMI: 0.11131857030445105

# Clust 6
-->> self PPMI: 1.0207473592090734
-->> versus all PPMI: 0.08091476493608557

# Clust 7
-->> self PPMI: 0.486066114834723
-->> versus all PPMI: 0.1002459814905754

# Clust 8
-->> self PPMI: 1.0117519232181098
-->> versus all PPMI: 0.12130651830715736

# Clust 9
-->> self PPMI: 0.9936665159194995
-->> versus all PPMI: 0.0991590479391268


## NPMI within

In [12]:
n_top_terms=30

In [None]:
import numpy as np
import subprocess
import re
import os

jar_path = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/palmetto-0.1.0-jar-with-dependencies.jar'
wiki_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/Wikipedia_bd/wikipedia_bd'
c_measure = 'npmi'
topic_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/topic_files/current_topics.txt'
out_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/topic_files/current_topics_out.txt'

# PUBMED5 top terms
word_list = ['wa', 'macular', 'eye', 'visual', 'amd', 'retinal', 'acuity', 'associate', 'degeneration', 'group']
word_list = ['otitis', 'ear', 'child', 'media', 'middle', 'acute', 'antibiotics', 'aom', 'tube', 'om']
word_list = ['stone', 'renal', 'calcium', 'urinary', 'kidney', 'oxalate', 'rate', 'urine', 'percutaneous', 'calculi']
word_list = ['allergy', 'nasal', 'rhinitis', 'symptom', 'pollen', 'seasonal', 'effect', 'allergens', 'asthma', 'increase']
word_list = ['migraine', 'patient', 'headache', 'study', 'thi', 'treatment', 'ar', 'pain', 'attack', 'associate']

# AMD vs OTITIS
word_list = ["wa", "macular", "eye", "visual", "amd", "retinal", "acuity", "associate", "degeneration", 
             "group", "neovascular", "significant", "month", "diabetic", "no", "improve", "risk", 
             "choroiditis", "edema", "measure", "intravitreal", "result", "thick", "change", "injection", 
             "case", "evaluate", "development", "show", "factor"]
word_list = ["otitis","ear","children","media","middle","acute","antibiotics","aom","tube","om","ag",
             "hear","effusion","year","chronic","dai","infection","present","cause","isolate",
             "recurrent","pneumonia","bacterial","found","tympan","vaccin","mastoiditis","complication",
             "pneumococcal","influenza"]
word_list = ["stone","renal","calcium","urinary","kidney","oxalate","rate","urine", "percutaneous",
             "calculi","crystal","lower","shock","lithotripsy","wave", "fragment","ureter","acid",
             "require","excretion","nephrolithotomy","procedure", "formation","protein","material",
             "tract","management","metabolite","uric","pcnl"]
word_list = ["allergic", "nasal", "rhinitis", "symptom", "pollen", "seasonal", "effect", "allergen", 
             "asthma", "increase", "significantly", "subject", "score", "level", "cell", "ig", 
             "placebo", "allergy", "test", "sensitive", "specific", "total", "immunotherapy", 
             "active", "grass", "eosinophil", "intranasal", "reduce", "challenge", "daily"]
word_list = ["migraine", "patient", "headache", "study", "thi", "treatment", "ar", "pain", "clinical", 
             "compare", "attack", "than", "may", "different", "dure", "aura", "report", "control", 
             "severe", "ha", "mg", "medical", "response", "efficacy", "sumatriptan", "assess", 
             "include", "data", "triptan", "drug"]

word_list = ["macular", "degeneration", "retinal", "edema", "diabetic", "acuity", "visual", "amd", 
                  "injection", "eye", "neovascular", "risk", "factor", "intravitreal", "significant", 
                  "measure", "associate", "evaluate", "improve", "result", "change", "development", 
                  "case", "thick", "show", "choroiditis", "group", "month", "no", "wa"]

word_list = ["calcium", "urinary", "gout", "oxalate", "crystal", "urine", "acid", "kidney", "uric", 
             "excrete", "serum", "protein", "urater", "format", "metabolite", "gouti", "normal", 
             "citrat", "concentr", "form", "low", "deposit", "male", "nephrolithiasi", "phosphat", 
             "diet", "dietari", "allopurinol", "joint", "collect", "hyperuricemia", "arthriti", 
             "tubular", "caox", "potassium", "ph", "plasma", "hypercalciuria", "magnesium", 
             "inhibitor", "tophi", "creatinin", "msu", "idiopath", "mice", "tophac", "supersatur", 
             "composit", "bind", "monosodium"]

n_words = len(word_list)
n_pairs = int((n_words*(n_words-1)/2))

all_npmi = np.zeros((n_words, n_words))

# Call Palmetto/NPMI
# ----
cmd = "java -jar '{}' '{}' '{}' '{}' > '{}'".format(jar_path, wiki_file, c_measure, topic_file, out_file)
for wi in range(n_words-1):
    # Write all word pairs in a topic file
    outF = open(topic_file, "w")
    for wj in range(wi+1, n_words):
        line = '{} {}\n'.format(word_list[wi], word_list[wj])
        outF.write(line)
    outF.close()
    
    # Call Palmetto for NPMI
    result = os.system(cmd)

    if result == 0:
        # Read out file line by line
        file1 = open(out_file, 'r') 
        #next(file1)
        Lines = file1.readlines()[1:]
        
        # Strips the newline character
        for line in Lines: 
            tmp_list = line.strip().split()
            #print(tmp_list)
            tmp_value = tmp_list[1]
            tmp_wi_str = (tmp_list[2])[1:-1]
            tmp_wj_str = (tmp_list[3])[0:-1]
            tmp_wi_idx = word_list.index(tmp_wi_str)
            tmp_wj_idx = word_list.index(tmp_wj_str)
            all_npmi[tmp_wi_idx, tmp_wj_idx] = tmp_value
            all_npmi[tmp_wj_idx, tmp_wi_idx] = all_npmi[tmp_wi_idx, tmp_wj_idx]
            #print(tmp_wi_str, tmp_wj_str, tmp_value)
        
#print(all_npmi)
# Compute the NPMI3 scores
k = 5
all_npmi_argsort = np.argsort(-all_npmi, axis = 0)
all_npmi_score = []

for i_word in range(len(word_list)):
    tmp_order_idx = all_npmi_argsort[:,i_word].flatten().tolist()
    tmp_order_idx = [idx for idx in tmp_order_idx if idx != i_word ]
    tmp_npmi_score = all_npmi[tmp_order_idx[:k], i_word]
    print(i_word, word_list[i_word], "--> ", [word_list[i] for i in tmp_order_idx[:k]], tmp_npmi_score)
    all_npmi_score.append(np.mean(tmp_npmi_score))

import pandas as pd

word_npmi_order_idx = np.argsort(-np.array(all_npmi_score))
word_npmi_order_str_series = pd.Series([word_list[i] for i in word_npmi_order_idx])
word_npmi_order_val_series = pd.Series([all_npmi_score[i] for i in word_npmi_order_idx])
pd.concat([word_npmi_order_str_series, word_npmi_order_val_series], axis = 1)

In [None]:
# Save all NPMI values as matrix
np.savetxt("/home/saffeldt/Projects/Projects_clustering/DCC/output/PubMed5/NPMI/allNPMI_30w_k5_amd.csv", all_npmi, 
              delimiter = ",")

In [None]:
# Compute all pairwise average
all_npmi_mean = (np.triu(all_npmi, 1).sum())/(all_npmi.shape[0]*(all_npmi.shape[0]-1)/2)
print("# NPMI average: ", all_npmi_mean)

## NPMIk between

In [None]:
import numpy as np
import subprocess
import re

jar_path = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/palmetto-0.1.0-jar-with-dependencies.jar'
wiki_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/Wikipedia_bd/wikipedia_bd'
c_measure = 'npmi'
topic_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/topic_files/current_topics.txt'
out_file = '/home/saffeldt/Projects/Projects_clustering/DCC/Palmetto/topic_files/current_topics_out.txt'

# PUBMED5 frequency top terms
## -- 10 -- ##
#word_list_amd = ['wa', 'macular', 'eye', 'visual', 'amd', 'retinal', 'acuity', 'associate', 'degeneration', 'group']
##word_list_otitis = ['otitis', 'ear', 'children', 'media', 'middle', 'acute', 'antibiotics', 'aom', 'tube', 'om']
#word_list_otitis = ['otitis', 'ear', 'child', 'media', 'middle', 'acute', 'antibiotics', 'aom', 'tube', 'om']
#word_list_kidney = ['stone', 'renal', 'calcium', 'urinary', 'kidney', 'oxalate', 'rate', 'urine', 'percutaneous', 'calculi']
#word_list_hayfever = ['allergy', 'nasal', 'rhinitis', 'symptom', 'pollen', 'seasonal', 'effect', 'allergens', 'asthma', 'increase']
#word_list_migraine = ['migraine', 'patient', 'headache', 'study', 'thi', 'treatment', 'ar', 'pain', 'attack', 'associate']

# PUBMED5 NPMIk top terms
## -- 30 -- ##
word_list_amd = ["macular", "degeneration", "retinal", "edema", "diabetic", "acuity", "visual", "amd", 
                  "injection", "eye", "neovascular", "risk", "factor", "intravitreal", "significant", 
                  "measure", "associate", "evaluate", "improve", "result", "change", "development", 
                  "case", "thick", "show", "choroiditis", "group", "month", "no", "wa"]
word_list_otitis = ["otitis", "infection", "pneumonia", "bacterial", "acute", "chronic", "antibiotics", 
                    "recurrent", "effusion", "complication", "influenza", "ear", "cause", "pneumococcal", 
                    "isolate", "media", "tube", "hear", "middle", "vaccin", "present", "found", "tympan", 
                    "mastoiditis", "children", "ag", "year", "aom", "dai", "om"]
word_list_kidney = ["urinary", "excretion", "ureter", "urine", "kidney", "renal", "uric", "oxalate", 
                    "acid", "metabolite", "calcium", "lithotripsy", "tract", "calculi", "protein", 
                    "crystal", "stone", "percutaneous", "procedure", "shock", "wave", "lower", 
                    "formation", "nephrolithotomy", "rate", "fragment", "require", "material", 
                    "management", "pcnl"]
word_list_hayfever = ["allergic", "rhinitis", "allergy", "asthma", "allergen", "immunotherapy", "pollen", "nasal", 
                      "symptom", "eosinophil", "cell", "seasonal", "effect", "sensitive", "placebo", "reduce", 
                      "increase", "test", "specific", "grass", "level", "subject", "total", "active", "daily", 
                      "intranasal", "score", "challenge", "ig", "significantly"]
word_list_migraine = ["migraine", "headache", "triptan", "treatment", "pain", "patient", "efficacy", 
                      "clinical", "drug", "severe", "medical", "sumatriptan", "aura", "assess", "study", 
                      "compare", "data", "mg", "response", "attack", "control", "report", "may", 
                      "different", "ha", "include", "thi", "ar", "than", "dure"]


# One vs Others,
word_list = word_list_migraine
word_list.extend(word_list_hayfever)

n_words = len(word_list)
n_pairs = int((n_words*(n_words-1)/2))

all_npmi = np.zeros((n_words, n_words))

# Call Palmetto/NPMI
# ----
cmd = "java -jar '{}' '{}' '{}' '{}' > '{}'".format(jar_path, wiki_file, c_measure, topic_file, out_file)
for wi in range(n_words-1):
    print("# word", wi)
    # Write all word pairs in a topic file
    outF = open(topic_file, "w")
    for wj in range(wi+1, n_words):
        line = '{} {}\n'.format(word_list[wi], word_list[wj])
        outF.write(line)
    outF.close()
    
    # Call Palmetto for NPMI
    result = os.system(cmd)

    if result == 0:
        # Read out file line by line
        file1 = open(out_file, 'r') 
        #next(file1)
        Lines = file1.readlines()[1:]
        
        # Strips the newline character
        for line in Lines: 
            tmp_list = line.strip().split()
            #print(tmp_list)
            tmp_value = tmp_list[1]
            tmp_wi_str = (tmp_list[2])[1:-1]
            tmp_wj_str = (tmp_list[3])[0:-1]
            tmp_wi_idx = word_list.index(tmp_wi_str)
            tmp_wj_idx = word_list.index(tmp_wj_str)
            all_npmi[tmp_wi_idx, tmp_wj_idx] = tmp_value
            all_npmi[tmp_wj_idx, tmp_wi_idx] = all_npmi[tmp_wi_idx, tmp_wj_idx]
            #print(tmp_wi_str, tmp_wj_str, tmp_value)
        

In [None]:
#print(all_npmi)
# Compute the NPMIk scores
k = 5
#print(word_list)
sub_npmi = all_npmi[n_top_terms:(2*n_top_terms),:n_top_terms]
sub_npmi_argsort = np.argsort(-sub_npmi, axis = 0)
sub_npmi_score = []

for i_word in range(sub_npmi.shape[1]):
    tmp_order_idx = sub_npmi_argsort[:,i_word].flatten().tolist()
    tmp_npmi_score = sub_npmi[tmp_order_idx[:k], i_word]
    print(i_word, word_list[i_word], "--> ", [word_list[i+n_top_terms] for i in tmp_order_idx[:k]], 
          tmp_npmi_score)
    sub_npmi_score.append(np.mean(tmp_npmi_score))

import pandas as pd
word_npmi_str_series = pd.Series(word_list[:n_top_terms])
word_npmi_val_series = pd.Series(sub_npmi_score)
pd.concat([word_npmi_str_series, word_npmi_val_series], axis = 1)

## CosSim within coCluster from doc-term counts

In [None]:
n_top_terms=5

In [None]:
import os
from sklearn.metrics.cluster import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

out_dir = "/home/saffeldt/Projects/Projects_clustering/DCC/output"
data_name = "PubMed5"

version = "mulPPMI_ColW-RowZ" # mulPPMI_Col, mulPPMI_Row, mulPPMI_ColZ-RowW, mulPPMI_ColW-RowZ
#all_versions = ["mulPPMI_Col", "mulPPMI_Row", "mulPPMI_ColZ-RowW", "mulPPMI_ColW-RowZ", "id"]
all_versions = ["mulPPMI_ColZ-RowW"]
p = 1
n_init = 10
n_runs = 20
    
all_top_terms = dict()
all_top_terms_idx = dict()

for version in all_versions:
    print("# Version", version)
    all_top_terms[version] = []
    all_top_terms_idx[version] = []
    
    if version == "id":
        p = 0
        
    # Load the column partition
    col_part_path = os.path.join(out_dir, data_name, "{}init_{}runs_p{}_{}".format(n_init, n_runs, p, version), "dcc_l2_sidfTrue_stfFalse_k5_stoch70_init10_iter100__axis1.npy")
    col_part = np.load(col_part_path)

    # Load the row partition
    row_part_path = os.path.join(out_dir, data_name, "{}init_{}runs_p{}_{}".format(n_init, n_runs, p, version), "dcc_l2_sidfTrue_stfFalse_k5_stoch70_init10_iter100__axis0.npy")
    row_part = np.load(row_part_path)

    # Select one of the co-clustering result
    part_num = -1
    best_nmi = 0.0

    # Evaluate the row partition and select the partition with the best NMI
    for i_part in range(n_runs):
        ari = adjusted_rand_score(labels, row_part[i_part,:])
        nmi = normalized_mutual_info_score(labels, row_part[i_part,:], average_method="arithmetic")
        #print("# --> ({}) NMI".format(i_part), "{0:.4f}".format(nmi), 
        #      "| ({}) ARI".format(i_part), "{0:.4f}".format(ari), "\n")

        if nmi > best_nmi:
            part_num =  i_part
            best_nmi = nmi

    print("{} Best row partition: {} | ".format(version, part_num), "NMI {0:0.4f}".format(best_nmi))

    # All series of words
    all_series = []
    nbr_clust = len(np.unique(col_part))

    for n_clust in range(nbr_clust):
        print("n_clust", n_clust)
        # Get the col and row indices corresponding to the current cluster
        index_clust_col = np.where(col_part[part_num,:] == n_clust)
        index_clust_col = index_clust_col[0].flatten().tolist()
        index_clust_row = np.where(row_part[part_num,:] == n_clust)
        index_clust_row = index_clust_row[0].flatten().tolist()
        
        # Extract the co-cluster
        curr_doc_term = mat[index_clust_row,:]
        curr_doc_term = curr_doc_term[:,index_clust_col]
        print("--> curr_doc_term.shape", curr_doc_term.shape)
        tmp_terms_freq = (curr_doc_term.sum(axis = 0))
        tmp_terms_freq = tmp_terms_freq.flatten().tolist()
        tmp_terms_freq = tmp_terms_freq[0]
        print("--> tmp_terms_freq[:10]", tmp_terms_freq[:10], len(tmp_terms_freq))
        
        # Sort by decreasing count sum
        tmp_order = (-np.array(tmp_terms_freq)).argsort()
        tmp_order = tmp_order.flatten().tolist()
        print("--> tmp_order[:10]", tmp_order[:10], len(tmp_order))
        print("--> tmp freq order[:10]", [tmp_terms_freq[i] for i in tmp_order[:10]])
              
        # Convert the ordered col index of the co-cluster into the original matrix index
        tmp_order_org_idx = [index_clust_col[i] for i in tmp_order]
        
        # Convert to original index to get the words
        tmp_org_idx = (word_org_idx[index_clust_col]).tolist()
        tmp_terms = [pm10_terms[i] for i in tmp_org_idx]
        top_terms = [tmp_terms[i] for i in tmp_order[:n_top_terms]]    

        # See the top terms
        top_terms = np.stack(top_terms, axis = 0 )
        print("-->", top_terms, "\n")
        
        # Compute dot product
        # Reorder doc-term sub mat
        curr_doc_term = curr_doc_term[:,tmp_order]        
        curr_doc_term = normalize(curr_doc_term, norm='l2', axis = 0)       
        Sc = curr_doc_term.T * curr_doc_term
        Sc = Sc[:n_top_terms,:n_top_terms]
        #print("MIN", np.min(np.triu(Sc.todense(), 1)))
        #print(np.triu(Sc.todense(), 1))
        #print(Sc.todense())
        #print("MAX", np.max(np.triu(Sc.todense(), 1)))
        tmp_list = []
        for i in range(n_top_terms):
            for j in range(i+1, n_top_terms):
                if Sc[i,j] > 0.0:
                    tmp_list.append(Sc[i,j])
                    print(top_terms[i], top_terms[j], Sc[i,j])
                
        tmp_mean = np.mean(tmp_list)
        print("-->> Mean CosSim:", tmp_mean, '\n')
                
        
    #print(pd.concat([all_series[0], all_series[1], all_series[2], all_series[3], all_series[4]], axis=1))
    #print("\n")
    #import matplotlib.pyplot as plt
    #plt.plot(tmp_terms_freq[:,tmp_order][0][:,:500])
    #plt.show()