In [1]:
import sklearn
# Report which scikit-learn release this kernel is running.
print(f'The scikit-learn version is {sklearn.__version__}.')

# To upgrade scikit-learn with pip inside the current Jupyter kernel, uncomment the two lines below:
#import sys
#!{sys.executable} -m pip install -U scikit-learn

The scikit-learn version is 0.23.1.


In [2]:
import os
import sys
import warnings

# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter('ignore', FutureWarning)

# Put the parent directory on the import path (presumably where com_func.py
# lives - confirm) before importing the project helper module.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

import com_func

# ------------------------------ parameters ------------------------------ #
# A name group is clustered only when at least two of its authors have
# this many papers or more.
threshold_select_name_group = 100
# Inside a selected name group, authors with fewer papers than this are
# dropped before clustering.
threshold_min_class_sample_size = 100

# Dataset tag used when building the input and result paths.
Dataset = "pubmed"

In [3]:
# Configuration A: text embeddings only (citation embedding switched off).
# The configuration cells all assign the same two names, so the one executed
# last decides what the experiment loop runs.
pp_citation_emb = ["off"]
pp_text_emb = ["tf", "tf_idf", "lsa", "pv_dm", "pv_dbow"]

In [None]:
# Configuration B: citation embeddings only (text embedding switched off).
# The configuration cells all assign the same two names, so the one executed
# last decides what the experiment loop runs.
pp_citation_emb = ["node2vec", "n2v"]
pp_text_emb = ["off"]

In [None]:
# Configuration C: text and citation embeddings concatenated.
# The configuration cells all assign the same two names, so the one executed
# last decides what the experiment loop runs.
pp_citation_emb = ["n2v"]
pp_text_emb = ["lsa", "pv_dm", "pv_dbow"]

In [4]:
# Echo the active configuration (text embeddings first, then citation embeddings).
print(pp_text_emb, pp_citation_emb, sep="\n")

['tf']
['off']


In [5]:
# method to evaluate cluster result
# https://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-clustering-1.html
def pairwise_f1(true_label, pred_label, verbose=True):
    """Compute pairwise F1, precision and recall of a clustering.

    Every unordered pair of samples is treated as one decision: the pair is
    *predicted positive* when both samples share a predicted cluster and
    *condition positive* when both samples share a true label.

    Pair counts are derived from label frequencies (a group of size ``c``
    contains ``c * (c - 1) / 2`` pairs), so the cost is linear in the number
    of samples instead of quadratic. Labels are read by position, which makes
    the result independent of the index of a pandas Series argument.

    Parameters
    ----------
    true_label : iterable of hashable
        Ground-truth label of each sample.
    pred_label : iterable of hashable
        Predicted cluster of each sample, in the same order as ``true_label``.
    verbose : bool, default True
        When true, print the pair-level confusion counts.

    Returns
    -------
    tuple
        ``(pairwise_f1, pairwise_precision, pairwise_recall)``. All three are
        ``0`` when no pair is a true positive.

    Raises
    ------
    ValueError
        If the two label sequences differ in length.
    """
    # local import keeps the function usable before the notebook's import cell has run
    from collections import Counter

    # materialise by position; list(Series) yields the values, not the index
    true_seq = list(true_label)
    pred_seq = list(pred_label)
    if len(true_seq) != len(pred_seq):
        raise ValueError(
            "true_label and pred_label must have the same length, got {} and {}".format(
                len(true_seq), len(pred_seq)))

    def count_pairs(group_sizes):
        # number of unordered pairs that fall inside the same group
        return sum(size * (size - 1) // 2 for size in group_sizes)

    sample_count = len(true_seq)
    total_pair = sample_count * (sample_count - 1) // 2
    # predictions that are positive, TP+FP
    pred_pos = count_pairs(Counter(pred_seq).values())
    # conditions that are positive, TP+FN
    cond_pos = count_pairs(Counter(true_seq).values())
    # pairs correctly predicted to be the same author, TP:
    # both samples share the true label AND the predicted cluster
    tp = count_pairs(Counter(zip(true_seq, pred_seq)).values())
    fp = pred_pos - tp
    fn = cond_pos - tp
    tn = total_pair - tp - fp - fn

    if verbose:
        print("tp: ", tp)
        print("fp: ", fp)
        print("fn: ", fn)
        print("tn: ", tn)
        print("tp+fp: ", pred_pos)
        print("tp+fn:", cond_pos)

    # calculate pairwise f1 score; tp == 0 also covers empty denominators
    if tp == 0:
        return 0, 0, 0

    pairwise_precision = tp / pred_pos
    pairwise_recall = tp / cond_pos
    f1 = (2 * pairwise_precision * pairwise_recall) / (pairwise_precision + pairwise_recall)
    return f1, pairwise_precision, pairwise_recall

In [None]:
import io
import collections
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, MeanShift, DBSCAN
from sklearn.cluster import estimate_bandwidth
from statistics import mean

# fix random seed for reproducibility
# np.random.seed(1)

# Experiment driver: for every (text embedding, citation embedding) pair,
# cluster the papers of each qualifying name group with k-means and four
# agglomerative variants, score each clustering with pairwise_f1 and write
# one CSV of results per embedding pair.
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# ----------------------- different textual embedding ----------------------#
for text_emb in pp_text_emb:
    print("Load text embedding: ", text_emb)
    # read pretrained embeddings
    if text_emb in ["tf", "tf_idf"]:
        # for tf / tf_idf the reader returns the paper ids and the embedding separately
        all_text_emb_pid, all_text_embedding = com_func.read_text_embedding(emb_type=text_emb, training_size="140k")
    elif text_emb != "off":
        # other embeddings come back as rows whose first element is the paper id
        all_text_embedding = com_func.read_text_embedding(emb_type=text_emb, training_size="140k")
        all_text_emb_pid = [emb[0] for emb in all_text_embedding]
        all_text_embedding = [emb[1:] for emb in all_text_embedding]
    else:
        # Text embedding switched off: bind both names explicitly. Previously
        # they were left unassigned, which raised NameError on a fresh kernel
        # or silently reused the embedding of the previous loop iteration.
        # NOTE(review): assumes com_func.extract_embedding yields an empty part
        # for an empty embedding, as the citation "off" path appears to - confirm.
        all_text_emb_pid = []
        all_text_embedding = []

    for citation_emb in pp_citation_emb:
        print("Load citation embedding: ", citation_emb)
        all_citation_embedding = com_func.read_citation_embedding_sorted(emb_type = citation_emb)
        # first element of each row is the paper id, the rest is the vector
        all_citation_emb_pid = [emb[0] for emb in all_citation_embedding]
        all_citation_embedding = [emb[1:] for emb in all_citation_embedding]

        # collect statistic to output (one entry per clustered name group)
        allname, num_class, per_class_count, selected_sample_size, orginal_sample_size= ([] for i in range(5))
        all_kmean_Pf1, AHC_single_Pf1,AHC_complete_Pf1,AHC_average_Pf1,AHC_ward_Pf1 = ([] for i in range(5))

        cluster_count = 0

        # ------- different name group in all name group --------------------#
        for file in listfiles:
            # group name is built from the second and the last "_"-separated parts of the file name
            temp = file.split("_")
            name = temp[1]+"_"+temp[-1]
            print("For name: ",name)
            # read needed content in labeled file
            labeled_data = com_func.read_pid_aid(fileDir+file)
            #----------- select name group contain productive author------------------------------------#
            #----------- (contain pair of author write more than 100 papers) ---------------------------#
            # count number of paper each author write based on author ID
            authorCounter = collections.Counter(labeled_data["authorID"])
            # drop authors below the selection threshold
            for k in list(authorCounter):
                if authorCounter[k] < threshold_select_name_group:
                    del authorCounter[k]
            # fewer than two authors pass the threshold: nothing to disambiguate
            if len(authorCounter) < 2:
                print(name, " pass")
            else:
                #plot_save_path = "../../plot/140k_sample_threshold="+str(threshold_min_class_sample_size)+"/"+"textemb="+text_emb+"_citation_emb="+citation_emb+"/"
                plot_save_path = None
                # remembered here, recorded only once the group has been scored
                original_size = len(labeled_data)
                #--------select authors in name group are very productive (more than threshold)---------#
                print("Total sample size before apply threshold: ",len(labeled_data))
                # count number of paper each author write based on author ID
                paperCounter = collections.Counter(labeled_data["authorID"])
                print(paperCounter)
                print("Total author before apply threshoid: ", len(paperCounter))
                # keep only authors that reach the per-class minimum
                for k in list(paperCounter):
                    if paperCounter[k] < threshold_min_class_sample_size:
                        del paperCounter[k]
                temp =list(paperCounter.keys())
                print(temp)
                print("Total author after apply threshoid: ", len(temp))
                # remove samples of authors that are smaller than threshold
                labeled_data = labeled_data[labeled_data.authorID.isin(temp)]
                print("Total sample size after apply threshold: ",len(labeled_data))
                # the number of remaining authors is the number of clusters to ask for
                cluster_count = len(paperCounter)
                #------------ extract paper representation -------------------------------------------#
                # shuffle the data
                # NOTE(review): no random_state is passed, so row order differs between runs
                labeled_data = labeled_data.sample(frac=1).reset_index(drop=True)
                # extract true label and pid
                label = labeled_data["authorID"]
                pid = labeled_data["paperID"]
                # list of different data field
                part_collection = []
                # select feature wanted to fit to clustering/classification algorithm
                # data part, textual information
                data_part_textual = com_func.extract_embedding(all_text_embedding, all_text_emb_pid, pid)
                print("Text embedding shape: ", data_part_textual.shape)
                part_collection.append(data_part_textual)
                # data part, citation information
                data_part_citation = com_func.extract_embedding(all_citation_embedding, all_citation_emb_pid, pid)
                data_part_citation.fillna(0, inplace=True)
                print("Citation embedding shape: ", data_part_citation.shape)
                part_collection.append(data_part_citation)
                # merge the different parts by concatenating them column-wise
                # remove empty emb (when emb set off)
                part_collection = [part for part in part_collection if len(part)!=0]
                if len(part_collection)>1:
                    combinedata = np.concatenate(part_collection,axis=1)
                elif len(part_collection)==1:
                    if isinstance(part_collection[0], pd.DataFrame):
                        combinedata = part_collection[0].values
                    else:
                        combinedata = part_collection[0]
                else:
                    # Nothing to cluster for this group: skip it. No statistic has
                    # been recorded yet, so the result columns stay equally long.
                    print("No data available")
                    continue
                print("Final feature (combined embedding) shape: ", combinedata.shape)
                # -------------- plot true label with PCA --------------------------------------- #
                com_func.visualizeWithPCA(combinedata, label, plotSavingPath = plot_save_path,
                                          name = name, plot_title='Dataset '+name+' applied PCA with ground truth labels')
                # -------------- using converted feature vector to form cluster------------------ #
                # --------------- algorithm need to know number of cluster ---------------------- #
                # ------------- 100 runs for non deterministic result to stable ----------------- #
                non_deterministic_PF1_result = collections.defaultdict(list)
                for i in range(100):
                    # k-means baseline, seeded with the run number
                    kmeans = KMeans(n_clusters=cluster_count, random_state=i).fit(combinedata)
                    kmeans_pred = kmeans.predict(combinedata)
                    k_means_pairwise_f1 = pairwise_f1(label, kmeans_pred)
                    non_deterministic_PF1_result["k_mean"].append(k_means_pairwise_f1[0])

                print(non_deterministic_PF1_result)

                # euclidean distance with single linkage Agglomerative hierarchical clustering
                euclidean_single_AHC = AgglomerativeClustering(n_clusters = cluster_count, linkage = "single", affinity = "euclidean")
                euclidean_single_AHC_pred = euclidean_single_AHC.fit_predict(combinedata)
                # euclidean distance complete linkage Agglomerative hierarchical clustering
                euclidean_complete_AHC = AgglomerativeClustering(n_clusters = cluster_count, linkage = "complete", affinity = "euclidean")
                euclidean_complete_AHC_pred = euclidean_complete_AHC.fit_predict(combinedata)
                # euclidean distance average linkage Agglomerative hierarchical clustering
                euclidean_average_AHC = AgglomerativeClustering(n_clusters = cluster_count, linkage = "average", affinity = "euclidean")
                euclidean_average_AHC_pred = euclidean_average_AHC.fit_predict(combinedata)
                # ward linkage Agglomerative hierarchical clustering
                ward_AHC = AgglomerativeClustering(n_clusters = cluster_count, linkage = "ward")
                ward_AHC_pred = ward_AHC.fit_predict(combinedata)
                # Spectral Clustering
                # SpectralCluster = SpectralClustering(n_clusters=cluster_count, affinity = "nearest_neighbors")
                # SpectralCluster_pred = SpectralCluster.fit_predict(combinedata)
                # --------------- algorithm auto estimate number of cluster --------------------- #
                # mean shift
                # estimate bandwidth for mean shift
                # bandwidth = estimate_bandwidth(combinedata, quantile=0.3)
                # print("bandwidth: ",bandwidth)
                # MScluster = MeanShift(bandwidth=bandwidth)
                # MSclustering_pred = MScluster.fit_predict(combinedata)
                # MS_estimated_cluster_count = np.unique(MSclustering_pred)
                # print("MS:",MSclustering_pred)
                # print("MS:",MS_estimated_cluster_count)
                # DBSCAN
                # DBSCANcluster = DBSCAN()
                # DBSCANcluster_pred = DBSCANcluster.fit_predict(combinedata)
                # DBSCAN_estimated_cluster_count = np.unique(DBSCANcluster_pred)
                # print("DBSCAN: ",DBSCANcluster_pred)
                # print("DBSCAN: ",DBSCAN_estimated_cluster_count)
                # -------------- evaluate clustering result using pairwise f1 ------------------- #
                euclidean_single_AHC_pairwise_f1 = pairwise_f1(label, euclidean_single_AHC_pred)
                euclidean_complete_AHC_pairwise_f1 = pairwise_f1(label, euclidean_complete_AHC_pred)
                euclidean_average_AHC_pairwise_f1 = pairwise_f1(label, euclidean_average_AHC_pred)
                ward_AHC_pairwise_f1 = pairwise_f1(label, ward_AHC_pred)
                # SpectralCluster_pairwise_f1_score = pairwise_f1(label, SpectralCluster_pred)
                # MSclustering_pairwise_f1_score = pairwise_f1(label, MSclustering_pred)
                # DBSCANcluster_pairwise_f1_score = pairwise_f1(label, DBSCANcluster_pred)
                # average over the 100 seeded k-means runs; the individual scores
                # were already printed above
                kmean_average_pf1 = mean(non_deterministic_PF1_result["k_mean"])
                print("Average k_mean pf1: ",kmean_average_pf1)
                print(kmeans)
                print("euclidean_single_AHC_pairwise_f1: ", euclidean_single_AHC_pairwise_f1)
                print(euclidean_single_AHC)
                print("euclidean_complete_AHC_pairwise_f1: ", euclidean_complete_AHC_pairwise_f1)
                print(euclidean_complete_AHC)
                print("euclidean_average_AHC_pairwise_f1: ", euclidean_average_AHC_pairwise_f1)
                print(euclidean_average_AHC)
                print("ward_AHC_pairwise_f1_score: ", ward_AHC_pairwise_f1)
                print(ward_AHC)
                # print("SpectralCluster_pairwise_f1_score: ",SpectralCluster_pairwise_f1_score)
                # print(SpectralCluster)
                # print("mean_shift_pf1: ", MSclustering_pairwise_f1_score)
                # print(MScluster)
                # print("DBSCANcluster_pairwise_f1_score: ", DBSCANcluster_pairwise_f1_score)
                # print(DBSCANcluster)

                # record the group's statistics together with its scores so that
                # every output column receives exactly one value per group
                orginal_sample_size.append(original_size)
                selected_sample_size.append(len(labeled_data))
                allname.append(name)
                num_class.append(cluster_count)
                per_class_count.append(paperCounter)
                all_kmean_Pf1.append(kmean_average_pf1)
                AHC_single_Pf1.append(euclidean_single_AHC_pairwise_f1[0])
                AHC_complete_Pf1.append(euclidean_complete_AHC_pairwise_f1[0])
                AHC_average_Pf1.append(euclidean_average_AHC_pairwise_f1[0])
                AHC_ward_Pf1.append(ward_AHC_pairwise_f1[0])

        # write evaluation result to csv (one file per embedding pair)
        output = pd.DataFrame({'Name Group':allname, "Class number":num_class, "Per class size":per_class_count,
                               "Total selected sample size":selected_sample_size,"Orginal sample size":orginal_sample_size,
                               "kmean_Pf1":all_kmean_Pf1, "AHC_single_Pf1": AHC_single_Pf1,
                               "AHC_complete_Pf1": AHC_complete_Pf1,"AHC_average_Pf1": AHC_average_Pf1,
                               "AHC_ward_Pf1": AHC_ward_Pf1})

        savePath = "../../result/"+Dataset+"/1_Clustering_sample=140k/"
        filename = "citation="+citation_emb+"_textual="+text_emb+"_threshold="+str(threshold_min_class_sample_size)+".csv"
        com_func.write_csv_df(savePath, filename, output)
        print("Done")

Load text embedding:  tf
Load citation embedding:  off
For name:  j_read
j_read  pass
For name:  f_esteves
f_esteves  pass
For name:  c_miller
c_miller  pass
For name:  r_jha
r_jha  pass
For name:  a_lowe
a_lowe  pass
For name:  a_vega
a_vega  pass
For name:  k_smith
k_smith  pass
For name:  j_gordon
j_gordon  pass
For name:  s_liao
s_liao  pass
For name:  j_qian
j_qian  pass
For name:  s_bernardi
s_bernardi  pass
For name:  t_hill
t_hill  pass
For name:  s_schindler
s_schindler  pass
For name:  j_williams
j_williams  pass
For name:  s_jacobson
s_jacobson  pass
For name:  e_andrade
e_andrade  pass
For name:  t_santos
t_santos  pass
For name:  k_kim
Total sample size before apply threshold:  1111
Counter({'0000-0002-6929-5359': 211, '0000-0001-9498-284X': 154, '0000-0002-5878-8895': 139, '0000-0002-1864-3392': 92, '0000-0002-7045-8004': 57, '0000-0001-7896-6751': 57, '0000-0002-7991-9428': 55, '0000-0002-4010-1063': 45, '0000-0002-2186-3484': 28, '0000-0002-4899-1929': 25, '0000-0003-04

In [None]:
# Agglomerative hierarchical clustering with cosine affinity, one model per
# linkage strategy (single, complete, average). This cell reads cluster_count,
# combinedata and label, none of which it defines itself.
cosine_single_AHC, cosine_complete_AHC, cosine_average_AHC = [
    AgglomerativeClustering(n_clusters = cluster_count, linkage = linkage_name, affinity = "cosine")
    for linkage_name in ("single", "complete", "average")
]

# fit every model on the combined features and keep its cluster assignment
cosine_single_AHC_pred, cosine_complete_AHC_pred, cosine_average_AHC_pred = [
    model.fit_predict(combinedata)
    for model in (cosine_single_AHC, cosine_complete_AHC, cosine_average_AHC)
]

# score each assignment against the true labels
cosine_single_AHC_pairwise_f1, cosine_complete_AHC_pairwise_f1, cosine_average_AHC_pairwise_f1 = [
    pairwise_f1(label, pred)
    for pred in (cosine_single_AHC_pred, cosine_complete_AHC_pred, cosine_average_AHC_pred)
]

# report the score followed by the estimator that produced it
for tag, score, model in (
    ("cosine_single_AHC_pairwise_f1: ", cosine_single_AHC_pairwise_f1, cosine_single_AHC),
    ("cosine_complete_AHC_pairwise_f1: ", cosine_complete_AHC_pairwise_f1, cosine_complete_AHC),
    ("cosine_average_AHC_pairwise_f1: ", cosine_average_AHC_pairwise_f1, cosine_average_AHC),
):
    print(tag, score)
    print(model)


In [None]:
# import pandas as pd

# def create_cluster_partition(labels):
#     """
#     Create a dictionary where the key is the row number and the value is the
#     actual label.
#     In this case, labels is an array where the position corresponds to the row
#     number and the value is an integer indicating the label.
#     """
#     labels_lookup = {}
#     for idx, label in enumerate(labels):
#         labels_lookup[idx] = label
#     # --------- create cluster_partition from label ---------------#
#     cluster_partition = []
#     unique_labels = pd.unique(labels)
#     for unique_label in unique_labels:
#         cluster_partition.append([idx for idx, item in enumerate(labels) if item == unique_label])
#     return labels_lookup, cluster_partition


# # method to evaluate cluster result
# def pairwise_f1_v2(true_label, pred_label):
#     datasize = len(true_label)
#     # --------- create cluster_partition from label ---------------#
#     _, true_cluster_partition = create_cluster_partition(true_label)
#     _, pred_cluster_partition = create_cluster_partition(pred_label)
#     num_pred_cluster = len(pred_cluster_partition)
#     purity = 0
#     f1_measure_i = 0
#     f1_measure = 0
#     print(true_cluster_partition)
#     print(pred_cluster_partition)
    
#     TP = 0
#     TN = 0
#     FP = 0
#     FN = 0
    
#     # For each cluster in clustering
#     for i, pred_cluster in enumerate(pred_cluster_partition):
#         max_intersec_ground_index = 0
#         max_intersec = len(set(true_cluster_partition[0]) & set(pred_cluster_partition[i]))
#         print(max_intersec)
#         # For each cluster in ground truth
#         for j in range(1, len(true_cluster_partition)):
#             # Get the max number of elements which belongs both to the pred_cluster_partition[i] and the true_cluster_partition[j]
#             local_intersec = len(set(true_cluster_partition[j]) & set(pred_cluster_partition[i]))
#             if (local_intersec > max_intersec):
#                 max_intersec = local_intersec
#                 max_intersec_ground_index = j
        
#         # Precision of pred_cluster_partition[i]
#         precision_i = 1.0 * max_intersec / len(pred_cluster_partition[i])
#         # Recall of pred_cluster_partition[i]
#         recall_i = 1.0 * max_intersec / len(true_cluster_partition[max_intersec_ground_index])
        
#         # Purity
#         purity += (1.0 * len(pred_cluster_partition[i]) * precision_i / datasize)
#         # F-measure
#         f1_measure_i = (2.0 * precision_i * recall_i) / (precision_i + recall_i)
#         f1_measure += f1_measure_i
        
#     final_f1_measure = f1_measure/num_pred_cluster
#     print(final_f1_measure)
    
#     return final_f1_measure

# pairwise_f1_v2([2,2,2,2,1,2,0,0,1,0,2,0,1,1,2,1,2],[1,1,1,1,1,1,2,2,2,2,2,2,0,0,0,0,0])