In [7]:
import os
import sys
import warnings

# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Make the parent directory importable (presumably where the project-local
# `com_func` module imported below lives -- verify against the repo layout).
module_path = os.path.abspath('..')
if not (module_path in sys.path):
    sys.path.append(module_path)

import com_func
import pandas as pd

def dummy(doc):
    """Return ``doc`` unchanged.

    Used in this notebook as both the ``tokenizer`` and the ``preprocessor``
    of the scikit-learn vectorizers, so the vectorizers consume their input
    as-is (presumably because the documents are already tokenized).
    """
    return doc

# Dataset folder name; used to build the paths under ../../Data/.
Dataset = "pubmed"

# Text representation handed to raw_text_to_vector() as `emb_type`
# ("tf_idf", "tf", "lsa" or "off").
pp_textual = "tf"

# parameters
# Authors with fewer than `threshold` labeled papers are dropped from a name group.
threshold = 30
# Minimum document frequency (`min_df`) given to the vectorizers.
cutoff = 3

In [8]:
# load text information
# Reads the tab-separated dump of every paper's textual fields and builds one
# DataFrame row per paper with the columns listed in `text_columns`.
Dataset = "pubmed"
raw_filepath = "../../Data/"+Dataset+"/id_textual_combined.txt"
text_columns = ["paperID", "title", "keywords_mesh", "abstract", "combine_textual"]
all_text_content = []
skipped_lines = 0
with open(raw_filepath, 'r', encoding = 'utf8') as f:
    # items[0] is paper ID, items[1] is title, items[2] is keywords,
    # items[3] is MeSH terms, items[4] is abstract
    for line in f:
        # strip the line terminator first, otherwise it stays glued to the
        # last field and ends up inside "abstract" and "combine_textual"
        items = line.rstrip("\r\n").split("\t")
        if len(items) < 5:
            # not enough fields to index safely; count it instead of crashing
            skipped_lines += 1
            continue
        # lower case all character
        paperID = items[0]
        title = items[1].lower()
        keywords = items[2].lower()
        mesh = items[3].lower()
        abstract = items[4].lower()
        # keyword and mesh
        key_mesh = keywords+" "+mesh
        # textual information can be defined as all feature combined
        content = title+" "+keywords+" "+mesh+" "+abstract
        paper_text_content = {"paperID": paperID, "title":title, "keywords_mesh":key_mesh,
                              "abstract": abstract, "combine_textual":content}
        all_text_content.append(paper_text_content)
print("Total ", len(all_text_content), " paper have text information")
if skipped_lines:
    print("Skipped ", skipped_lines, " malformed line(s) with fewer than 5 fields")
# convert to dataframe so it's easy to process; explicit columns keep the
# schema intact even when no line was loaded
all_text_content = pd.DataFrame(all_text_content, columns=text_columns)

Total  3151504  paper have text information


In [9]:
def dummy(doc):
    """Identity function: hand back ``doc`` exactly as received.

    Repeats the definition from the first cell so this cell also works when
    run on its own; passed as ``tokenizer``/``preprocessor`` to the
    vectorizers below.
    """
    return doc
def read_labeled_file(infile):
    """Load one tab-separated labeled file into a DataFrame.

    Only lines with exactly 12 or 13 tab-separated fields are kept; for any
    other line the field count is printed and the line is skipped.

    Parameters
    ----------
    infile : str
        Path of the labeled file (read as UTF-8).

    Returns
    -------
    pandas.DataFrame
        Columns ``paperID`` (field 0), ``authorID`` (field 1),
        ``co-author`` (field 5) and ``venue_id`` (field 7). The columns are
        present even when no line qualifies, so later merges on ``paperID``
        do not fail on an empty file.
    """
    columns = ["paperID", "authorID", "co-author", "venue_id"]
    LabeledRecords_original = []
    # the context manager closes the file; no explicit close() is needed
    with open(infile, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) in (12, 13):
                paper_detail = {"paperID": read_data[0], "authorID":read_data[1],
                                "co-author": read_data[5], "venue_id": read_data[7]}
                LabeledRecords_original.append(paper_detail)
            else:
                print(len(read_data))
    return pd.DataFrame(LabeledRecords_original, columns=columns)

In [10]:
# negative samples = every sample of the name group except the target author's (positive) samples
import random
def extractNegativeSample(positiveSample, allSample):
    """Return the items of ``allSample`` that do not occur in ``positiveSample``.

    Order and duplicates of ``allSample`` are preserved.

    Parameters
    ----------
    positiveSample : iterable
        Items to exclude (e.g. the paper IDs of the target author).
    allSample : iterable
        Candidate items.

    Returns
    -------
    list
        The remaining (negative) items.
    """
    positives = list(positiveSample)
    try:
        # a set makes each membership test O(1) instead of scanning the list
        excluded = set(positives)
    except TypeError:
        # unhashable items: keep the original linear-scan semantics
        excluded = positives
    negativeSample = [x for x in allSample if x not in excluded]
    return negativeSample

In [11]:
def LSA(cleaned_token, dim=100):
    """Project documents into a latent semantic space (tf-idf followed by truncated SVD).

    Parameters
    ----------
    cleaned_token : iterable
        Documents handed to ``TfidfVectorizer``; ``dummy`` is used as both
        tokenizer and preprocessor, so each document is consumed as-is
        (presumably a list of tokens -- confirm against com_func).
    dim : int, default 100
        Requested number of SVD components. It is lowered to
        ``n_features - 1`` when the vocabulary is not larger than ``dim``.

    Returns
    -------
    The array returned by ``TruncatedSVD.fit_transform`` (one row per document).

    Side effects
    ------------
    Prints the summed explained-variance ratio of the fitted SVD.
    """
    # Tf-idf Transformation
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                       preprocessor=dummy, stop_words=None, min_df=cutoff)
    # keep the matrix sparse: TruncatedSVD accepts sparse input directly, so
    # densifying with .toarray() only costs memory
    tfidfMatrix = tfidf_vectorizer.fit_transform(cleaned_token)
    n_features = tfidfMatrix.shape[1]
    # n_components must stay below the number of features (strictly below for
    # the arpack solver and for some scikit-learn versions), so the equal case
    # is reduced as well; never go below one component
    if n_features <= dim:
        dim = max(1, n_features - 1)
    # tf-idf + svd
    svd = TruncatedSVD(n_components=dim)
    final_lsa_Matrix = svd.fit_transform(tfidfMatrix)
    print(svd.explained_variance_ratio_.sum())
    return final_lsa_Matrix

In [12]:
# document relation wrt textual content
# convert raw text to numerical feature vectors
# bag-of-words (BoW) features are used with a uni-gram setting
def raw_text_to_vector(raw_textual_content, emb_type="off", stopword=True):
    """Clean a batch of raw texts and turn them into feature vectors.

    Parameters
    ----------
    raw_textual_content
        Raw documents, forwarded to ``com_func.clean_batch_of_raw``.
    emb_type : str, default "off"
        ``"tf_idf"`` -- dense tf-idf matrix;
        ``"tf"``     -- dense term-count matrix passed through ``normalize``;
        ``"lsa"``    -- 100-component output of ``LSA``;
        ``"off"``    -- empty ``DataFrame``.
        Any other value prints a notice and is treated as ``"off"``.
    stopword : bool, default True
        Forwarded to ``com_func.clean_batch_of_raw``.

    Returns
    -------
    tuple
        ``(result_vector, average_token_size)`` where the second item is the
        mean of the per-sample sizes reported by the cleaning step.
    """
    cleaned_token, sample_size = com_func.clean_batch_of_raw(raw_textual_content, stopword=stopword)
    average_token_size = sum(sample_size) / len(sample_size)
    print("Minimal token size: ", min(sample_size))
    print("maximal token size: ", max(sample_size))

    if emb_type == "tf_idf":
        # tf-idf weighted document-term matrix
        vectorizer = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=True, tokenizer=dummy,
                                     preprocessor=dummy, stop_words=None, min_df=cutoff)
        print(vectorizer)
        result_vector = vectorizer.fit_transform(cleaned_token).toarray()
    elif emb_type == "tf":
        # document-term frequency matrix, normalized
        term_counts = CountVectorizer(tokenizer=dummy, preprocessor=dummy, min_df=cutoff).fit_transform(cleaned_token)
        result_vector = normalize(term_counts.toarray())
    elif emb_type == "lsa":
        # latent semantic analysis
        result_vector = LSA(cleaned_token, dim=100)
    else:
        # unknown types fall back to the default ("off") after a notice
        if not (emb_type == "off"):
            print("Embedding type not available, selecting default setting")
        result_vector = pd.DataFrame()
    return result_vector, average_token_size

In [14]:
# load the file
import sys
import io
import os
import collections
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# fix random seed for reproducibility
np.random.seed(1)

fileDir = "../../Data/"+Dataset+"/canopies_labeled/"
listfiles = os.listdir(fileDir)

# For every textual feature and every name group (one labeled file each),
# build the feature matrix and, per author above the paper threshold, a
# binary positive/negative labeling aligned with that matrix.
# The classifier training and the result export are still commented out.
train_feature = ["keywords_mesh","abstract", "combine_textual"]
for feature in train_feature:
    # collect statistic to output
    # NOTE(review): average_token_size gets one entry per name group while the
    # other lists get one entry per author; the disabled export below would
    # need equal lengths.
    allname = []
    average_token_size = []
    positive_sample_size = []
    negative_sample_size = []

    all_mnb_accuracy = []
    all_mnb_f1 = []
    all_LR_accuracy = []
    all_LR_f1 = []
    all_svcLinear_accuracy = []
    all_svcLinear_f1 = []

    # read all file in labeled group
    for file in listfiles:
        # group name: second and last "_"-separated parts of the file name
        name_parts = file.split("_")
        name = name_parts[1]+"_"+name_parts[-1]
        print("For name: ",name)
        # read needed content in labeled file
        labeled_data = read_labeled_file(fileDir+file)
        # merge textual from all raw data to labeled dataset
        labeled_data = pd.merge(left=labeled_data,right=all_text_content, how='left', left_on='paperID', right_on='paperID')
        # collect all labeled sample
        all_labeled_sample = labeled_data["paperID"].tolist()
        print("total sample size before apply threshold: ",len(labeled_data))
        # count number of paper each author write based on author ID
        paperCounter = collections.Counter(labeled_data["authorID"])
        print(paperCounter)
        # drop authors with fewer than `threshold` papers
        for k in list(paperCounter):
            if paperCounter[k] < threshold:
                del paperCounter[k]
        kept_authors = list(paperCounter.keys())
        print(kept_authors)
        # sorted so authors are visited in the same order on every run
        # (plain set iteration order of strings changes between processes)
        kept_rows = labeled_data[labeled_data.authorID.isin(kept_authors)]
        author_list = sorted(set(kept_rows["authorID"]))
        # if only have one class or no class pass the threshold, not applicable
        if len(paperCounter) < 2:
            print(name," pass")
        else:
            # for each name group
            # (the former train_test_split call was removed: its result was
            # discarded, so it had no effect)
            # read in data in name group
            group_pid = labeled_data["paperID"]
            # list of different data field
            part_collection = []
            # select feature wanted to fit to clustering/classification algorithm
            data_textual, data_token_size = raw_text_to_vector(labeled_data[feature], emb_type=pp_textual)
            average_token_size.append(data_token_size)
            print(data_textual.shape)
            part_collection.append(data_textual)
            # merge different part of data data together by concatenate it all together
            # remove empty emb (when emb set off)
            part_collection = [part for part in part_collection if len(part)!=0]
            print(len(part_collection))
            if len(part_collection)>1:
                combinedata = np.concatenate(part_collection,axis=1)
            elif len(part_collection)==1:
                if isinstance(part_collection[0], pd.DataFrame):
                    combinedata = part_collection[0].values
                else:
                    combinedata = part_collection[0]
            else:
                # nothing to train on (e.g. embedding switched off); stop
                # iterating over the files of this feature
                print("No data available")
                break
            print(combinedata.shape)
            print(combinedata[0])

            svcModels = []
            lrModels = []
            counter = 0
            # loop through each author and train classifier
            for author in author_list:
                author_name = name+'_'+str(counter)
                allname.append(author_name)
                print(author_name)
                mask = labeled_data["authorID"] == author
                author_rows = labeled_data[mask]
                positive_sample_pid = author_rows["paperID"].tolist()
                negative_sample_pid = extractNegativeSample(positive_sample_pid, all_labeled_sample)
                # append to statistic collection
                positive_sample_size.append(len(positive_sample_pid))
                negative_sample_size.append(len(negative_sample_pid))
                # form positive and negative (negative class come from similar name group)
                # label 0 = papers of this author, label 1 = all other papers
                all_authors = [positive_sample_pid, negative_sample_pid]
                appended_data = []
                for class_label, class_pids in enumerate(all_authors):
                    # create df save one author data
                    authordf = pd.DataFrame({"paperID":class_pids})
                    authordf['label'] = class_label
                    appended_data.append(authordf)
                processed_data = pd.concat(appended_data, axis=0,ignore_index=True)

                # alignment: reorder the labels to follow group_pid, i.e. the
                # row order of combinedata
                processed_data = pd.merge(group_pid, processed_data, on="paperID")
                # extract true label and its corresponding pid AFTER the
                # alignment so they match combinedata row by row
                label = processed_data["label"]
                pid = processed_data["paperID"]

                print(processed_data[:50])
                print(group_pid[:50])


#                 # using converted feature vector to train classifier
#                 # using logistic regression
#                 clf = LogisticRegression()
#                 clf.fit(combinedata)
#                 svcModels.append()
#                 # using SVM with linear kernal
#                 clf = SVC(kernel='linear')
#                 svcaccuracy, svcmarcof1 = k_fold_cv(combinedata, label, clf, k=10)
#                 print("svc Accuracy: ",svcaccuracy)
#                 print("svc F1: ", svcmarcof1)
#                 all_svcLinear_accuracy.append(svcaccuracy)
#                 all_svcLinear_f1.append(svcmarcof1)
                # advance the per-group author index so every author gets a
                # distinct "<name>_<index>" label
                counter += 1
            # NOTE(review): stops after the first name group that passes the
            # threshold -- looks like a debugging short-circuit; remove to
            # process every file.
            break
#     # write evaluation result to excel
#     output = pd.DataFrame({'Author Name':allname, "sample average token":average_token_size,
#                            "positive sample size":positive_sample_size,"negative sample size":negative_sample_size, 
#                            "MNB Accuracy":all_mnb_accuracy, "MNB F1": all_mnb_f1, 
#                            "svc(linear) accuracy":all_svcLinear_accuracy, "svc(linear) f1": all_svcLinear_f1, 
#                            "logistic regression accuracy":all_LR_accuracy, "logistic regression f1": all_LR_f1})

#     savePath = "../../result/"+Dataset+"/binary_clf/"+feature+"/"
#     if not os.path.exists(savePath):
#         os.makedirs(savePath)
#     filename = "textual="+pp_textual+"_threshold="+str(threshold)+".csv"
#     output.to_csv(savePath+filename, encoding='utf-8',index=False)
#     print(feature, " Done")

For name:  j_read
total sample size before apply threshold:  136
Counter({'0000-0002-5159-1192': 57, '0000-0002-9029-5185': 39, '0000-0002-9697-0962': 31, '0000-0002-4739-9245': 3, '0000-0003-0605-5259': 3, '0000-0003-4316-7006': 1, '0000-0002-0784-0091': 1, '0000-0002-3888-6631': 1})
['0000-0002-9697-0962', '0000-0002-9029-5185', '0000-0002-5159-1192']


NameError: name 'y' is not defined

In [None]:
from sklearn.preprocessing import normalize