In [2]:
# load the file
import sys
import io
import os

# Directory that holds the labeled canopy files (tab-separated, one record per line).
fileDir = "../Data/canopies_labeled/"
# os.listdir() returns entries in arbitrary order; sorting makes the record order
# (and every order-dependent step that follows) reproducible between runs.
listfiles = sorted(os.listdir(fileDir))
LabeledRecords_original = []

for file in listfiles:
    filePath = os.path.join(fileDir, file)
    # Skip dot-prefixed entries and anything that is not a regular file
    # (open() would fail on a sub-directory).
    if file.startswith('.') or not os.path.isfile(filePath):
        continue
    # The with-statement closes the file; no explicit close() is needed.
    with open(filePath, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # Get rid of badly formatted lines: a usable line has exactly 13 fields.
            if len(read_data) != 13:
                continue
            LabeledRecords_original.append({
                "paperID": read_data[0], "authorID": read_data[1],
                "author_name": read_data[4], "co_authors": read_data[5],
                "department": read_data[6], "vendor": read_data[7],
                "publish_year": read_data[10], "citation graph": read_data[11],
            })
print("Total labeled records:",len(LabeledRecords_original), "records")

Total labeled records: 140266 records


In [3]:
# Show the column names, taken from the keys of the first loaded record.
for column in LabeledRecords_original[0].keys():
    print(column)

paperID
authorID
author_name
co_authors
department
vendor
publish_year
citation graph


In [4]:
# Merge author names by author ID: when an ID appears with both an abbreviated
# and a fuller name, every record of that ID gets the longest name seen.
# Assumption: the author ID is a unique identifier, i.e. each author has exactly one ID.

print(LabeledRecords_original[:3])

authorIDs = set(record['authorID'] for record in LabeledRecords_original)

# Pass 1: longest name per author ID, found in a single scan of the records.
# The strict ">" keeps the first name encountered among equally long candidates.
longestNameByID = {}
for record in LabeledRecords_original:
    if len(record['author_name']) > len(longestNameByID.get(record['authorID'], "")):
        longestNameByID[record['authorID']] = record['author_name']

# Pass 2 (process 1): build NEW record dicts carrying the merged name.
# LabeledRecords_original is left untouched, so this cell can be re-run safely
# and the raw records stay available for inspection.
LabeledRecords_processed1 = [
    dict(record, author_name=longestNameByID.get(record['authorID'], ""))
    for record in LabeledRecords_original
]

print(LabeledRecords_processed1[:3])

[{'paperID': '27406695', 'authorID': '0000-0002-9697-0962', 'author_name': 'jonathan m read', 'co_authors': 'truelove zhu lessler riley wang kwok guan jiang cummings', 'department': 'department epidemiology public health institute infection global health university liverpool neston uk', 'vendor': '27279', 'publish_year': '2016', 'citation graph': '10.3345/kjp.2012.55.12.474$$10.1016/j.vaccine.2011.11.019$$10.1007/s00430-010-0143-4$$10.1080/15388220801955596$$10.1017/s0022172400022610$$10.1038/ncomms1432$$10.1016/j.vaccine.2007.02.039$$10.1128/cvi.00278-15$$10.3201/eid1508.081754'}, {'paperID': '26876744', 'authorID': '0000-0002-9697-0962', 'author_name': 'j m read', 'co_authors': 'hungerford cooke vivancos iturriza-gómara allen french cunliffe', 'department': 'chicas group lancaster medical school faculty health medicine lancaster university lancaster uk', 'vendor': '11865', 'publish_year': '2016', 'citation graph': '10.3201/eid1601.090401$$10.1016/j.jcv.2008.12.001$$10.1136/bmjopen-20

In [5]:
# Remove authors that don't have a full name.
# process 2: drop every record whose author's first name part is not spelled out.
authorNames = set([records["author_name"] for records in LabeledRecords_processed1])

fullNameList = []
# Number of distinct names rejected because the first name part is too short.
counter = 0
for author in authorNames:
    name_part = author.split(" ")
    # A first part shorter than 2 characters is assumed to be an initial, not a full name.
    if len(name_part[0]) < 2:
        counter += 1
        continue
    # Names with a suspiciously short last part are reported but still kept.
    if len(name_part[-1]) < 2:
        print(name_part)
        print(author)
    fullNameList.append(author)

# Set membership keeps the filter linear in the number of records.
fullNameSet = set(fullNameList)
LabeledRecords_processed2 = [
    record for record in LabeledRecords_processed1
    if record["author_name"] in fullNameSet
]
print("Number of records: ",len(LabeledRecords_processed2))

Number of records:  138896


In [6]:
from collections import Counter

# Create the author ID to name map: a list with one
# {"author_name", "author_ID"} dict per distinct author ID, in order of first
# appearance in LabeledRecords_processed2.
author_ID_to_name_map = []
# IDs already added; a set lookup avoids rebuilding the ID list for every record.
seenAuthorIDs = set()

for record in LabeledRecords_processed2:
    if record["authorID"] not in seenAuthorIDs:
        seenAuthorIDs.add(record["authorID"])
        author_ID_to_name_map.append({"author_name": record["author_name"], "author_ID": record["authorID"]})

print("Number of author: ", len(author_ID_to_name_map))

Number of author:  9482


In [7]:
# Preview the first five entries of the author ID / name list.
author_map_preview = author_ID_to_name_map[:5]
print(author_map_preview)

[{'author_name': 'jonathan m read', 'author_ID': '0000-0002-9697-0962'}, {'author_name': 'jenny c a read', 'author_ID': '0000-0002-9029-5185'}, {'author_name': 'james read', 'author_ID': '0000-0003-4316-7006'}, {'author_name': 'jazlyn read', 'author_ID': '0000-0002-0784-0091'}, {'author_name': 'jordan s read', 'author_ID': '0000-0002-3888-6631'}]


In [8]:
# Extract authors that share a name but have different author IDs.
# Group the entries by name once instead of comparing every pair of authors.
# dicts keep insertion order (Python 3.7+), so groups come out in order of
# first appearance and members keep their order from author_ID_to_name_map.
authorsByName = {}
for author in author_ID_to_name_map:
    authorsByName.setdefault(author["author_name"], []).append(author)

sameNameAuthor = []
seenSameNameIDs = set()
for group in authorsByName.values():
    # A name is ambiguous only if at least two different IDs carry it.
    if len({author["author_ID"] for author in group}) < 2:
        continue
    for author in group:
        if author["author_ID"] not in seenSameNameIDs:
            seenSameNameIDs.add(author["author_ID"])
            # Append the very same dict object: later cells annotate it in place.
            sameNameAuthor.append(author)
print(len(sameNameAuthor))
# Count how many authors share each name.
sameNameCounter = Counter(author["author_name"] for author in sameNameAuthor)
for key, value in sameNameCounter.items():
    if value>=2:
        print(value, "people have same name:", key)

# for author in sameNameAuthor:
#     print("Author: {k}, Author ID: {c}".format(k=author["author_name"], c=author["author_ID"]))    

1051
2 people have same name: james read
2 people have same name: francisco esteves
2 people have same name: jason williams
2 people have same name: tiago santos
2 people have same name: kyung su kim
2 people have same name: davide ricci
2 people have same name: muhammad nawaz
2 people have same name: helena pereira
2 people have same name: amit patel
2 people have same name: gabriele guidi
2 people have same name: francisco ortega
2 people have same name: richard w morris
2 people have same name: robert h morris
3 people have same name: hao song
2 people have same name: ming-kai pan
2 people have same name: jing huang
2 people have same name: jian huang
2 people have same name: jun huang
2 people have same name: paulo f santos
2 people have same name: pedro santos
2 people have same name: david a ross
2 people have same name: qi wang
3 people have same name: qian wang
3 people have same name: quan wang
3 people have same name: qiang wang
2 people have same name: rachel bennett
2 peopl

In [9]:
# Keep only same-name authors who wrote MORE than `threshold` papers, and only
# names that are still shared by at least two such authors.
# Set up the threshold (strict comparison: an author needs threshold + 1 papers).
threshold = 10

# Count the number of papers per author ID.
paperCounter = Counter(record["authorID"] for record in LabeledRecords_processed2)
print(paperCounter.most_common(3))
# Attach the paper count to every same-name author. A Counter returns 0 for an
# unknown ID, so an author without records is filtered out below instead of
# breaking the sort with a missing key.
for author in sameNameAuthor:
    author["paper_count"] = paperCounter[author["author_ID"]]

sameNameAuthorWithCount = sorted(sameNameAuthor, key=lambda k: (k['author_name'], k["paper_count"]))

# Authors with a repeated name who wrote more papers than the threshold.
temp = [author for author in sameNameAuthorWithCount if author["paper_count"] > threshold]

# How many prolific authors carry each name (insertion order = sorted name order).
prolificNameCounts = Counter(author["author_name"] for author in temp)
# Every name that appears among the prolific authors.
uniqueAuthors = list(prolificNameCounts)
# Names still shared by two or more prolific authors.
authorFilter = [name for name, count in prolificNameCounts.items() if count > 1]

# Collect the filtered data: prolific authors whose name is still ambiguous.
ambiguousNames = set(authorFilter)
filteredAuthors = [author for author in temp if author["author_name"] in ambiguousNames]

for author in filteredAuthors:
    print("Author name: {n}, Author: {k}, Paper count: {c}".format(n=author["author_name"],k=author["author_ID"], c= author["paper_count"]))
print(len(filteredAuthors))

[('0000-0002-2381-2349', 587), ('0000-0002-9955-6003', 487), ('0000-0002-4295-6129', 423)]
Author name: alfredo martinez, Author: 0000-0003-4882-4044, Paper count: 17
Author name: alfredo martinez, Author: 0000-0002-4804-6687, Paper count: 20
Author name: amit patel, Author: 0000-0003-3874-3216, Paper count: 11
Author name: amit patel, Author: 0000-0001-7214-5901, Paper count: 18
Author name: ana castro, Author: 0000-0003-4035-3444, Paper count: 11
Author name: ana castro, Author: 0000-0001-6964-6879, Paper count: 13
Author name: ana castro, Author: 0000-0001-7526-6717, Paper count: 39
Author name: anna ferrari, Author: 0000-0002-7022-9906, Paper count: 17
Author name: anna ferrari, Author: 0000-0001-9536-3995, Paper count: 49
Author name: bin liu, Author: 0000-0002-5836-2333, Paper count: 14
Author name: bin liu, Author: 0000-0002-0956-2777, Paper count: 97
Author name: carmen moreno, Author: 0000-0002-1660-7072, Paper count: 13
Author name: carmen moreno, Author: 0000-0003-0541-4846,

In [10]:
# Collect the records of the selected same-name authors; their paper IDs are
# written to files under newfileDir in a later cell.
newfileDir = "../Data/filteredSameNameAuthor/"
# exist_ok=True creates the directory when missing and is a no-op when it
# already exists (no separate existence check needed).
os.makedirs(newfileDir, exist_ok=True)

# Index the records by author ID once so each author is a dictionary lookup
# instead of a scan over all records.
recordsByAuthorID = {}
for record in LabeledRecords_processed2:
    recordsByAuthorID.setdefault(record["authorID"], []).append(record)

# process 3: records of the selected authors, grouped in filteredAuthors order,
# each group in the original record order.
LabeledRecords_processed3 = []
for author in filteredAuthors:
    LabeledRecords_processed3.extend(recordsByAuthorID.get(author["author_ID"], []))
    

In [11]:
# Number of selected records before de-duplication.
print(len(LabeledRecords_processed3))
# Collapse records that share (author name, author ID, department): a later
# record replaces an earlier one under the same key, while the key keeps the
# position of its first occurrence.
latest_by_key = {}
for rec in LabeledRecords_processed3:
    latest_by_key[(rec['author_name'], rec['authorID'], rec["department"])] = rec
NoDuplicateRecord = list(latest_by_key.values())
print(len(NoDuplicateRecord))
for record in NoDuplicateRecord:
    # Skip records whose department is empty.
    if record["department"] == "":
        continue
    print("Author name: {n}, Author: {k}, department: {d}".format(n=record["author_name"], k=record["authorID"], d= record["department"]))


3950
1147
Author name: alfredo martinez, Author: 0000-0003-4882-4044, department: biomarkers prevention research branch national cancer institute national institutes health rockville md 208503300 usa martinezabprbncinihgov
Author name: alfredo martinez, Author: 0000-0002-4804-6687, department: departamento ingeniería celular biocatálisis instituto biotecnología universidad nacional autónoma méxico apdo postal 5103 cuernavaca morelos cp 62271 mexico alfredoibtunammx
Author name: alfredo martinez, Author: 0000-0002-4804-6687, department: departamento ingeniería celular biocatálisis instituto biotecnología universidad nacional autónoma méxico p 5103 cuernavaca morelos 62250 méxico
Author name: alfredo martinez, Author: 0000-0002-4804-6687, department: institute biotechnology national autonomous university mexico cuernavaca morelos mexico
Author name: alfredo martinez, Author: 0000-0002-4804-6687, department: institute food agricultural sciences department microbiology cell science univers

Author name: jin young kim, Author: 0000-0003-4085-293X, department: lash miller chemical laboratories center quantum information quantum control university toronto ontario m5s 3h6 canada
Author name: john f marshall, Author: 0000-0002-0494-2295, department: barts cancer institute barts london queen marys school medicine dentistry queen mary university london london uk jfmarshallqmulacuk
Author name: john f marshall, Author: 0000-0002-8344-2589, department: equine clinical sciences division weipers centre equine hospital school veterinary medicine university glasgow uk
Author name: john f marshall, Author: 0000-0002-8344-2589, department: weipers centre equine hospital school veterinary medicine college medical veterinary life sciences university glasgow glasgow uk electronic address johnfmarshallglasgowacuk
Author name: john f marshall, Author: 0000-0002-8344-2589, department: weipers centre equine hospital school veterinary medicine university glasgow glasgow united kingdom
Author na

In [12]:
# `comparter` is a second reference to the filteredAuthors list (not a copy).
comparter = filteredAuthors
# Distinct author names among the selected same-name authors.
nameSet = {author["author_name"] for author in filteredAuthors}

In [13]:
# Show the full name set, five of its names, and the first five selected authors,
# one per output line.
print(nameSet, list(nameSet)[:5], comparter[:5], sep="\n")

{'martin wagner', 'yong liu', 'jeong hwan kim', 'jun chen', 'kevin m. ryan', 'robert j young', 'giovanni volpe', 'lei wang', 'john f marshall', 'yong wang', 'fang liu', 'jun zhang', 'ana castro', 'michael wagner', 'wei lu', 'cheng luo', 'yang zhao', 'lin yang', 'kyung su kim', 'sebastian wolf', 'bin liu', 'vivek gupta', 'jeremy m brown', 'feng xu', 'hao song', 'wei xu', 'marco ferrari', 'xin li', 'yu zhang', 'carmen moreno', 'feng liu', 'anna ferrari', 'hong yang', 'alfredo martinez', 'vineet gupta', 'vivek kumar', 'chung-may yang', 'qian wang', 'jie zhang', 'qin li', 'jong hee chang', 'mikael svensson', 'wei wang', 'chao liu', 'luís alves', 'jin young kim', 'carmen torres', 'richard w morris', 'francisco j blanco', 'ying liu', 'peng zhang', 'qiang wang', 'yang wang', 'amit patel', 'yongsheng liu', 'francisco esteves', 'ying zhang', 'jacob john', 'marta crespo', 'yu-jun zhao', 'pei-ming yang', 'david g lloyd'}
['martin wagner', 'yong liu', 'jeong hwan kim', 'jun chen', 'kevin m. ryan']

In [15]:
# Final step: write the paper IDs of same-name authors to files.
# For every shared name this writes
#   <name>.txt   - paper IDs of ALL selected authors with that name (negative sample)
#   <name><k>.txt - paper IDs of the k-th author with that name (positive sample)

# Index the paper IDs once, by author name and by author ID, so writing a file
# does not require rescanning every record.
paperIDsByName = {}
paperIDsByAuthorID = {}
for record in LabeledRecords_processed3:
    paperIDsByName.setdefault(record["author_name"], []).append(record["paperID"])
    paperIDsByAuthorID.setdefault(record["authorID"], []).append(record["paperID"])

for name in nameSet:
    # All records belonging to the same name go to one file (negative sample).
    # The path is still passed as UTF-8 bytes so the file names on disk do not change.
    # The with-statement closes the file even if a write fails.
    with open((newfileDir+name+".txt").encode('utf-8'), "w",encoding='utf8') as newf:
        newf.writelines(paperID+"\n" for paperID in paperIDsByName.get(name, []))

    # Each author's records, keyed by author ID, go to a numbered file (positive sample).
    idList = []
    for author in filteredAuthors:
        if (author["author_name"]==name and author["author_ID"] not in idList):
            # len(idList) is the running index of this author among those sharing the name.
            with open((newfileDir+name+str(len(idList))+".txt").encode('utf-8'), "w",encoding='utf8') as newf:
                newf.writelines(paperID+"\n" for paperID in paperIDsByAuthorID.get(author["author_ID"], []))
            idList.append(author["author_ID"])
    print(idList)

['0000-0002-9831-9110', '0000-0002-4402-3234']
['0000-0001-8181-1080', '0000-0002-4638-0788']
['0000-0002-8383-8524', '0000-0003-2068-7287']
['0000-0002-3850-4875', '0000-0002-4114-3046', '0000-0002-8021-7458']
['0000-0003-3670-8505', '0000-0002-1059-9681']
['0000-0001-6073-9489', '0000-0001-7003-3017']
['0000-0001-5057-1846', '0000-0001-9993-5348']
['0000-0002-6156-9028', '0000-0002-1919-9107', '0000-0002-5859-2526', '0000-0003-3870-3388']
['0000-0002-0494-2295', '0000-0002-8344-2589']
['0000-0002-9893-8296', '0000-0001-8043-5757']
['0000-0002-8325-1213', '0000-0003-3028-5927']
['0000-0002-7068-5135', '0000-0002-9831-6796']
['0000-0003-4035-3444', '0000-0001-6964-6879', '0000-0001-7526-6717']
['0000-0003-3421-4763', '0000-0003-2589-6440', '0000-0002-9778-7684']
['0000-0001-5358-305X', '0000-0001-6722-1527']
['0000-0003-2193-3670', '0000-0003-0524-5886']
['0000-0003-0302-3470', '0000-0002-7916-8687']
['0000-0002-1698-6666', '0000-0002-5964-3233']
['0000-0002-3897-0278', '0000-0002-7991