In [1]:
# load the file
import sys
import io
import os

fileDir = "../Data/canopies_labeled/"
# os.listdir() returns entries in arbitrary order; sorting makes the record
# order (and anything later derived from "first record wins") reproducible.
listfiles = sorted(os.listdir(fileDir))
LabeledRecords_original = []

for file in listfiles:
    filePath = os.path.join(fileDir, file)
    # skip hidden entries (names starting with '.') and anything that is not
    # a regular file, e.g. a sub-directory, which open() could not read
    if file.startswith('.') or not os.path.isfile(filePath):
        continue
    # the with-statement closes the file on exit, so no explicit close() is needed
    with open(filePath, 'r', encoding='utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines: a valid line has exactly 13 tab-separated fields
            if len(read_data) != 13:
                continue
            paper_detail = {"paperID": read_data[0], "authorID": read_data[1],
                            "author_name": read_data[4], "co_authors": read_data[5],
                            "department": read_data[6], "vendor": read_data[7],
                            "publish_year": read_data[10], "citation graph": read_data[11]}
            LabeledRecords_original.append(paper_detail)
print("Total labeled records:",len(LabeledRecords_original), "records")

Total labeled records: 140266 records


In [2]:
# count the distinct paperID values across all loaded records
unique_paper_ids = {entry['paperID'] for entry in LabeledRecords_original}
print(len(unique_paper_ids))

135796


In [3]:
# list the field names of a record, taken from the first loaded record
first_record = LabeledRecords_original[0]
for column_name in first_record.keys():
    print(column_name)

paperID
authorID
author_name
co_authors
department
vendor
publish_year
citation graph


In [5]:
# merge author names based on their id: replace an abbreviated name with the full name when one is available
# method: for every authorID, find the longest author_name and use it for all records of that authorID
# assumption: authorID is a unique identifier, i.e. each author has exactly one authorID

print(LabeledRecords_original[:3])
# process 1: find the longest name per authorID, then rewrite every record with it
# Copy each record dict so that LabeledRecords_original keeps the names exactly
# as loaded (a plain assignment would alias the same list and dicts).
LabeledRecords_processed1 = [dict(record) for record in LabeledRecords_original]
authorIDs = set(record['authorID'] for record in LabeledRecords_original)

# authorID -> longest name seen so far, built in a single pass over the records
# instead of rescanning every record once per author.
# The strict '>' keeps the first-seen name when several names tie for longest.
longestNameByID = {}
for record in LabeledRecords_original:
    if len(record['author_name']) > len(longestNameByID.get(record['authorID'], "")):
        longestNameByID[record['authorID']] = record['author_name']

# replace every name with the longest name recorded for the same authorID
# (falls back to "" when an author only ever has empty names)
for record in LabeledRecords_processed1:
    record['author_name'] = longestNameByID.get(record['authorID'], "")

print(LabeledRecords_processed1[:3])

[{'paperID': '27406695', 'authorID': '0000-0002-9697-0962', 'author_name': 'jonathan m read', 'co_authors': 'truelove zhu lessler riley wang kwok guan jiang cummings', 'department': 'department epidemiology public health institute infection global health university liverpool neston uk', 'vendor': '27279', 'publish_year': '2016', 'citation graph': '10.3345/kjp.2012.55.12.474$$10.1016/j.vaccine.2011.11.019$$10.1007/s00430-010-0143-4$$10.1080/15388220801955596$$10.1017/s0022172400022610$$10.1038/ncomms1432$$10.1016/j.vaccine.2007.02.039$$10.1128/cvi.00278-15$$10.3201/eid1508.081754'}, {'paperID': '26876744', 'authorID': '0000-0002-9697-0962', 'author_name': 'jonathan m read', 'co_authors': 'hungerford cooke vivancos iturriza-gómara allen french cunliffe', 'department': 'chicas group lancaster medical school faculty health medicine lancaster university lancaster uk', 'vendor': '11865', 'publish_year': '2016', 'citation graph': '10.3201/eid1601.090401$$10.1016/j.jcv.2008.12.001$$10.1136/bmj

In [6]:
# remove authors that don't have a full last name
# process 2: keep only records whose author name ends in a token of at least 2 characters
authorNames = set(record["author_name"] for record in LabeledRecords_processed1)
# names that qualify (last token has 2 or more characters)
fullNameList = []
for author in authorNames:
    name_part = author.split(" ")
    if len(name_part[-1]) >= 2:
        fullNameList.append(author)
        # show qualifying names whose first token is the literal "unknown"
        if name_part[0] == "unknown":
            print(name_part)
    else:
        # show the names that are filtered out
        print(author)
# number of qualifying names
counter = len(fullNameList)

# Test membership against a set: checking the list would rescan it for every record.
fullNameSet = set(fullNameList)
LabeledRecords_processed2 = [record for record in LabeledRecords_processed1
                             if record["author_name"] in fullNameSet]
print("Number of records have last name: ",len(LabeledRecords_processed2))

Number of records have last name:  140266


In [9]:
# create author ID to name map:
# one {"author_name", "author_ID"} entry per distinct authorID, in first-seen order
author_ID_to_name_map = []
# IDs already added; a set lookup replaces rebuilding the list of IDs for every record
seenAuthorIDs = set()

for record in LabeledRecords_processed2:
    if record["authorID"] not in seenAuthorIDs:
        seenAuthorIDs.add(record["authorID"])
        author_ID_to_name_map.append({"author_name": record["author_name"], "author_ID": record["authorID"]})

print("Number of author: ", len(author_ID_to_name_map))

Number of author:  9914


In [10]:
# preview the first five entries of the author ID -> name map
print(author_ID_to_name_map[0:5])

[{'author_name': 'jonathan m read', 'author_ID': '0000-0002-9697-0962'}, {'author_name': 'jenny c a read', 'author_ID': '0000-0002-9029-5185'}, {'author_name': 'james read', 'author_ID': '0000-0003-4316-7006'}, {'author_name': 'jazlyn read', 'author_ID': '0000-0002-0784-0091'}, {'author_name': 'jordan s read', 'author_ID': '0000-0002-3888-6631'}]


In [11]:
from collections import Counter
# extract authors that share a name but have different author IDs
# Group the map entries by name in one pass instead of comparing every pair:
# name -> {author_ID -> entry}. Dicts keep insertion order (Python 3.7+), so
# names appear in first-seen order and entries keep their order in the map.
entriesByName = {}
for entry in author_ID_to_name_map:
    entriesByName.setdefault(entry["author_name"], {}).setdefault(entry["author_ID"], entry)

# keep every entry whose name is carried by at least two distinct author IDs
sameNameAuthor = []
for entriesByID in entriesByName.values():
    if len(entriesByID) >= 2:
        sameNameAuthor.extend(entriesByID.values())
print("Total number of same name author: ", len(sameNameAuthor))
# count how many author IDs share each name
sameNameCounter = Counter(author["author_name"] for author in sameNameAuthor)
counter = 0
for key, value in sameNameCounter.items():
    if value>=2:
        counter+=1
        print(value, "people have same name:", key)
print("Total number of name been repeated: ", counter)
print("Top 10 most repeated names: ")
print(sameNameCounter.most_common(10))

Total number of same name author:  1080
2 people have same name: james read
2 people have same name: francisco esteves
2 people have same name: jason williams
2 people have same name: tiago santos
2 people have same name: kyung su kim
2 people have same name: davide ricci
2 people have same name: muhammad nawaz
2 people have same name: helena pereira
2 people have same name: amit patel
2 people have same name: gabriele guidi
2 people have same name: francisco ortega
2 people have same name: richard w morris
2 people have same name: robert h morris
3 people have same name: hao song
2 people have same name: ming-kai pan
2 people have same name: jing huang
2 people have same name: jian huang
2 people have same name: jun huang
2 people have same name: paulo f santos
2 people have same name: pedro santos
2 people have same name: david a ross
2 people have same name: qi wang
3 people have same name: qian wang
3 people have same name: quan wang
3 people have same name: qiang wang
2 people hav

In [22]:
# among authors that share a name, keep only the names for which at least two
# author IDs each wrote strictly more than `threshold` papers
# set up threshold
threshold = 10

# count the number of papers per author ID
paperCounter = Counter(record["authorID"] for record in LabeledRecords_processed2)
print(paperCounter.most_common(3))
# attach the paper count to every same-name author by direct lookup
# (a Counter returns 0 for an ID it has never seen)
for author in sameNameAuthor:
    author["paper_count"] = paperCounter[author["author_ID"]]

sameNameAuthorWithCount = sorted(sameNameAuthor, key=lambda k: (k['author_name'], k["paper_count"]))
# authors above the threshold, still sorted by (name, paper count)
temp = [author for author in sameNameAuthorWithCount if author["paper_count"] > threshold]

# how many above-threshold author IDs remain for each name
nameCounts = Counter(author["author_name"] for author in temp)
# every name that survived the threshold, in first-seen order
uniqueAuthors = list(nameCounts)
# names still shared by at least two author IDs after the threshold
authorFilter = [name for name, count in nameCounts.items() if count >= 2]

# collect filtered data
repeatedNames = set(authorFilter)
filteredAuthors = [author for author in temp if author["author_name"] in repeatedNames]
filteredAuthorName = set()
for author in filteredAuthors:
    filteredAuthorName.add(author["author_name"])
    print("Author name: {n}, Author: {k}, Paper count: {c}".format(n=author["author_name"],k=author["author_ID"], c= author["paper_count"]))
print("Total number of authors: ",len(filteredAuthors))
print("Total number of names: ",len(filteredAuthorName))

[('0000-0002-2381-2349', 587), ('0000-0002-9955-6003', 487), ('0000-0002-4295-6129', 423)]
Author name: alfredo martinez, Author: 0000-0003-4882-4044, Paper count: 17
Author name: alfredo martinez, Author: 0000-0002-4804-6687, Paper count: 20
Author name: amit patel, Author: 0000-0003-3874-3216, Paper count: 11
Author name: amit patel, Author: 0000-0001-7214-5901, Paper count: 18
Author name: ana castro, Author: 0000-0003-4035-3444, Paper count: 11
Author name: ana castro, Author: 0000-0001-6964-6879, Paper count: 13
Author name: ana castro, Author: 0000-0001-7526-6717, Paper count: 39
Author name: anna ferrari, Author: 0000-0002-7022-9906, Paper count: 17
Author name: anna ferrari, Author: 0000-0001-9536-3995, Paper count: 49
Author name: bin liu, Author: 0000-0002-5836-2333, Paper count: 14
Author name: bin liu, Author: 0000-0002-0956-2777, Paper count: 97
Author name: carmen moreno, Author: 0000-0002-1660-7072, Paper count: 13
Author name: carmen moreno, Author: 0000-0003-0541-4846,

In [17]:
# collect the records of the selected same-name authors; their paper ids are written to files under newfileDir
newfileDir = "../Data/filteredSameNameAuthor/filter="+str(threshold)+"/"
# exist_ok=True avoids the check-then-create race and keeps the cell safe to re-run
os.makedirs(newfileDir, exist_ok=True)
# process 3: get the selected authors' records from all records
# Index the records by authorID once instead of rescanning every record per author.
recordsByAuthorID = {}
for record in LabeledRecords_processed2:
    recordsByAuthorID.setdefault(record["authorID"], []).append(record)

# authors in filteredAuthors order; each author's records keep their original order
LabeledRecords_processed3 = []
for author in filteredAuthors:
    LabeledRecords_processed3.extend(recordsByAuthorID.get(author["author_ID"], []))
    

In [18]:
# print details for checking: size before and after de-duplication
print(len(LabeledRecords_processed3))
# keep one record per (author name, author ID, department) combination;
# a later record with the same key replaces the earlier one in place
deduped_by_key = {}
for rec in LabeledRecords_processed3:
    deduped_by_key[(rec['author_name'], rec['authorID'], rec["department"])] = rec
NoDuplicateRecord = list(deduped_by_key.values())
print(len(NoDuplicateRecord))
for record in NoDuplicateRecord:
    # skip records with an empty department
    if record["department"] == "":
        continue
    print("Author name: {n}, Author: {k}, department: {d}".format(n=record["author_name"], k=record["authorID"], d= record["department"]))


11533
4528
Author name: a s oliveira, Author: 0000-0001-9287-0959, department: instituto ciências agrárias ambientais universidade federal mato grosso campus sinop 78557267 sinop mt brazil electronic address andresoliufmtbr
Author name: a s oliveira, Author: 0000-0001-9287-0959, department: institute agriculture environmental sciences federal university mato grosso campus sinop sinop mt 78557267 brazil electronic address andresoliufmtbr
Author name: abid ali khan, Author: 0000-0002-9325-6640, department: department mechanical engineering jamia millia islamia new delhi india
Author name: abid ali khan, Author: 0000-0002-9325-6640, department: ergonomics laboratory department mechanical engineering aligarh muslim university aligarh up india
Author name: abid ali khan, Author: 0000-0002-9325-6640, department: department mechanical engineering aligarh muslim university aligarh up india
Author name: alessandro bianchi, Author: 0000-0002-4571-0511, department: department surgery fondazione i

Author name: ching-yao yang, Author: 0000-0001-6312-3719, department: department surgery national taiwan university hospital taipei 10002 taiwan
Author name: ching-yao yang, Author: 0000-0001-6312-3719, department: department surgery national taiwan university hospital national taiwan university college medicine taipei taiwan
Author name: ching-yao yang, Author: 0000-0001-6312-3719, department: department surgery national taiwan university hospital college medicine national taiwan university taipei taiwan
Author name: ching-yao yang, Author: 0000-0001-6312-3719, department: department surgery national taiwan university hospital college medicine national taiwan university 7 chung shan south road taipei taiwan
Author name: ching-yao yang, Author: 0000-0001-6312-3719, department: department surgery national taiwan university hospital national taiwan university college medicine 7 chungshan south rd taipei 10002 taiwan roc
Author name: ching-yao yang, Author: 0000-0001-6312-3719, department

Author name: peng zhang, Author: 0000-0002-5409-7480, department: department chemical engineering university washington seattle 98195 usa
Author name: peng zhang, Author: 0000-0002-5409-7480, department: school materials science engineering tianjin university tianjin 300072 pr china
Author name: peng zhang, Author: 0000-0002-5409-7480, department: school materials science engineering tianjin key laboratory composite functional materials tianjin university tianjin pr china
Author name: peng zhang, Author: 0000-0002-2774-5534, department: key laboratory biological effects nanomaterials nanosafety institute high energy physics chinese academy sciences beijing 100049 china
Author name: peng zhang, Author: 0000-0002-2774-5534, department: key laboratory biological effects nanomaterials nanosafety key laboratory nuclear radiation nuclear energy technology institute high energy physics chinese academy sciences beijing 100049 china
Author name: peng zhang, Author: 0000-0002-2774-5534, departme

In [19]:
# distinct names among the selected authors
nameSet = {author["author_name"] for author in filteredAuthors}
# second reference to the same list object (not a copy)
comparter = filteredAuthors

In [20]:
# sanity check: the whole name set, five of its names, and the first five selected authors
print(nameSet, list(nameSet)[:5], comparter[:5], sep="\n")

{'ana nunes', 'carla silva', 'hao jiang', 'jun young choi', 'jian-min chen', 'a s oliveira', 'yuan yuan', 'hong chen', 'liang wang', 'clara pereira', 'stefan nielsen', 'jin seok kim', 'haifeng shi', 'cheng zhang', 'xue li', 'vishal patel', 'richard j walker', 'jack chen', 'john m boyle', 'han wang', 'feng yu', 'bo yan', 'michael p kelly', 'francisco esteves', 'tae-won kim', 'yongjun zhang', 'lu wang', 'jihye kim', 'isabel carvalho', 'sandeep kumar', 'anabela oliveira', 'manas mukherjee', 'kelly m jones', 'jian gao', 'terence jackson', 'sandeep singh', 'mun-il kang', 'jae-hyun kim', 'zoltán nagy', 'yanbo zhang', 'peng li', 'joana p fernandes', 'feng xu', 'yan lin', 'richard w morris', 'joseph lopez', 'hui jiang', 'hui chen', 'm fernandes', 'y yang', 'bilal ahmed', 'c silva', 'manuela ferreira', 'david j kavanagh', 'xun li', 'anna ferrari', 'joana m dias', 'zhen ma', 'm pilar ruiz', 'sanjiv k mishra', 'michael wagner', 'vivek kumar', 'carlos silva', 'hongjun liu', 'jason williams', 'yong

In [21]:
# final step: write the paper ids of same-name authors into separate files
# Group the paper ids once, by author name and by author ID, so the loops
# below do not rescan every record for each name and each author.
paperIDsByName = {}
paperIDsByAuthorID = {}
for record in LabeledRecords_processed3:
    paperIDsByName.setdefault(record["author_name"], []).append(record["paperID"])
    paperIDsByAuthorID.setdefault(record["authorID"], []).append(record["paperID"])

for name in nameSet:
    # write all records belonging to the same name to one file (the negative sample)
    # NOTE(review): the path is passed as UTF-8 bytes, presumably to cope with
    # non-ASCII names under a non-UTF-8 locale; kept as-is, confirm it is still needed.
    # The with-statement closes the file even if a write fails.
    with open((newfileDir+name+".txt").encode('utf-8'), "w",encoding='utf8') as newf:
        for paperID in paperIDsByName.get(name, []):
            newf.write(paperID+"\n")

    # write each author's records, keyed by author id, to its own numbered file (the positive samples)
    idList = []
    counter = 0
    for author in filteredAuthors:
        if (author["author_name"]==name and author["author_ID"] not in idList):
            with open((newfileDir+name+str(counter)+".txt").encode('utf-8'), "w",encoding='utf8') as newf:
                for paperID in paperIDsByAuthorID.get(author["author_ID"], []):
                    newf.write(paperID+"\n")
            counter+=1
            idList.append(author["author_ID"])
    print(idList)

['0000-0003-4440-0391', '0000-0002-3296-0183', '0000-0003-2760-3277']
['0000-0002-1439-9214', '0000-0001-6252-8693']
['0000-0003-0561-5058', '0000-0002-4388-6548']
['0000-0002-3864-9521', '0000-0002-2775-3315']
['0000-0002-2424-3969', '0000-0001-5859-3070']
['0000-0001-6422-9486', '0000-0001-9287-0959']
['0000-0002-2292-7339', '0000-0003-4706-7897']
['0000-0002-4425-3128', '0000-0002-1724-8649', '0000-0003-4053-7147']
['0000-0001-5038-694X', '0000-0001-6223-5962']
['0000-0001-9224-1917', '0000-0002-8167-6912']
['0000-0002-9214-2932', '0000-0003-4175-3829']
['0000-0001-8986-8436', '0000-0001-5951-8013']
['0000-0003-1920-5914', '0000-0001-8421-0002']
['0000-0001-8206-5171', '0000-0002-3721-8586', '0000-0001-9042-4007']
['0000-0003-3272-5180', '0000-0001-7073-7532']
['0000-0003-2844-2698', '0000-0003-2875-4659']
['0000-0001-9736-3497', '0000-0003-0348-2407']
['0000-0003-4693-5234', '0000-0002-3764-1149']
['0000-0001-6173-6765', '0000-0002-9404-6901']
['0000-0001-5623-1148', '0000-0003-242

['0000-0003-4552-1953', '0000-0001-6143-2139']
['0000-0002-9706-2421', '0000-0002-1355-1616', '0000-0002-6514-3470']
['0000-0002-4751-2180', '0000-0002-7014-8014']
['0000-0002-2894-3364', '0000-0003-2379-2226']
['0000-0002-1397-5224', '0000-0002-0437-9834', '0000-0001-9697-6689', '0000-0002-0906-0099', '0000-0001-9732-798X', '0000-0003-1572-8339', '0000-0002-8344-5907', '0000-0001-9803-7140', '0000-0003-2493-5209']
['0000-0003-4443-2326', '0000-0002-9462-496X']
['0000-0003-1113-7478', '0000-0003-1179-7003']
['0000-0003-1057-9194', '0000-0002-1698-6666', '0000-0002-5964-3233']
['0000-0003-1382-3295', '0000-0003-1359-5130']
['0000-0002-7861-4366', '0000-0001-6328-8097']
['0000-0002-2068-7618', '0000-0002-8244-3002']
['0000-0003-1214-8240', '0000-0002-6477-5345', '0000-0003-4516-6904', '0000-0001-5445-1032']
['0000-0003-1608-4467', '0000-0001-7433-1820']
['0000-0002-1393-326X', '0000-0001-9535-9152']
['0000-0003-0182-4215', '0000-0001-9934-7925', '0000-0002-8121-3678', '0000-0002-0814-296

['0000-0003-0283-4263', '0000-0001-7009-6552', '0000-0002-7447-6146', '0000-0002-2986-1272', '0000-0002-3041-2917']
['0000-0001-8672-6301', '0000-0003-1420-0276']
['0000-0003-3881-7369', '0000-0001-5024-5311']
['0000-0001-9129-3539', '0000-0003-3324-4151']
['0000-0003-1077-2082', '0000-0002-5948-3364', '0000-0002-1646-709X']
['0000-0002-5656-0061', '0000-0001-6475-6622', '0000-0002-1196-306X', '0000-0002-9413-4573', '0000-0003-1413-8038']
['0000-0003-1236-3047', '0000-0003-0161-3274']
['0000-0001-8006-2399', '0000-0003-4019-5140']
['0000-0001-6050-8699', '0000-0001-8602-1248']
['0000-0001-9263-5369', '0000-0001-7247-7404']
['0000-0003-2236-4553', '0000-0003-2330-398X']
['0000-0002-6891-0655', '0000-0001-9306-3227']
['0000-0002-4148-2603', '0000-0003-0302-3470', '0000-0002-7916-8687']
['0000-0001-7231-7021', '0000-0002-7802-8690']
['0000-0001-5022-5505', '0000-0001-6817-7126']
['0000-0002-1689-2002', '0000-0001-8231-5556']
['0000-0001-7690-7037', '0000-0003-2790-6294']
['0000-0002-7135-