In [2]:
# load the file
import sys
import io
import os

# Load every labeled record from the tab-separated files in fileDir into
# LabeledRecords_original (a list of dicts, one dict per well-formed line).
fileDir = "../Data/canopies_labeled/"
# os.listdir returns entries in arbitrary order; sorting makes the record order
# (and any first-seen tie-break that depends on it) reproducible between runs.
listfiles = sorted(os.listdir(fileDir))
LabeledRecords_original = []

# Keys given to the first 12 tab-separated fields of a line, in field order.
fieldNames = ["paperID", "authorID", "author_position", "total_author",
              "author_name", "co_authors", "department", "vendor",
              "mesh", "keywords", "publish_year", "citation graph"]
# A line is accepted only if it splits into exactly this many fields;
# the 13th field is not stored in the record.
expectedFieldCount = 13

for file in listfiles:
    # skip entries whose name starts with '.'
    if file.startswith('.'):
        continue
    # the with-statement closes the file on exit, so no explicit close() is needed
    with open(os.path.join(fileDir, file), 'r', encoding='utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) == expectedFieldCount:
                # zip stops after the 12 named fields, dropping the last one
                paper_detail = dict(zip(fieldNames, read_data))
                LabeledRecords_original.append(paper_detail)
print("Total labeled records:",len(LabeledRecords_original), "records")

Total labeled records: 140266 records


In [3]:
# data columns: print each key of the first loaded record on its own line
first_record = LabeledRecords_original[0]
for column_name in first_record.keys():
    print(column_name)

paperID
authorID
author_position
total_author
author_name
co_authors
department
vendor
mesh
keywords
publish_year
citation graph


In [4]:
# Merge author names by author ID: every record of an author gets the longest
# name seen for that ID (e.g. an abbreviated name is replaced by the full name
# when one is available).
# Assumption carried over from the original analysis: the author ID is a unique
# identifier, i.e. each author has exactly one author ID.

#print(LabeledRecords_original[:3])
# process 1: find the longest name per authorID, then write it to all records of that ID
# NOTE: this is an alias, not a copy -- the records inside LabeledRecords_original
# are the same dict objects and are updated in place below.
LabeledRecords_processed1 = LabeledRecords_original
authorIDs = set(record['authorID'] for record in LabeledRecords_original)

# Single pass over the records instead of one full scan per author ID.
longestNameByID = {}
for record in LabeledRecords_original:
    candidate = record['author_name']
    # strict '>' keeps the first-seen name when several names share the max length
    if len(candidate) > len(longestNameByID.get(record['authorID'], "")):
        longestNameByID[record['authorID']] = candidate

# replace every name with the longest name recorded for the same author ID;
# an ID whose names are all empty keeps the empty string
for record in LabeledRecords_processed1:
    record['author_name'] = longestNameByID.get(record['authorID'], "")

#print(LabeledRecords_processed1[:3])

In [5]:
# Remove authors that don't have a full name.
# process 2: if the first part of the author name is shorter than 2 characters,
# the name is assumed not to be a full name and the author's records are dropped.
authorNames = set(record["author_name"] for record in LabeledRecords_processed1)

# fullNameList: names that pass the check; counter: number of names that fail it
fullNameList = []
counter = 0
for author in authorNames:
    name_part = author.split(" ")
    # if first part of name length less than 2, assume not full name
    if len(name_part[0]) < 2:
        counter += 1
    else:
        fullNameList.append(author)

# set lookup keeps the per-record membership test constant-time
fullNameSet = set(fullNameList)
LabeledRecords_processed2 = [record for record in LabeledRecords_processed1
                             if record["author_name"] in fullNameSet]
print(len(LabeledRecords_processed2))

138896


In [7]:
from collections import Counter

# Create the author ID to name map: a list of
# {"author_name": ..., "author_ID": ...} dicts, one per distinct author ID,
# in order of first appearance in LabeledRecords_processed2.
author_ID_to_name_map = []

# IDs already added to the map; avoids rebuilding the ID list for every record
seenAuthorIDs = set()
for record in LabeledRecords_processed2:
    if record["authorID"] not in seenAuthorIDs:
        seenAuthorIDs.add(record["authorID"])
        author_ID_to_name_map.append({"author_name": record["author_name"], "author_ID": record["authorID"]})

print(len(author_ID_to_name_map))


9482


In [8]:
# Extract authors that share a name: same author name, different author ID.
# sameNameAuthor holds the matching entries of author_ID_to_name_map (the same
# dict objects, not copies), grouped by name in order of the name's first
# appearance in the map.

# group map entries by author name (dicts preserve insertion order, Python 3.7+)
authorsByName = {}
for author in author_ID_to_name_map:
    authorsByName.setdefault(author["author_name"], []).append(author)

sameNameAuthor = []
# author IDs already added, so each ID appears at most once in sameNameAuthor
seenSameNameIDs = set()
for name, group in authorsByName.items():
    # a name only counts as shared when at least two different IDs carry it
    if len(set(entry["author_ID"] for entry in group)) < 2:
        continue
    for author in group:
        if author["author_ID"] not in seenSameNameIDs:
            seenSameNameIDs.add(author["author_ID"])
            sameNameAuthor.append(author)

# count author with same name
# sameNameCounter = Counter([author["author_name"] for author in sameNameAuthor])
# for key, value in sameNameCounter.items():
#     if value>=2:
#         print(value, "people have same name:", key)

# for author in sameNameAuthor:
#     print("Author: {k}, Author ID: {c}".format(k=author["author_name"], c=author["author_ID"]))    

In [9]:
# Keep only names shared by at least two authors who each wrote more papers
# than the threshold; print the resulting authors with their paper counts.

# set up threshold (an author is kept only if paper_count > threshold)
threshold = 25

# count number of papers per author ID
c = Counter(record["authorID"] for record in LabeledRecords_processed2)
# attach the count to each same-name author by direct lookup; a Counter returns
# 0 for an ID it has not seen, so "paper_count" is always set before sorting
for author in sameNameAuthor:
    author["paper_count"] = c[author["author_ID"]]

sameNameAuthorWithCount = sorted(sameNameAuthor, key=lambda k: (k['author_name'], k["paper_count"]))
# collect authors that have repeated name and write more paper than threshold
temp = [author for author in sameNameAuthorWithCount if author["paper_count"] > threshold]

# uniqueAuthors: every name present in temp (once each)
# authorFilter: names that occur more than once in temp, i.e. names for which
#               at least two authors are above the threshold
uniqueAuthors = []
authorFilter = []
for author in temp:
    if author["author_name"] not in uniqueAuthors:
        uniqueAuthors.append(author["author_name"])
    elif author["author_name"] not in authorFilter:
        authorFilter.append(author["author_name"])

# collect filtered data
filteredAuthors = [author for author in temp if author["author_name"] in authorFilter]

for author in filteredAuthors:
    print("Author name: {n}, Author: {k}, Paper count: {c}".format(n=author["author_name"],k=author["author_ID"], c= author["paper_count"]))


Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper count: 42
Author name: chung-may yang, Author: 0000-0002-4328-8716, Paper count: 71
Author name: david g lloyd, Author: 0000-0003-0658-8995, Paper count: 50
Author name: david g lloyd, Author: 0000-0002-0824-9682, Paper count: 104
Author name: feng liu, Author: 0000-0001-6224-5167, Paper count: 30
Author name: feng liu, Author: 0000-0003-3228-0943, Paper count: 31
Author name: hao song, Author: 0000-0002-3134-782X, Paper count: 29
Author name: hao song, Author: 0000-0001-5553-2539, Paper count: 30
Author name: jeong hwan kim, Author: 0000-0002-8383-8524, Paper count: 33
Author name: jeong hwan kim, Author: 0000-0003-2068-7287, Paper count: 51
Author name: kevin m. ryan, Author: 0000-0003-3670-8505, Paper count: 36
Author name: kevin m. ryan, Author: 0000-0002-1059-9681, Paper count: 79
Author name: lei wang, Author: 0000-0002-5859-2526, Paper count: 53
Author name: lei wang, Author: 0000-0003-3870-3388, Paper count: 64
Aut

In [10]:
# Prepare the output directory and gather the records of the selected
# same-name authors.
newfileDir = "../Data/filteredSameNameAuthor/"
output_dir_missing = not os.path.exists(newfileDir)
if output_dir_missing:
    os.makedirs(newfileDir)

# process 3: pull the records of every selected author out of all records,
# author by author (in filteredAuthors order), keeping record order within each
LabeledRecords_processed3 = [
    rec
    for selected in filteredAuthors
    for rec in LabeledRecords_processed2
    if selected["author_ID"] == rec["authorID"]
]

# print the selection so it can be checked against the output files
line_template = "Author name: {n}, Author: {k}, Paper id: {c}"
for rec in LabeledRecords_processed3:
    print(line_template.format(n=rec["author_name"], k=rec["authorID"], c=rec["paperID"]))
    

Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26200509
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26891760
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26732884
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26934453
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 27084002
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26582311
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 27156648
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 27063374
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26803488
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26868376
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26311257
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26404862
Author name: chung-may yang, Author: 0000-0003-4082-420X, Paper id: 26047532

In [13]:
# show name, author ID and department of every selected record for a manual check
detail_template = "Author name: {n}, Author: {k}, department: {d}"
for rec in LabeledRecords_processed3:
    detail_line = detail_template.format(n=rec["author_name"], k=rec["authorID"], d=rec["department"])
    print(detail_line)


Author name: chung-may yang, Author: 0000-0003-4082-420X, department: 
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: department ophthalmology national taiwan university hospital taipei taiwan college medicine national taiwan university taipei taiwan
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: department ophthalmology national taiwan university hospital taipei taiwan
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: 
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: department ophthalmology national taiwan university hospital taipei taiwan
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: department ophthalmology national taiwan university hospital taipei taiwan college medicine national taiwan university taipei taiwan electronic address chungmayntuedutw
Author name: chung-may yang, Author: 0000-0003-4082-420X, department: taipei taiwan
Author name: chung-may yang, Author: 0000-

Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: department microbial ecology faculty life sciences university vienna a1090 wien austria wagnermicrobialecologynet
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author: 0000-0002-9778-7684, department: 
Author name: michael wagner, Author

In [12]:
# Final step: write the paper IDs of each same-name author to a separate file.
# For every shared name, the authors carrying it are numbered 0, 1, ... in
# filteredAuthors order and written to newfileDir + name + number + ".txt",
# one paper ID per line.
nameSet = set(author["author_name"] for author in filteredAuthors)
# alias of filteredAuthors; not used anywhere in this cell
comparter = filteredAuthors

for name in nameSet:
    # author IDs already written for this name, so each ID gets exactly one file
    idList = []
    # running file index for this name
    counter = 0
    for author in filteredAuthors:
        if author["author_name"] == name and author["author_ID"] not in idList:
            #print(author["author_ID"])
            # the with-statement closes the file even if a write fails
            with open(newfileDir+name+str(counter)+".txt", "w", encoding='utf8') as newf:
                for record in LabeledRecords_processed3:
                    if author["author_ID"] == record["authorID"]:
                        newf.write(record["paperID"]+"\n")
            counter += 1
            idList.append(author["author_ID"])
    #print(idList)