In [1]:
# load the file
import sys
import io
import os

Dataset = "pubmed"

# Directory that holds the labeled canopy files for the chosen dataset.
fileDir = "../Data/"+Dataset+"/canopies_labeled/"
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# LabeledRecords_original may differ between machines.
listfiles = os.listdir(fileDir)
LabeledRecords_original = []

# Keys given to the first 12 tab-separated columns of a line, in column order.
RECORD_FIELDS = ("paperID", "authorID", "author_position", "total_author",
                 "author_name", "co_authors", "department", "vendor",
                 "mesh", "keywords", "publish_year", "citation graph")
# A well-formed line has 13 columns; the 13th is read but not stored.
EXPECTED_COLUMNS = 13

for file in listfiles:
    # skip hidden entries (names starting with '.')
    if file.startswith('.'):
        continue
    path = os.path.join(fileDir, file)
    # skip sub-directories, which open() cannot read
    if not os.path.isfile(path):
        continue
    # the with-block closes the file on exit, so no explicit close() is needed
    with open(path, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # get rid of badly formatted lines
            if len(read_data) == EXPECTED_COLUMNS:
                # zip stops at the shorter sequence, so only the 12 named
                # columns end up in the record
                LabeledRecords_original.append(dict(zip(RECORD_FIELDS, read_data)))
print("Total labeled records:",len(LabeledRecords_original), "records")

Total labeled records: 140266 records


In [2]:
# sanity check: show the first three parsed records
print(LabeledRecords_original[0:3])

[{'paperID': '27406695', 'authorID': '0000-0002-9697-0962', 'author_position': '4', 'total_author': '10', 'author_name': 'jonathan m read', 'co_authors': 'truelove zhu lessler riley wang kwok guan jiang cummings', 'department': 'department epidemiology public health institute infection global health university liverpool neston uk', 'vendor': '27279', 'mesh': '', 'keywords': 'cross-protection$$hemagglutination inhibition test$$immunity$$influenza$$microneutralization test$$neutralization test', 'publish_year': '2016', 'citation graph': '10.3345/kjp.2012.55.12.474$$10.1016/j.vaccine.2011.11.019$$10.1007/s00430-010-0143-4$$10.1080/15388220801955596$$10.1017/s0022172400022610$$10.1038/ncomms1432$$10.1016/j.vaccine.2007.02.039$$10.1128/cvi.00278-15$$10.3201/eid1508.081754'}, {'paperID': '26876744', 'authorID': '0000-0002-9697-0962', 'author_position': '1', 'total_author': '8', 'author_name': 'j m read', 'co_authors': 'hungerford cooke vivancos iturriza-gómara allen french cunliffe', 'depart

In [7]:
# Process the keywords and MeSH (Medical Subject Headings) metadata:
# build one text document per paper and count term frequencies over it.
pids = [rec["paperID"] for rec in LabeledRecords_original]
# "$$" separates the individual terms inside each field; replace it with a
# space and join keywords and mesh into a single string per paper
key_plus_mesh_per_pid = [
    rec["keywords"].replace("$$", " ") + " " + rec["mesh"].replace("$$", " ")
    for rec in LabeledRecords_original
]
print(pids[0:3])
print(key_plus_mesh_per_pid[0:3])

from sklearn.feature_extraction.text import CountVectorizer
# min_df = 1 keeps every term that occurs in at least one document
count_vect = CountVectorizer(min_df = 1)
count_matrix = count_vect.fit_transform(key_plus_mesh_per_pid)
# size of the learned vocabulary
print(len(count_vect.vocabulary_))

['27406695', '26876744', '26978780']
['cross-protection hemagglutination inhibition test immunity influenza microneutralization test neutralization test ', 'Epidemiology Healthcare-associated infection Rotavirus Vaccination Adolescent Child Child, Preschool Community-Acquired Infections Cross Infection Female Gastroenteritis Hospitals Humans Infant Infant, Newborn Male Prevalence Rotavirus Infections Rotavirus Vaccines United Kingdom Vaccination', ' Humans Pennsylvania Schools Social Support Students']
37869


In [None]:
# Compute a tf-idf matrix over the text content of every paper.
# NOTE(review): allpaperCollection is not defined in the visible cells, and its
# items are expected to expose a .content attribute - confirm where it comes
# from before running this cell.
from sklearn.feature_extraction.text import TfidfVectorizer
allContent = [doc.content for doc in allpaperCollection]
print(allContent[0:2])
# terms appearing in fewer than 3 documents are dropped; rows are left
# un-normalised (norm=None)
tfidf_vectorizer = TfidfVectorizer(
    min_df = 3,
    norm=None,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf_vectorizer.fit(allContent)
tfidf_matrix = tfidf_vectorizer.transform(allContent)
# dense view of the first two rows, then the overall matrix shape
print(tfidf_matrix[0:2].toarray())
print(tfidf_matrix.shape)

In [12]:
# process metadata co_authors
# Build a term-count matrix over each paper's co-author string.
pids = []
co_authors = []
for record in LabeledRecords_original:
    pids.append(record["paperID"])
    co_authors.append(record["co_authors"])
print(pids[:3])
print(co_authors[:3])

from sklearn.feature_extraction.text import CountVectorizer
# NOTE(review): the default CountVectorizer tokenizer treats punctuation as a
# separator and ignores one-character tokens, so hyphenated surnames are split
# into separate terms - confirm this is intended for author names.
co_author_count_vect = CountVectorizer(min_df = 1)
co_author_matrix = co_author_count_vect.fit_transform(co_authors)
print(len(co_author_count_vect.vocabulary_))
# vocabulary_ maps term -> column index, so ordering the terms by index yields
# the feature names in column order. This avoids get_feature_names(), which
# was removed in scikit-learn 1.2, and works on older releases as well.
print(sorted(co_author_count_vect.vocabulary_,
             key=co_author_count_vect.vocabulary_.get)[:100])

['27406695', '26876744', '26978780']
['truelove zhu lessler riley wang kwok guan jiang cummings', 'hungerford cooke vivancos iturriza-gómara allen french cunliffe', 'guclu vukotich galloway gao rainey uzicanin zimmer cummings']
120615
['004', '006', '0102', '0201', '03', '039', '045', '054', '058', '06', '069', '082', '09', '0974', '1000', '1014', '11', '1263', '1b', '2005', '2006', '201', '2012', '2013', '2014', '202', '2093', '236', '23andme', '300', '301', '303', '318', '3202', '34', '343', '343t', '381', '3mg', '42', '43', '5000', '50k', '700', '736', '98', '999', 'a362', 'a5001', 'a5090', 'a5142', 'a5221', 'a5235', 'aa', 'aaag', 'aabakken', 'aabed', 'aabenhus', 'aachmann', 'aacvpr', 'aad', 'aadahl', 'aadland', 'aaen', 'aagaard', 'aagc', 'aagenes', 'aagesen', 'aal', 'aalbaek', 'aalbers', 'aalbersberg', 'aalberse', 'aalen', 'aalfs', 'aalkjaer', 'aalok', 'aalseth', 'aalsma', 'aalten', 'aalto', 'aaltonen', 'aama', 'aams', 'aanestad', 'aap', 'aapro', 'aarab', 'aarabi', 'aarden', 'aares

In [13]:
# Show the first 10000 co-author feature names in column order.
# vocabulary_ maps term -> column index; sorting the terms by index replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
print(sorted(co_author_count_vect.vocabulary_,
             key=co_author_count_vect.vocabulary_.get)[:10000])

['004', '006', '0102', '0201', '03', '039', '045', '054', '058', '06', '069', '082', '09', '0974', '1000', '1014', '11', '1263', '1b', '2005', '2006', '201', '2012', '2013', '2014', '202', '2093', '236', '23andme', '300', '301', '303', '318', '3202', '34', '343', '343t', '381', '3mg', '42', '43', '5000', '50k', '700', '736', '98', '999', 'a362', 'a5001', 'a5090', 'a5142', 'a5221', 'a5235', 'aa', 'aaag', 'aabakken', 'aabed', 'aabenhus', 'aachmann', 'aacvpr', 'aad', 'aadahl', 'aadland', 'aaen', 'aagaard', 'aagc', 'aagenes', 'aagesen', 'aal', 'aalbaek', 'aalbers', 'aalbersberg', 'aalberse', 'aalen', 'aalfs', 'aalkjaer', 'aalok', 'aalseth', 'aalsma', 'aalten', 'aalto', 'aaltonen', 'aama', 'aams', 'aanestad', 'aap', 'aapro', 'aarab', 'aarabi', 'aarden', 'aarestrup', 'aarntzen', 'aaron', 'aarons', 'aaronson', 'aarrass', 'aarrestad', 'aars', 'aarsaether', 'aarsen', 'aarseth', 'aarsland', 'aarts', 'aarvak', 'aas', 'aasarød', 'aase', 'aasheim', 'aasi', 'aasim', 'aasly', 'aasnaes', 'aasprang', '

In [None]:
# process metadata department
# Collect the paper ID and the raw department string of every labeled record.
# Values are read from each record itself; the loop variable left over from
# the file-loading cell only ever holds the last line that was read.
pids = [record["paperID"] for record in LabeledRecords_original]
departments = [record["department"] for record in LabeledRecords_original]
# one {"paperID", "department"} dict per record, in the same order as above
pid_department = [
    {"paperID": pid, "department": department}
    for pid, department in zip(pids, departments)
]

In [None]:
# process metadata vendor
# the vendor is already in ID form, so no further processing is needed
# Values are read from each record itself, not from the stale loop variable
# left over from the file-loading cell.
pid_vendor = [
    {"paperID": record["paperID"], "vendor": record["vendor"]}
    for record in LabeledRecords_original
]

In [None]:
# process metadata publish_year
# the year does not need further processing
# Values are read from each record itself, not from the stale loop variable
# left over from the file-loading cell.
pid_year = [
    {"paperID": record["paperID"], "publish_year": record["publish_year"]}
    for record in LabeledRecords_original
]