# Extract the useful fields from the raw text files

Much of the data in the raw files is not needed, so we extract only the fields that are useful.

In [1]:
# load the file
#
# Read every non-hidden file in the canopies folder and collect, for each
# well-formed record, the paper ID together with its lower-cased, stripped
# mesh and keywords fields.
import sys
import io
import os

Dataset = "pubmed"

fileDir = "Data/"+Dataset+"/canopies/"
listfiles = os.listdir(fileDir)

allpaper_need_extract = []

for file in listfiles:
    canopy_path = os.path.join(fileDir, file)
    # os.listdir also returns sub-directories; opening one would raise, so
    # only regular, non-hidden files are read.
    if file.startswith('.') or not os.path.isfile(canopy_path):
        continue
    # The with-statement closes the file on exit; no explicit close() is needed.
    with open(canopy_path, 'r', encoding = 'utf8') as f:
        for line in f:
            read_data = line.split("\t")
            # some records' doi contain a \r or \n character, which breaks the
            # line apart; since we do not use those records, ignore them
            if len(read_data) in (12, 13):
                paper_detail = {"paperID": read_data[0], "mesh": read_data[8].lower().strip(),
                                "keywords": read_data[9].lower().strip()}
                allpaper_need_extract.append(paper_detail)
            else:
                # report the column count of the skipped line
                print(len(read_data))

print("Total records:",len(allpaper_need_extract), "records")

2
2
Total records: 4163772 records


In [2]:
# find unique paper id
# The paper id is not unique across records, so the integer IDs are collapsed
# into a set first and then sorted in ascending order.
paperIDs = sorted({int(record["paperID"]) for record in allpaper_need_extract})
num_need_extract = len(paperIDs)
print("Total unique paper count: ",len(paperIDs))

Total unique paper count:  3151504


In [3]:
# Last element of paperIDs; the list was sorted ascending when it was built,
# so this is the largest paper ID.
paperIDs[-1]

28355772

In [5]:
# extract text info based on paperID and generate new smaller file with id, title, abstract, keyword, and mesh
#
# The sorted `paperIDs` list and the rows of the input file are walked in step:
# a wanted ID equal to the current row's ID is extracted, a wanted ID smaller
# than the current row's ID is reported as missing.
# NOTE(review): this only works if the rows of the file are in ascending
# paper-ID order — confirm against the data.
filePath = "Data/"+Dataset+"/allAdditional/id_title_abstract.txt"
paperID_title_abstract = []
num_fail_extract = 0
# Index of the next wanted ID. Advancing an index is O(1), whereas removing
# the first element of a list shifts every remaining element, which makes the
# whole pass quadratic on millions of IDs.
next_idx = 0
num_wanted = len(paperIDs)
with open(filePath, 'r', encoding = 'utf8') as f:
    for line in f:
        if next_idx >= num_wanted:
            break
        read_data = line.split("\t")
        current_pid = int(read_data[0])
        # consume every wanted ID that is <= the ID of the current row
        while next_idx < num_wanted and paperIDs[next_idx] <= current_pid:
            if paperIDs[next_idx] == current_pid:
                if len(paperID_title_abstract) % 500000 == 0:
                    print("reache point: ",read_data[0])
                paper = {"paperID": read_data[0], "title": read_data[1], "abstract": read_data[2]}
                paperID_title_abstract.append(paper)
            else:
                # wanted ID is not present in the file: report and count it
                print(paperIDs[next_idx], " : ",read_data[0])
                num_fail_extract += 1
            next_idx += 1

# Drop the consumed IDs in a single operation so that, as before, paperIDs
# ends up holding only the IDs that were never reached in the file.
del paperIDs[:next_idx]

# this means some paper are missing from all text information
print("Total extracted title and abstract info: ",len(paperID_title_abstract))
print("Total unique paper need to extract: ",num_need_extract)
print("Total paper fail to extract: ",num_fail_extract)
# guard: nothing to show when no record was extracted
if paperID_title_abstract:
    print(paperID_title_abstract[-1]["paperID"])

reache point:  3
8362786  :  8362787
reache point:  8608573


KeyboardInterrupt: 

In [None]:
# Sanity check: number of title/abstract records collected versus the number
# of unique paper IDs requested, followed by the last entry of the extracted
# list and the last entry of the raw canopy record list.
print(len(paperID_title_abstract))
print(num_need_extract)
print(paperID_title_abstract[-1])
print(allpaper_need_extract[-1])

In [None]:
# NOTE: the file-writing code in this cell is commented out, so running the
# cell writes nothing to disk. When enabled it would write one
# tab-separated line (paperID, title, abstract) per entry of
# paperID_title_abstract.
'''
Some recent papers not in old database do not have abstract and title, only pid and keyword+mesh
Code below removed paper do not have abstract and title.
'''
# # write content to file
# newfile = open("Data/"+Dataset+"/allAdditional/id_title_abstract_extracted.txt", "w",encoding='utf8')
# for paper in paperID_title_abstract:
#     newfile.write((paper["paperID"]+"\t"+paper["title"]+"\t"+paper["abstract"]).strip('\n')+"\n")
# newfile.close()

# extract from canopies folder

We have 4,163,772 records, but we only extract the keywords and MeSH terms. After dropping duplicate rows, we should have one row per unique paper (pid) with its keywords and MeSH terms.

However, some records are inconsistent in their keywords and MeSH terms even though they belong to the same paper (different records give different keywords and MeSH terms). We fix this by keeping the record whose combined keywords + MeSH text is longest.

In [None]:
# extract keyword and mesh, drop duplicate
# pandas is needed from this cell on, so it is imported here rather than only
# in a later cell (otherwise `pd` is undefined when cells run top to bottom).
import pandas as pd

# Rows that are identical in paperID, mesh and keywords collapse to one.
keywords_mesh = pd.DataFrame(allpaper_need_extract).drop_duplicates()
print(keywords_mesh.shape)
# show some case failed
# A paperID that still occurs more than once has conflicting keywords/mesh;
# keep=False marks every occurrence of such an ID, not just the repeats.
idcol = keywords_mesh["paperID"]
duplicatedset = keywords_mesh[idcol.duplicated(keep=False)].sort_values("paperID")
print(duplicatedset.shape)

In [None]:
# keep keywords + mesh length longer one
#
# For every paper ID in `duplicatedset`, keep the row whose combined
# keywords+mesh text is longest (the first such row on ties) and collect the
# index labels of the remaining rows in `remove_dup_idx`.
unique_pid = sorted(set(duplicatedset["paperID"]))
remove_dup_idx = []
# Group the rows once instead of scanning the whole frame with iterrows() for
# every ID, which was quadratic in the number of duplicated rows.
rows_by_pid = duplicatedset.groupby("paperID")
for pid in unique_pid:
    group = rows_by_pid.get_group(pid)
    lenlist = [len(kw + mesh) for kw, mesh in zip(group["keywords"], group["mesh"])]
    idxlist = group.index.tolist()
    pidlist = [pid] * len(idxlist)
    print(lenlist)
    print(idxlist)
    print(pidlist)
    # position of the first row with the maximum length; that row is kept
    keep_idx = lenlist.index(max(lenlist))
    del idxlist[keep_idx]
    print(" idx: ", idxlist)
    remove_dup_idx.extend(idxlist)

print(sorted(remove_dup_idx))
print(len(remove_dup_idx))

In [None]:
# Drop the rows whose index labels were collected in remove_dup_idx and
# report the size of the resulting table.
no_dup_keywords_mesh = keywords_mesh.drop(index=remove_dup_idx)
print(no_dup_keywords_mesh.shape)

In [None]:
import pandas as pd
# Turn the extracted title/abstract records into a table and report its size.
title_abstract = pd.DataFrame(paperID_title_abstract)
print(title_abstract.shape)
# merge information together
# Outer join on paperID keeps papers that appear on only one side; the
# missing values the join produces are replaced with empty strings.
text_data = no_dup_keywords_mesh.merge(
    title_abstract,
    how='outer',
    left_on='paperID',
    right_on='paperID',
).fillna('')

In [None]:
# Show the dimensions and the per-column summary of the merged table.
# The shape has to be printed explicitly: as a bare expression that is not the
# last statement of the cell it was evaluated and its value discarded.
print(text_data.shape)
text_data.info()

In [None]:
# Preview the first rows of the merged table.
text_data.head()

In [None]:
# Preview the last rows of the merged table.
text_data.tail()

In [None]:
# write content to file
#
# Writes one tab-separated line per row of text_data, in the column order
# paperID, title, keywords, mesh, abstract.
# The with-statement closes the file even if the loop is interrupted.
with open("Data/"+Dataset+"/id_text_combined.txt", "w",encoding='utf8') as newf:
    for _, paper in text_data.iterrows():
        try:
            # strip('\n') removes newlines at either end of the joined record
            # (presumably one left at the end of the abstract field) so that
            # every record ends with exactly one newline.
            newf.write((paper["paperID"]+"\t"+paper["title"]+"\t"+str(paper["keywords"])+"\t"+str(paper["mesh"])+"\t"+paper["abstract"]).strip('\n')+"\n")
        except (TypeError, UnicodeError):
            # A non-string field or un-encodable text: dump the row and the
            # field types for diagnosis and continue with the next row.
            # Catching only these errors lets KeyboardInterrupt stop the loop.
            print(paper["paperID"],"\t",paper["title"],"\t",str(paper["keywords"]),"\t",str(paper["mesh"]),"\t",paper["abstract"])
            print(type(paper["paperID"]))
            print(type(paper["title"]))
            print(type(paper["keywords"]))
            print(type(paper["mesh"]))
            print(type(paper["abstract"]))

In [None]:
# write pid to file
#
# Writes the paperID of every row of text_data, one per line.
# The with-statement guarantees the file is closed even if a write fails.
with open("Data/"+Dataset+"/pids.txt", "w",encoding='utf8') as pid_file:
    pid_file.writelines(pid + "\n" for pid in text_data["paperID"])