## **This notebook cleans and prepares the PubMed dataset for analysis**

In [1]:
import pandas as pd
import numpy as np
import json
from processing_pubMed import ProcessPubMed

In [2]:
# Instantiate the project helper imported from processing_pubMed. The cells below
# read its PUBMED_JSON, NUM_NODES_PUBMED and FILE_PATH attributes and call its
# cleaning/merging methods.
PubMed_Processing = ProcessPubMed()

In [3]:
# Load the raw PubMed records from the JSON file referenced by the helper.
# The encoding is given explicitly: without it, open() falls back to the platform
# locale (e.g. cp1252 on Windows) and non-ASCII text in the records can be
# mis-decoded or raise UnicodeDecodeError.
with open(PubMed_Processing.PUBMED_JSON, "r", encoding="utf-8") as file:
    data = json.load(file)

# Sanity check: the number of loaded records should match the node count the helper expects.
print(f"Dataset contains as many instances as nodes in the original dataset: {PubMed_Processing.NUM_NODES_PUBMED == len(data)}")

Dataset contains as many instances as nodes in the original dataset: True


## **Dataset cleaning**

In [4]:
# Use the first record as a representative sample of the available fields.
first_record = data[0]
print(f"Data to proces; number of fields: {len(first_record)}")
print(f"Fields for each paper: {list(first_record)}")

# Report how many records are missing each field of interest.
PubMed_Processing.get_data_info(data)

Data to proces; number of fields: 31
Fields for each paper: ['PMID', 'OWN', 'STAT', 'DCOM', 'LR', 'IS', 'VI', 'IP', 'DP', 'TI', 'PG', 'AB', 'FAU', 'AU', 'AD', 'LA', 'PT', 'PL', 'TA', 'JT', 'JID', 'RN', 'SB', 'MH', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST', 'SO']
Missing identifiers: 1
Missing titles: 1
Missing abstracts: 1
Missing medical headers: 1
Missing authors: 111


In [5]:
# Filter the records through the helper's validity check, then re-run the
# missing-field report. In the recorded run the identifier/title/abstract/
# medical-header counts dropped from 1 to 0 and missing authors from 111 to 110.
# NOTE(review): `data` is rebound here, so re-running this cell feeds the already
# filtered records back in — presumably harmless, confirm in get_valid_papers.
data = PubMed_Processing.get_valid_papers(data)
PubMed_Processing.get_data_info(data)

Missing identifiers: 0
Missing titles: 0
Missing abstracts: 0
Missing medical headers: 0
Missing authors: 110


In [6]:
# Reduce every record to the fields used downstream.
data = PubMed_Processing.get_features(data)
print(f"Useful variables: {list(data[0])}") # Identifier, Title, Abstract, Paper Key words, Authors

df = PubMed_Processing.to_pandas(data)

# Spot-check one row before and after flattening the list-valued columns.
# The generator is seeded so the same row is shown on every Restart & Run All.
# The upper bound of integers() is exclusive, so passing len(df) lets every row,
# including the last one, be selected (len(df) - 1 silently excluded it).
rng = np.random.default_rng(42)
random_row = int(rng.integers(0, len(df)))
print(df.iloc[random_row, :])

# Flatten list-valued cells; in the recorded run they become "; "-separated strings.
df = PubMed_Processing.flatten_lists(df)
print(df.iloc[random_row, :])

# Missing values should just be those 110 authors:
print(f"Number of missing values: {PubMed_Processing.get_missing(df)}") # It has to be equal to 110 for the authors


# Parse column types
df = PubMed_Processing.parse_columns(df)

Useful variables: ['PMID', 'TI', 'AB', 'MH', 'FAU']
PMID                                              2784589
TI      High frequency of coeliac disease in adult pat...
AB      Anti-reticulin antibodies were measured by an ...
MH      [Adolescent, Adult, Antibodies, Anti-Idiotypic...
FAU     [Collin, P, Salmi, J, Hallstrom, O, Oksa, H, O...
Name: 12265, dtype: object
PMID                                              2784589
TI      High frequency of coeliac disease in adult pat...
AB      Anti-reticulin antibodies were measured by an ...
MH      Adolescent; Adult; Antibodies, Anti-Idiotypic/...
FAU     Collin, P; Salmi, J; Hallstrom, O; Oksa, H; Ok...
Name: 12265, dtype: object
Number of missing values: PMID      0
TI        0
AB        0
MH        0
FAU     110
dtype: int64


In [7]:
# Path of the .cites file, as configured on the helper.
file_path = PubMed_Processing.FILE_PATH

# Check the cleaned papers against that file
# (the recorded run printed "Same IDs in both datasets").
PubMed_Processing.compare_files(file_path, df)

# Build a second dataframe from the .cites file and preview its first row.
tf_idf_df = PubMed_Processing.create_df(file_path)
print(tf_idf_df.iloc[0])

# Combine the paper metadata with the data extracted from the .cites file.
merged_df = PubMed_Processing.merge_datasets(df, tf_idf_df)

Same IDs in both datasets
PMID                                                      12187484
label                                                            1
tfidf_words      {'rat': 0.09393489570187145, 'common': 0.02869...
summary_words    [rat, common, use, examin, pathogenesi, retino...
Name: 0, dtype: object


In [8]:
# Preview the first ten rows of the merged dataframe to verify the merge visually.
merged_df.head(10)

Unnamed: 0,PMID,Title,Abstract,Key_words,Authors,label,TFIDF,summary_words
0,12187484,Retinal metabolic abnormalities in diabetic mo...,PURPOSE: Dogs and rats are commonly used to ex...,"Animals; Diabetes Mellitus, Experimental/*meta...","Kowluru, Renu A",1,"{'rat': 0.09393489570187145, 'common': 0.02869...","[rat, common, use, examin, pathogenesi, retino..."
1,2344352,Spatially resolved changes in diabetic rat ske...,Phase-modulated rotating-frame imaging (p.m.r....,Adenosine Triphosphate/metabolism; Animals; Di...,"Challiss, R A; Blackledge, M J; Radda, G K",1,"{'rat': 0.023617916633613394, 'use': 0.0147841...","[rat, use, anim, metabol, investig, 2, compar,..."
2,14654069,Mitochondria respiration and susceptibility to...,Cardiovascular complications are the primary c...,Animals; Body Weight/physiology; Cell Respirat...,"Lashin, Ossama; Romani, Andrea",1,"{'rat': 0.10226314418677966, 'use': 0.01066898...","[rat, use, anim, contribut, develop, investig,..."
3,16443886,Mean blood glucose and biological variation ha...,OBJECTIVE: Mean blood glucose (MBG) over 2-3 m...,"Blood Glucose/*analysis; Diabetes Mellitus, Ty...","McCarter, Robert J; Hempe, James M; Chalew, St...",2,"{'model': 0.038714646134547365, 'develop': 0.0...","[model, develop, month, method, level, result,..."
4,2684155,Regulation of very-low-density-lipoprotein lip...,Hepatocytes were derived from 2-3-day streptoz...,"Animals; Cells, Cultured; Cholesterol/analysis...","Duerden, J M; Bartlett, S M; Gibbons, G F",1,"{'rat': 0.030615817858387732, 'anim': 0.080179...","[rat, anim, compar, normal, level, result, inc..."
5,15032912,Specific changes of somatostatin mRNA expressi...,Abstract Most current studies of diabetic ence...,Animals; *Brain Chemistry; Dementia/complicati...,"XiaoMing, Zhang; Xi, Zhu; Fang, Shen; Jilin, Zhou",1,"{'rat': 0.11689675909566226, 'studi': 0.005201...","[rat, studi, metabol, 2, compar, 6, inject, 30..."
6,17988185,Glycemic response to newly initiated diabetes ...,OBJECTIVE: The glycemic response to antihyperg...,Blood Glucose/*drug effects; Diabetes Mellitus...,"Karter, Andrew J; Moffet, Howard H; Liu, Jenni...",3,"{'use': 0.007445259958367352, 'studi': 0.01111...","[use, studi, model, 2, month, method, 30, leve..."
7,9834350,The gastric bypass operation reduces the progr...,Of 232 morbidly obese patients with non-insuli...,Adult; Blood Glucose/analysis; Cardiovascular ...,"MacDonald, K G Jr; Long, S D; Swanson, M S; Br...",3,"{'2': 0.006157469794431223, 'compar': 0.031883...","[2, compar, 6, level, increas, p, control, mas..."
8,16230722,Exenatide versus insulin glargine in patients ...,BACKGROUND: Physicians may use either insulin ...,Adult; Aged; Blood Glucose/metabolism; Blood G...,"Heine, Robert J; Van Gaal, Luc F; Johns, Don; ...",3,"{'common': 0.010479105166473187, 'use': 0.0042...","[common, use, studi, 2, compar, 6, inject, lev..."
9,3542527,Assessing daily management in childhood diabetes.,One hundred sixty-eight patients with childhoo...,Adolescent; Blood Glucose/metabolism; Child; D...,"Johnson, Suzanne B; Silverstein, Janet; Rosenb...",2,"{'use': 0.027970030654407077, 'studi': 0.01391...","[use, studi, obtain, 6, measur, result, group,..."


In [9]:
# Persist the merged dataframe through the helper's to_csv method.
# NOTE(review): the output path is decided inside ProcessPubMed and is not visible
# here — confirm where the file is written.
PubMed_Processing.to_csv(merged_df)