## Use this notebook to generate a PubMed dataset from PubMed10

In [39]:
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse
from scipy.io import loadmat

In [None]:
# Origin of the PubMed10 dataset
# http://www-personal.umich.edu/~chenyanh/ev_data.html

In [40]:
# PubMed10 class ids that make up the PubMed5 subset.
# Each entry is annotated with the class name and its number of documents.
classes_select = [
    6,   # Hay Fever --> nDoc = 1517
    7,   # Kidney Calculi --> nDoc = 1549
    8,   # Age-related Macular Degeneration --> nDoc = 3283
    9,   # Migraine --> nDoc = 3703
    10,  # Otitis --> nDoc = 2596
]

In [41]:
# Build the PubMed5 subset of PubMed10: load the document-term matrix, the
# labels and the vocabulary, keep only the documents whose label is listed in
# `classes_select`, then drop the vocabulary columns left empty by the
# selection.
PUBMED10_DIR = "../../../../data/PubMed10"

# Load PubMed10 document-term data
PubMed10_dw = loadmat(PUBMED10_DIR + "/docWordMat.mat")
doc_term_counts = PubMed10_dw["docWordMat"]

# Load PubMed10 document true labels and flatten them into a 1-D array
# (converted once here so the class search below does not rebuild it).
PubMed10_lb = loadmat(PUBMED10_DIR + "/label.mat")
labels_all = np.asarray(PubMed10_lb["label"]).ravel()

# Load the words.
# NOTE(review): the indexing below assumes "wordList" is a 2-D array whose
# first column holds one array-wrapped string per term -- TODO confirm.
pm10_wordlist = loadmat(PUBMED10_DIR + "/wordList.mat")["wordList"]
pm10_terms_list = [
    pm10_wordlist[i, 0].tolist()[0] for i in range(pm10_wordlist.shape[0])
]
pm10_terms = pm10_terms_list

# Identify the documents belonging to the selected classes. Indices are
# gathered class by class, so the reduced dataset is ordered by class
# (following `classes_select`), not by original document position.
PubMed5_lb_unique = classes_select
PubMed5_idx = list()
for searchval in PubMed5_lb_unique:
    PubMed5_idx.extend(np.where(labels_all == searchval)[0])

# Reduce the labels list
labels = [labels_all[i] for i in PubMed5_idx]

# Reduce the doc_term_counts matrix with the list of document index.
# The explicit integer dtype keeps the index valid even if nothing matched.
doc_term_counts = doc_term_counts[np.asarray(PubMed5_idx, dtype=np.intp), :]

# Reduce the doc_term_counts matrix by removing columns of 0.
# The surviving column indices are computed once and reused for the terms.
word_org_idx = np.unique(doc_term_counts.nonzero()[1])
doc_term_counts = doc_term_counts[:, word_org_idx]

# Get the corresponding terms
terms = np.asarray([pm10_terms[i] for i in word_org_idx])

# Sanity check: one label per row and one term per column.
assert doc_term_counts.shape == (len(labels), len(terms))

In [42]:
# Persist the PubMed5 subset (document-term matrix, labels, terms) to disk.
PUBMED5_DIR = '../../../../data/PubMed5'

# Create the output directory if needed: the writers below do not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(PUBMED5_DIR, exist_ok=True)

# Save the document-term matrix
sparse.save_npz(PUBMED5_DIR + '/PubMed5_docWordMat.npz', doc_term_counts)

# Save the labels (single "labels" column, no index column)
pd.DataFrame(labels, columns=["labels"]).to_csv(
    PUBMED5_DIR + '/PubMed5_label.csv', index=False
)

# Save the terms (single "terms" column, no index column)
pd.DataFrame(terms, columns=["terms"]).to_csv(
    PUBMED5_DIR + '/PubMed5_wordList.csv', index=False
)

In [43]:
# How to load the saved PubMed5 files
# ----------------
PUBMED5_DIR = '../../../../data/PubMed5'

# Doc-term
doc_term_counts = sparse.load_npz(PUBMED5_DIR + '/PubMed5_docWordMat.npz')
print("# --> doc-term shape: ", doc_term_counts.shape)
# Labels
# The files were written by DataFrame.to_csv, i.e. comma-separated, so the
# default separator is used rather than whitespace splitting.
labels_df = pd.read_csv(PUBMED5_DIR + '/PubMed5_label.csv', usecols=[0])
labels = labels_df.values.flatten()
print("# --> nb docs:", len(labels))
# Terms
# Read as plain strings with NA detection disabled, so that vocabulary words
# such as "null", "nan" or "NA" are not converted to missing values.
terms_df = pd.read_csv(PUBMED5_DIR + '/PubMed5_wordList.csv',
                       usecols=[0], dtype=str, keep_default_na=False)
terms = terms_df.values.flatten()
print("# --> nb terms:", len(terms))
# The three artefacts must describe the same documents and vocabulary.
assert doc_term_counts.shape == (len(labels), len(terms)), \
    "doc-term matrix does not match the number of labels/terms"
# ----------------

doc-term shape:  (12648, 19518)
nb docs: 12648
nb terms: 19518
