In [107]:
import pandas as pd
import string
import json
import glob
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from nltk.corpus import stopwords
# import nltk
# nltk.download('stopwords')

In [108]:
def load_data(file):
    """Parse the UTF-8 encoded JSON file at `file` and return the decoded object."""
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def write_data(file, data):
    """Serialise `data` as JSON (4-space indent) into `file`, overwriting it, using UTF-8."""
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, indent=4)

In [109]:
# Import data extracted from PubMed.
# Forward slashes are used in both paths: the previous 'JSON Data\device_...' literal
# contained the invalid escape sequence '\d' and only resolved on Windows.
article_columns = ['Pubmed ID', 'Title', 'Abstract', 'Date']
mesh_articles = pd.read_json('JSON Data/biomed_pubmed_data.json')[article_columns].dropna()
med_dev_articles = pd.read_json('JSON Data/device_pubmed_data.json')[article_columns].dropna()

# Combine datasets and remove duplicates.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps each frame's original index labels.
articles = pd.concat([med_dev_articles, mesh_articles]).drop_duplicates()

# TF-IDF

In [110]:
# One translate() pass: the listed punctuation is deleted and dashes become spaces.
_PUNCT_TABLE = str.maketrans("-", " ", r'!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~')
# Corpus-specific words that carry no signal for this analysis.
_EXTRA_STOPS = ('biomedical', 'engineering', 'medical', 'devices', 'device', 'used', 'using', 'use')
_STOPS_CACHE = None


def _stop_words():
    '''Return the stopword set (NLTK English + corpus-specific words), built once and cached.'''
    global _STOPS_CACHE
    if _STOPS_CACHE is None:
        # A frozenset gives constant-time membership tests instead of scanning a list per word.
        _STOPS_CACHE = frozenset(stopwords.words('english')) | frozenset(_EXTRA_STOPS)
    return _STOPS_CACHE


def clean_docs(doc):
    '''Accepts an individual string (not a list of strings) and returns a cleaned string.

    Steps: lowercase, delete punctuation, turn dashes into spaces, delete digits,
    drop stopwords, and join the remaining words with single spaces.

    Parameters
    ----------
    doc : str
        Raw text of one document.

    Returns
    -------
    str
        The cleaned text with no leading, trailing or repeated whitespace.
        Returns an empty string when nothing survives the cleaning.
    '''
    doc = doc.lower().translate(_PUNCT_TABLE)
    # Digits are removed BEFORE stopword filtering so that fragments left behind
    # (e.g. "3d" -> "d") are still caught by the stopword list.
    doc = re.sub(r'\d', '', doc)
    stops = _stop_words()
    # split() with no argument discards every run of whitespace, so the result
    # cannot start or end with a space and never contains double spaces.
    return " ".join(word for word in doc.split() if word not in stops)

In [111]:
# Known limitation: words that were hyphenated due to a line break end up split in two
# by the cleaning step, e.g. "units integral requirements delive ry direct patient".
# NOTE(review): only the device abstracts are cleaned here; the combined `articles`
# frame built earlier is not used -- confirm this is intended.
abst = [clean_docs(abstract_text) for abstract_text in med_dev_articles['Abstract']]

In [112]:
# TfidfVectorizer reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vec_max_feat = 200
vectorizer = TfidfVectorizer(
    max_features=vec_max_feat,  # cap on the size of the learned vocabulary
    max_df=0.8,                 # float: ignore terms found in more than this fraction of documents
    min_df=5,                   # int: ignore terms found in fewer than this many documents
    ngram_range=(1, 3),         # capture unigrams through trigrams
    stop_words="english",       # additionally apply sklearn's own English stopword list
)

In [113]:
# Vectorize and isolate keywords from documents
vectors = vectorizer.fit_transform(abst)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations.
# .tolist() converts the returned array to a plain list of Python strings.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist = dense.tolist()

# denselist holds one row per document. A row is NOT the list of words in that
# document; it is aligned with feature_names, so position i holds the tf-idf
# score of feature_names[i] for that document. A score of 0 means the keyword
# does not appear in the document; any positive score means it does.

# For each document, collect the features (keywords) that actually occur in it.
all_keywords = [
    [feature_names[position] for position, score in enumerate(doc_scores) if score > 0]
    for doc_scores in denselist
]

Example: the 9th value in a document's row of `denselist` is the tf-idf score of the 9th term in `feature_names`.

In [114]:
# tf-idf scores of the first ten features for the first document
denselist[0][:10]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

In [115]:
# the first ten feature names, aligned with the scores shown above
feature_names[:10]

['act',
 'activity',
 'addition',
 'adhesion',
 'administration',
 'administration fda',
 'adverse',
 'agency',
 'amendments',
 'analysis']

In [116]:
# K-Means Clustering

nk = 8  # Number of desired clusters
random_seed = 42  # fixed seed: with n_init=1 the result depends entirely on one random initialisation
model = KMeans(
    n_clusters=nk,
    init="k-means++",
    max_iter=100,
    n_init=1,
    random_state=random_seed,  # makes cluster assignments reproducible across kernel restarts
)
model.fit(vectors)

KMeans(max_iter=100, n_clusters=10, n_init=1)

In [117]:
# Write file with the top words from each cluster
top_n_terms = 10  # number of highest-weighted terms reported per cluster

# Feature indices for every centroid, sorted from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

with open("cluster_results.txt", "w", encoding="utf-8") as f:
    f.write(f"Max vectorization features:{vec_max_feat}\n")
    # Iterate over the fitted centroids themselves rather than range(nk), so the
    # report always matches the model even if nk was edited after fitting.
    for cluster_number, ranked_features in enumerate(order_centroids, start=1):
        f.write(f"Cluster {cluster_number}\n")
        for ind in ranked_features[:top_n_terms]:
            f.write(f" {terms[ind]}\n")
        f.write("\n\n")