In [86]:
import pandas as pd
import string
import json
import glob
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from nltk.corpus import stopwords

In [8]:
# One-time setup: uncomment and run the two lines below to download the NLTK stopwords corpus.
# import nltk
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Briceno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [2]:
def load_data(file):
    '''Open `file` as UTF-8 text, parse its contents as JSON and return the parsed object.'''
    with open(file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def write_data(file, data):
    '''Serialize `data` as JSON with a 4-space indent into `file` (UTF-8), replacing any existing content.'''
    with open(file, mode="w", encoding="utf-8") as handle:
        json.dump(obj=data, fp=handle, indent=4)

In [11]:
# Import data extracted from PubMed.
# Both paths use forward slashes: they work on Windows as well as POSIX systems,
# whereas the previous 'JSON Data\device...' spelling only worked because '\d' is
# not a recognised escape sequence (which Python now warns about).
ARTICLE_COLUMNS = ['Pubmed ID', 'Title', 'Abstract', 'Date']
mesh_articles = pd.read_json('JSON Data/biomed_pubmed_data.json')[ARTICLE_COLUMNS].dropna()
med_dev_articles = pd.read_json('JSON Data/device_pubmed_data.json')[ARTICLE_COLUMNS].dropna()

# Combine datasets and remove duplicates (rows identical across all four columns).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the original row indices are kept, exactly as append did.
articles = pd.concat([med_dev_articles, mesh_articles]).drop_duplicates()

# TF-IDF

In [82]:
# Translation table applied in a single pass: every ASCII punctuation mark except
# the hyphen is deleted, and the hyphen itself is mapped to a space so that
# hyphenated compounds are split into separate words.
_PUNCTUATION_TABLE = str.maketrans("-", " ", string.punctuation.replace("-", ""))

# Holds the English stopword set once it has been built, so that cleaning many
# documents does not rebuild the list on every call.
_STOPWORD_CACHE = {}


def _english_stopwords():
    '''Return the NLTK English stopwords as a frozenset, building it on first use only.'''
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_docs(doc, stops=None):
    '''Accepts an individual string (not a list of strings). Removes stopwords and does other cleaning.

    Steps, in order: lowercase; delete punctuation; replace dashes with spaces;
    drop stopwords; delete digits; collapse runs of whitespace and strip both ends.

    Parameters
    ----------
    doc : str
        The text to clean.
    stops : iterable of str, optional
        Words to drop. Defaults to the NLTK English stopword list.

    Returns
    -------
    str
        The cleaned text, with words separated by single spaces and no leading
        or trailing whitespace.

    Notes
    -----
    Because dashes become spaces, a word that was hyphenated across a line break
    in the source text comes out as two fragments (e.g. "delive-ry" -> "delive ry").
    '''
    stop_set = _english_stopwords() if stops is None else frozenset(stops)

    doc = doc.lower().translate(_PUNCTUATION_TABLE)
    kept_words = [word for word in doc.split() if word not in stop_set]  # remove stopwords

    # Digits are removed after stopword filtering (same order as before). Tokens made
    # only of digits become empty, so re-splitting is what collapses the resulting gaps
    # and strips BOTH ends; previously a leading space survived when the first token
    # was numeric.
    without_digits = re.sub(r'\d', '', " ".join(kept_words))
    return " ".join(without_digits.split())

In [83]:
# Known limitation: the cleaning method seems to break words that are hyphenated due to a
# line break, e.g. "units integral requirements delive ry direct patient".
abst = list(map(clean_docs, articles['Abstract']))

In [88]:
# Documentation for vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
vectorizer = TfidfVectorizer(
    stop_words="english",  # additionally drop the words on scikit-learn's built-in English stopword list
    ngram_range=(1, 3),    # capture unigrams, bigrams and trigrams
    min_df=5,              # min document frequency: ignore terms found in fewer than 5 documents
    max_df=0.8,            # max document frequency: ignore terms found in more than 80% of documents
    max_features=100,      # keep only the 100 terms with the highest term frequency across the corpus
)

In [96]:
# Vectorize and isolate keywords from documents
vectors = vectorizer.fit_transform(abst)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise; .tolist() keeps feature_names a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

dense = vectors.todense()
denselist = dense.tolist()

# denselist holds one row per document. A row is NOT the sequence of words in that
# document: it is aligned with feature_names, so row[i] is the tf-idf score of
# feature_names[i] for that document. A score of 0 means the keyword does not
# appear in the document; a score above 0 means it does.

# For each document, collect the features (keywords) that actually occur in it.
all_keywords = [
    [feature_names[i] for i, score in enumerate(row) if score > 0]
    for row in denselist
]

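Each position in a `denselist` row lines up with the same position in `feature_names`. For example, the 9th value of the first document below (≈ 0.098) is the tf-idf score of the 9th feature name, 'based'.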

In [109]:
# tf-idf scores of the first ten features for the first document
denselist[0][:10]

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09839929223995666, 0.0]

In [110]:
# the first ten feature names, in the same order as the scores above
feature_names[:10]

['administration',
 'analysis',
 'application',
 'applications',
 'approach',
 'article',
 'associated',
 'available',
 'based',
 'biomedical']

In [None]:
# K-Means Clustering