In [21]:
import json
import nltk
import re
import os

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [3]:
# Default list of stopwords from NLTK Library
all_stop_words = set(stopwords.words('english'))

In [4]:
def addStopwords(stop_words, words):    
    for word in words:
        stop_words.add(word)
    return stop_words    

In [5]:
def removeStopwords(stop_words, words):    
    for word in words:
        if word in stop_words:
            stop_words.remove(word)
            print(word + " removed")
        else:
            print(word + " not in set")
    return stop_words    

In [6]:
stop_words = removeStopwords(all_stop_words, ["do", "not"])

do removed
not removed


In [7]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
tokenizer.tokenize('Lower Leg Pain: Causes and Treatments')

['Lower', 'Leg', 'Pain', 'Causes', 'and', 'Treatments']

In [8]:
def filterStopwords(text, stop_words):
    
    
    
    if (text is None) : return 'Null'
    
#     tokenizer = nltk.RegexpTokenizer(r"\w+")
#     words = tokenizer.tokenize(text)

    filtered_words = []
    
    for word in text:
                        
        if (word == None): continue 
        
        elif (word.lower() not in stop_words):
            filtered_words.append(word)
            
    return filtered_words

In [9]:
def removeDuplicates(text) :
    
    dictionary = {}
    duplicates_removed = []
    
    for word in text:
        
        if (word in dictionary):
            continue
        else:
            dictionary[word] = text.index(word)
            duplicates_removed.append(word)
            
    return (dictionary, duplicates_removed)

In [10]:
def readJSON(location):
    with open(location) as json_file:
        data = json.load(json_file)
        return data

In [11]:
site_0 = readJSON('Extractions/Leg Pain/Site_0.json')

In [61]:
def removeHTML(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)

In [62]:
def removeExpression(data, expression):
    p = re.compile(r'{}'.format(expression))
    return p.sub('', data)

In [76]:
def removeTags(taglist):
    
    
    formatted = []
    
#     extraction = readJSON(location)
#     tagTypes = ['title', 'h','h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'a', 'meta']
    
    for i in taglist:
        
        x = removeHTML(i)
        x = removeExpression(x, '\n')
        x = removeExpression(x, '\r')
        x = removeExpression(x, '\t')
        
        formatted.append(x)
        
        
    return formatted

In [77]:
removeTags(site_0['h2'])

['Bones, Joints, and Muscles', 'Veins and Clots', 'Lower Leg Pain: Nerves']

In [78]:
def formatData(data):
    
    
    collection = {}
    tagTypes = ['title', 'h','h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p',]
    
    for i in tagTypes:
        
        formatted = removeTags(data[i])
        collection[i] = formatted

    return collection

In [79]:
def compileAllWords(data):
    
    words = []
    tokenizer = nltk.RegexpTokenizer(r"\w+")
    
    for key in data:
        
        for i in data[key]:
            
            words += tokenizer.tokenize(i)
        
    return words

In [80]:
def filter(words, stopwords):
    
    filtered = []
    
    for word in words:
        
        if word.lower() not in stopwords:
            
            filtered.append(word)
            
    return filtered

In [81]:
def count(filtered):
    
    counted = {}
    
    for word in filtered:
        
        if word in counted:
            counted[word] += 1
        else:
            counted[word] = 1
            
            
    counted = sorted(counted.items(), key=lambda x: x[1], reverse=True)
    
    return counted

In [83]:
def getFilesFromDirectory(directory):

    data = []
    
    for file in os.listdir(directory):
        if file.endswith(".json"):
            
            data.append(readJSON(directory + file))
            
    return data            

In [84]:
all_files = getFilesFromDirectory('Extractions/Leg Pain/')

In [85]:
def compileWordsForFiles(files):
    
    all_words = []
    
    for file in files:
        formatted = formatData(file)
        compiled = compileAllWords(formatted)
        filtered = filter(compiled, stop_words)
        all_words += filtered
        
    return all_words        

In [86]:
def compileStringsInFiles(files):
    
    strings = []
    
    for file in files:
        
        data = formatData(file)
        
        for key in data:
            
            for i in data[key]:
            
                strings.append(i)
    
    return strings    

In [87]:
compileStringsInFiles(all_files)

['Leg pain When to see a doctor - Mayo Clinic',
 'Leg pain',
 'Appointments at Mayo Clinic',
 'Free E-newsletter',
 'When to see a doctor',
 'Products and Services',
 'See also',
 'Advertisement',
 'Other Topics in Patient Care &amp; Health Info',
 'Mayo Clinic Footer',
 'Call for immediate medical help or go to an emergency room if you:',
 'See your doctor as soon as possible if you have:',
 'Schedule an office visit if:',
 'Self-care',
 'Mayo Clinic Marketplace',
 'Legal Conditions and Terms',
 'Reprint Permissions',
 'HON',
 'COVID-19 updates',
 'Mayo Clinic offers appointments in Arizona, Florida and Minnesota and at Mayo Clinic Health System locations.',
 'Subscribe to Housecall',
 'Our general interest e-newsletter keeps you up to date on a wide variety of health topics.',
 'Minor leg pain often responds well to home treatments. To relieve mild pain and swelling:',
 'Mayo Clinic does not endorse companies or products. Advertising revenue supports our not-for-profit mission.',
 'C

In [45]:
count(compileWordsForFiles(all_files))

[('pain', 238),
 ('leg', 149),
 ('may', 87),
 ('blood', 74),
 ('disease', 67),
 ('legs', 66),
 ('doctor', 63),
 ('cause', 63),
 ('symptoms', 53),
 ('Leg', 51),
 ('causes', 51),
 ('Pain', 49),
 ('help', 44),
 ('common', 42),
 ('also', 41),
 ('muscles', 41),
 ('include', 41),
 ('treatment', 41),
 ('medical', 38),
 ('injury', 37),
 ('swelling', 36),
 ('might', 36),
 ('See', 35),
 ('condition', 34),
 ('people', 34),
 ('Symptoms', 34),
 ('muscle', 33),
 ('WebMD', 33),
 ('get', 33),
 ('often', 31),
 ('Mayo', 30),
 ('Health', 30),
 ('not', 29),
 ('veins', 29),
 ('usually', 29),
 ('Clinic', 28),
 ('skin', 28),
 ('conditions', 27),
 ('Treatment', 27),
 ('lower', 26),
 ('vein', 26),
 ('bone', 26),
 ('knee', 25),
 ('amp', 24),
 ('diagnosis', 24),
 ('peripheral', 24),
 ('nerve', 24),
 ('heart', 24),
 ('artery', 24),
 ('injuries', 23),
 ('feel', 23),
 ('serious', 23),
 ('nerves', 23),
 ('body', 23),
 ('back', 22),
 ('happens', 22),
 ('area', 22),
 ('walking', 22),
 ('health', 21),
 ('Causes', 21),
