In [None]:
from nltk.corpus import CategorizedPlaintextCorpusReader
import random

# Build a custom corpus with two categories, "medical" and "other"
# (500 files each). The category of a file is taken from its sub-directory
# via cat_pattern; hidden files (names starting with ".") are skipped.
my_corpus = CategorizedPlaintextCorpusReader('corpora', r"(?!\.).*\.txt", cat_pattern=r"(medical|other)/.*")

# One (words, category) pair per file of the corpus:
#   my_documents[i][0] -> list with the tokens of the file
#   my_documents[i][1] -> category of the file
my_documents = [(list(my_corpus.words(fileid)), category) for category in my_corpus.categories() for fileid in my_corpus.fileids(category)]

# Shuffle with a fixed seed so that the train/test split made later (and
# therefore the reported accuracy) is identical on every Restart & Run All.
# A dedicated Random instance is used so the global random state is untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(my_documents)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# English stop-word list from NLTK, kept as a set so that the membership
# tests done for every token in the processing functions are cheap.
stop_words = set(stopwords.words('english'))

def processed_document(document):
    """Normalize the token list of one corpus document, in place.

    Steps, applied in this order:
      1. drop tokens that are not alphanumeric (e.g. '(', ',', '.', '[');
      2. drop tokens made only of digits;
      3. drop English stop words;
      4. stem each remaining token with the Porter stemmer;
      5. lemmatize each stemmed token with the WordNet lemmatizer.

    Args:
        document: list of token strings, e.g. ``my_documents[i][0]``.
            Its contents are replaced by the processed tokens.

    Returns:
        None. The result is written back into ``document``.
    """
    # Build the stemmer and lemmatizer once per call instead of once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # The corpus tokens keep their original casing, so the stop-word test is
    # done on the lowercased token; otherwise capitalized stop words such as
    # "The" would slip through the filter.
    kept = [
        word for word in document
        if word.isalnum() and not word.isdigit() and word.lower() not in stop_words
    ]

    # Slice assignment replaces the contents while keeping the same list
    # object, so the tuples in my_documents see the processed tokens.
    document[:] = [lemmatizer.lemmatize(stemmer.stem(word)) for word in kept]
        
# Normalize the token list of every document in place; the category that
# accompanies each list is not needed here.
for doc_words, _label in my_documents:
    processed_document(doc_words)

In [None]:
import nltk

# Select the 2000 most frequent words as the classifier's vocabulary.
#
# The counts are taken over the token lists stored in my_documents rather
# than over the raw my_corpus.words(), so the vocabulary is in the same form
# as the tokens that document_features() looks up. most_common() is what
# orders the words by frequency: iterating a FreqDist directly only yields
# its keys in the order they were first seen.
all_words = nltk.FreqDist(w for (doc_words, _label) in my_documents for w in doc_words)
word_features = [word for (word, _count) in all_words.most_common(2000)]

def document_features(document, words):
    """Feature extractor: report which of ``words`` occur in ``document``.

    Args:
        document: iterable of tokens of one document.
        words: iterable of vocabulary words to look for.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True`` when the word is
        present in the document and ``False`` otherwise, one entry per
        distinct word in ``words``.
    """
    # A set makes each presence check independent of the document length.
    present = set(document)
    return {'contains({})'.format(word): word in present for word in words}

# Turn every (words, category) document into a (features, category) pair.
featuresets = [
    (document_features(doc_words, word_features), label)
    for (doc_words, label) in my_documents
]

# Hold out the first 10% of the pairs as the test set and train a
# Naive Bayes classifier on the remaining ones.
size = int(len(featuresets) * 0.1)
test_set = featuresets[:size]
train_set = featuresets[size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Accuracy of the trained classifier on the held-out test set.
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(test_accuracy)

# Show the classifier's 10 most informative features.
classifier.show_most_informative_features(10)

In [None]:
from bs4 import BeautifulSoup
import requests
from nltk.tokenize import word_tokenize

def processed_wiki_text(wiki_text):
    """Tokenize and normalize the text of a Wikipedia page.

    Applies the same normalization steps as ``processed_document``, but
    starts from a raw string and returns a new list:
      1. split the text into tokens with the NLTK word tokenizer;
      2. drop tokens that are not alphanumeric;
      3. drop tokens made only of digits;
      4. drop English stop words;
      5. stem with the Porter stemmer, then lemmatize with WordNet.

    Args:
        wiki_text: the page text as a single string.

    Returns:
        list of processed token strings (empty if nothing survives the
        filters).
    """
    # Build the stemmer and lemmatizer once per call instead of once per token.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # The stop-word test is done on the lowercased token so the function does
    # not depend on the caller having lowercased the text beforehand.
    kept = [
        word for word in word_tokenize(wiki_text)
        if word.isalnum() and not word.isdigit() and word.lower() not in stop_words
    ]

    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in kept]

# Interactive loop: ask for a Wikipedia URL, download the page, and print the
# category predicted by the trained classifier. A blank answer ends the loop.
print("If you want to terminate the program leave the url input in blank!")

while True:
    url = input("Insert the URL of wikipedia topic: \n").strip()

    if url == "":
        break

    # Download the page. A malformed URL, a network failure, a timeout or an
    # HTTP error status must not kill the loop: report it and ask again.
    try:
        page = requests.get(url, timeout=10)
        page.raise_for_status()
    except requests.RequestException as error:
        print("Could not download the page: ", error, '\n')
        continue

    # Parse the page once and collect the text of all its paragraphs.
    # Lowercasing is applied to the extracted text (not to the raw HTML
    # bytes) so that non-ASCII letters are lowercased as well; paragraphs are
    # joined with a space so words of adjacent paragraphs cannot be fused.
    soup = BeautifulSoup(page.content, 'html.parser')
    my_wiki = " ".join(paragraph.get_text() for paragraph in soup.find_all('p')).lower()

    processed_wiki = processed_wiki_text(my_wiki)
    if not processed_wiki:
        print("No usable text was found in the page.", '\n')
        continue

    # The features must be built over the same vocabulary (word_features) the
    # classifier was trained on; features derived from the page's own words
    # would not line up with the features seen during training.
    wiki_features = document_features(processed_wiki, word_features)

    category = classifier.classify(wiki_features)
    print("Category: ", category, '\n')
