# Building medical and non-medical document Classifier Model 

# Option one: train a model using train test split

In [15]:
#importing libraries
import nltk
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def get_documents_from_category(category_title, max_documents=100):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmlimit": max_documents
    }

    response = requests.get(base_url, params=params)
    data = response.json()
    
    documents = []
    # Check if the response contains category members
    if "query" in data and "categorymembers" in data["query"]:
        for member in data["query"]["categorymembers"]:
            page_title = member["title"]
            #print("page_title",page_title)
            document_text = get_wikipedia_text(page_title)
            #print(document_text)
            documents.append(document_text)
       
    return documents

def get_wikipedia_text(title):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True
    }

    response = requests.get(base_url, params=params)
    data = response.json()
    #print("json data", data) 
    # Check if the page exists
    if "query" in data and "pages" in data["query"]:
        page_id = list(data["query"]["pages"].keys())[0]

        # Check if the page has an extract
        if "extract" in data["query"]["pages"][page_id]:
            return data["query"]["pages"][page_id]["extract"]

    return ""

def preprocess_text(text, use_stopwords=False, use_stemming=False, use_lemmatization=False):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Tokenization
    words = nltk.word_tokenize(text.lower())

    # Remove non-alphabetic characters
    words = [word for word in words if word.isalpha()]

    # remove stopwords
    if use_stopwords:
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]

    # apply stemming
    if use_stemming:
        stemmer = SnowballStemmer("english")
        words = [stemmer.stem(word) for word in words]

    # apply lemmatization
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

# Fetch documents from multiple Wikipedia categories
medical_categories = [
    "Category:Medical literature",
    "Category:Medicine",
    "Category:Health",
    "Category:Anatomy",
    "Category:Diseases",
    "Category:Medical treatments",
    "Category:Oncology",
    "Category:Pediatrics"
]

non_medical_categories = [
    "Category:Science",
    "Category:History",
    "Category:Arts",
    "Category:Geography",
    "Category:Technology",
    "Category:Sport",
    "Category:Biology"
]

# Fetch documents from medical categories
medical_documents = []
for category in medical_categories:
    medical_documents.extend(get_documents_from_category(category, max_documents=100))

# Fetch documents from non-medical categories
non_medical_documents = []
for category in non_medical_categories:
    non_medical_documents.extend(get_documents_from_category(category, max_documents=100))

# Preprocess the texts
preprocessed_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in medical_documents]
preprocessed_non_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in non_medical_documents]

# Create a dataset with labels (1 for medical, 0 for non-medical)
documents = preprocessed_medical_documents + preprocessed_non_medical_documents
labels = [1] * len(preprocessed_medical_documents) + [0] * len(preprocessed_non_medical_documents)

# Split the data into training, validation, and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(documents, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create a Bag of Words model
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_val_bow = vectorizer.transform(X_val)
X_test_bow = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_bow, y_train)

# Train a Logistic Regression classifier
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_bow, y_train)

# Make predictions on the validation set
nb_val_predictions = nb_classifier.predict(X_val_bow)
lr_val_predictions = lr_classifier.predict(X_val_bow)

# Evaluate the models on the validation set
nb_val_accuracy = accuracy_score(y_val, nb_val_predictions)
lr_val_accuracy = accuracy_score(y_val, lr_val_predictions)

print(f"Naive Bayes Validation Accuracy: {nb_val_accuracy}")
print(f"Logistic Regression Validation Accuracy: {lr_val_accuracy}")

# Make predictions on the test set
nb_test_predictions = nb_classifier.predict(X_test_bow)
lr_test_predictions = lr_classifier.predict(X_test_bow)

# Evaluate the models on the test set
nb_test_accuracy = accuracy_score(y_test, nb_test_predictions)
lr_test_accuracy = accuracy_score(y_test, lr_test_predictions)

print(f"Naive Bayes Test Accuracy: {nb_test_accuracy}")
print(f"Logistic Regression Test Accuracy: {lr_test_accuracy}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Naive Bayes Validation Accuracy: 0.8270676691729323
Logistic Regression Validation Accuracy: 0.8872180451127819
Naive Bayes Test Accuracy: 0.7969924812030075
Logistic Regression Test Accuracy: 0.8045112781954887


# Option two: train a model using cross-validation

In [13]:
import numpy as np
import nltk
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def get_documents_from_category(category_title, max_documents=100):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmlimit": max_documents
    }

    response = requests.get(base_url, params=params)
    data = response.json()
    
    documents = []
    # Check if the response contains category members
    if "query" in data and "categorymembers" in data["query"]:
        for member in data["query"]["categorymembers"]:
            page_title = member["title"]
            #print("page_title",page_title)
            document_text = get_wikipedia_text(page_title)
            #print(document_text)
            documents.append(document_text)
       
    return documents

def get_wikipedia_text(title):
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True
    }

    response = requests.get(base_url, params=params)
    data = response.json()
    #print("json data", data) 
    # Check if the page exists
    if "query" in data and "pages" in data["query"]:
        page_id = list(data["query"]["pages"].keys())[0]

        # Check if the page has an extract
        if "extract" in data["query"]["pages"][page_id]:
            return data["query"]["pages"][page_id]["extract"]

    return ""

def preprocess_text(text, use_stopwords=False, use_stemming=False, use_lemmatization=False):
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Tokenization
    words = nltk.word_tokenize(text.lower())

    # Remove non-alphabetic characters
    words = [word for word in words if word.isalpha()]

    # remove stopwords
    if use_stopwords:
        stop_words = set(stopwords.words("english"))
        words = [word for word in words if word not in stop_words]

    # apply stemming
    if use_stemming:
        stemmer = SnowballStemmer("english")
        words = [stemmer.stem(word) for word in words]

    # apply lemmatization
    if use_lemmatization:
        lemmatizer = WordNetLemmatizer()
        words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

# Fetch documents from multiple Wikipedia categories
medical_categories = [
    "Category:Medical literature",
    "Category:Medicine",
    "Category:Health",
    "Category:Anatomy",
    "Category:Diseases",
    "Category:Medical treatments",
    "Category:Oncology",
    "Category:Pediatrics"
]

non_medical_categories = [
    "Category:Science",
    "Category:History",
    "Category:Arts",
    "Category:Geography",
    "Category:Technology",
    "Category:Sport",
    "Category:Biology"
]

# Fetch documents from medical categories
medical_documents = []
for category in medical_categories:
    medical_documents.extend(get_documents_from_category(category, max_documents=100))

# Fetch documents from non-medical categories
non_medical_documents = []
for category in non_medical_categories:
    non_medical_documents.extend(get_documents_from_category(category, max_documents=100))

# Preprocess the texts
preprocessed_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in medical_documents]
preprocessed_non_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in non_medical_documents]

# Create a dataset with labels (1 for medical, 0 for non-medical)
documents = preprocessed_medical_documents + preprocessed_non_medical_documents
labels = [1] * len(preprocessed_medical_documents) + [0] * len(preprocessed_non_medical_documents)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size=0.2, random_state=42)

# Create a pipeline with CountVectorizer and Naive Bayes
nb_model = make_pipeline(CountVectorizer(), MultinomialNB())

# Create a pipeline with CountVectorizer and Logistic Regression
lr_model = make_pipeline(CountVectorizer(), LogisticRegression())

# Perform cross-validation on the training set
nb_cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5, scoring='accuracy')
lr_cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')

# Display cross-validation scores
print("Naive Bayes Cross-Validation Scores:", nb_cv_scores)
print("Logistic Regression Cross-Validation Scores:", lr_cv_scores)

# Average cross-validation scores
print("Average Naive Bayes Cross-Validation Score:", np.mean(nb_cv_scores))
print("Average Logistic Regression Cross-Validation Score:", np.mean(lr_cv_scores))

# Train the models on the entire training set
nb_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)

# Evaluate the models on the test set
nb_test_predictions = nb_model.predict(X_test)
lr_test_predictions = lr_model.predict(X_test)

nb_test_accuracy = accuracy_score(y_test, nb_test_predictions)
lr_test_accuracy = accuracy_score(y_test, lr_test_predictions)

print("Naive Bayes Test Accuracy:", nb_test_accuracy)
print("Logistic Regression Test Accuracy:", lr_test_accuracy)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Naive Bayes Cross-Validation Scores: [0.82394366 0.82394366 0.79577465 0.86524823 0.69503546]
Logistic Regression Cross-Validation Scores: [0.83802817 0.85211268 0.84507042 0.81560284 0.84397163]
Average Naive Bayes Cross-Validation Score: 0.8007891319548497
Average Logistic Regression Cross-Validation Score: 0.838957147138148
Naive Bayes Test Accuracy: 0.8135593220338984
Logistic Regression Test Accuracy: 0.8305084745762712
