# Building a medical vs. non-medical document classifier model

In [10]:
import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Download the NLTK resources this script relies on: the stop-word list,
# WordNet (for the lemmatizer) and the Punkt sentence-tokenizer models that
# nltk.word_tokenize() loads internally. Depending on the NLTK release the
# tokenizer data is looked up as 'punkt' or 'punkt_tab', so both are requested;
# without them word_tokenize() raises LookupError on a fresh environment.
for resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

def get_documents_from_category(category_title, max_documents=100):
    """Fetch the intro extracts of the pages listed in a Wikipedia category.

    Asks the English Wikipedia API for the members of ``category_title`` in a
    single ``list=categorymembers`` call, then downloads each member's intro
    extract through ``get_wikipedia_text``.

    Args:
        category_title: Category to list, e.g. ``"Category:Medicine"``.
        max_documents: Value sent as ``cmlimit``, i.e. the maximum number of
            category members requested from the API.

    Returns:
        A list of extract strings, one per member whose extract is not blank.
        The list is empty when the response contains no category members.

    Raises:
        requests.exceptions.RequestException: If the listing request fails or
            does not complete within the timeout.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmlimit": max_documents
    }
    # NOTE(review): no "cmtype" is sent, so the listing may also contain
    # subcategories and files, not only articles - confirm this is intended.

    # Without a timeout, requests waits indefinitely on a stalled connection,
    # which would hang the whole crawl; fail after 30 seconds instead.
    response = requests.get(base_url, params=params, timeout=30)
    data = response.json()

    # A response without "query"/"categorymembers" simply yields no documents.
    members = data.get("query", {}).get("categorymembers", [])

    documents = []
    for member in members:
        document_text = get_wikipedia_text(member["title"])
        # Skip members that have no extract or only whitespace.
        if document_text.strip():
            documents.append(document_text)

    return documents

def get_wikipedia_text(title):
    """Return the intro extract of one English Wikipedia page.

    Sends a ``prop=extracts`` query with ``exintro`` set for ``title``. Plain
    text is not requested, so the extract may contain HTML markup.

    Args:
        title: Title of the page to look up.

    Returns:
        The page's ``extract`` string, or ``""`` when the response carries no
        pages or the page has no extract.

    Raises:
        requests.exceptions.RequestException: If the request fails or does not
            complete within the timeout.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "exintro": True
    }

    # Without a timeout, requests waits indefinitely on a stalled connection;
    # this function is called once per page, so a single hang stops the crawl.
    response = requests.get(base_url, params=params, timeout=30)
    data = response.json()

    pages = data.get("query", {}).get("pages", {})
    # Only one title is requested, so only the first entry is inspected. The
    # {} default keeps an empty "pages" mapping from raising.
    first_page = next(iter(pages.values()), {})
    return first_page.get("extract", "")

def preprocess_text(text, use_stopwords=False, use_stemming=False, use_lemmatization=False):
    """Turn an HTML snippet into a space-separated string of cleaned tokens.

    The markup is stripped with BeautifulSoup, the remaining text is
    lower-cased and tokenized with ``nltk.word_tokenize``, and only purely
    alphabetic tokens are kept. The optional steps run in this order:
    stop-word removal, stemming, lemmatization.

    Args:
        text: Input text, possibly containing HTML tags.
        use_stopwords: Drop NLTK's English stop words when true.
        use_stemming: Apply the English Snowball stemmer when true.
        use_lemmatization: Apply the WordNet lemmatizer when true.

    Returns:
        The surviving tokens joined by single spaces.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()

    # Tokenize the lower-cased text and keep alphabetic tokens only.
    tokens = [tok for tok in nltk.word_tokenize(plain.lower()) if tok.isalpha()]

    if use_stopwords:
        excluded = set(stopwords.words("english"))
        tokens = [tok for tok in tokens if tok not in excluded]

    if use_stemming:
        stem = SnowballStemmer("english").stem
        tokens = list(map(stem, tokens))

    if use_lemmatization:
        lemmatize = WordNetLemmatizer().lemmatize
        tokens = list(map(lemmatize, tokens))

    return " ".join(tokens)

# Wikipedia categories that supply the training corpus. Every title carries
# the "Category:" prefix and is passed as-is to get_documents_from_category().
# Pages from the first list are labelled medical, pages from the second list
# non-medical. The order of the entries determines the order of the documents
# in the dataset, so it is kept stable.
medical_categories = [
    "Category:Medical literature", "Category:Medicine", "Category:Health",
    "Category:Anatomy", "Category:Diseases", "Category:Medical treatments",
    "Category:Oncology", "Category:Pediatrics", "Category:Pharmacology",
    "Category:Nursing", "Category:Public_health", "Category:Surgery",
    "Category:Medical_diagnosis", "Category:Genetics", "Category:Neurology",
    "Category:Psychiatry", "Category:Immunology", "Category:Cardiology",
]

non_medical_categories = [
    "Category:Science", "Category:History", "Category:Arts",
    "Category:Geography", "Category:Technology", "Category:Sport",
    "Category:Computing", "Category:Entertainment", "Category:Business",
    "Category:Food_and_drink", "Category:Philosophy", "Category:Literature",
    "Category:Music", "Category:Politics", "Category:Economics",
    "Category:Religion",
]

# --- Build the labelled corpus ----------------------------------------------
# Download the intro extracts of the member pages of every category, asking
# for at most 100 members per category.
medical_documents = []
for category in medical_categories:
    medical_documents.extend(get_documents_from_category(category, max_documents=100))

non_medical_documents = []
for category in non_medical_categories:
    non_medical_documents.extend(get_documents_from_category(category, max_documents=100))

# Clean every non-blank document: strip markup, drop stop words, stem.
preprocessed_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in medical_documents if doc.strip()]
preprocessed_non_medical_documents = [preprocess_text(doc, use_stopwords=True, use_stemming=True) for doc in non_medical_documents if doc.strip()]

# Medical documents come first and get label 1; non-medical ones get label 0.
documents = preprocessed_medical_documents + preprocessed_non_medical_documents
labels = [1] * len(preprocessed_medical_documents) + [0] * len(preprocessed_non_medical_documents)

# Keep a copy of the dataset on disk and show its first rows.
# pandas is imported with the other imports at the top of the file, so `pd`
# is already defined here (it used to be imported only further down, which
# raised NameError on a fresh run).
df = pd.DataFrame({'Text': documents, 'Label': labels})
df.to_csv('wikipedia_documents_labels.csv', index=False)
print(df.head())

# --- Train and compare the two classifiers ----------------------------------
# Hold out 20% of the documents for testing; the fixed seed makes the split
# reproducible.
# NOTE(review): the split is not stratified and pages that belong to several
# categories are not de-duplicated - confirm neither skews the scores.
X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size=0.2, random_state=42)

# Both models share the same bag-of-words features and differ only in the
# classifier stacked on top.
nb_model = make_pipeline(CountVectorizer(), MultinomialNB())
lr_model = make_pipeline(CountVectorizer(), LogisticRegression())

# 5-fold cross-validation accuracy on the training portion only.
nb_cv_scores = cross_val_score(nb_model, X_train, y_train, cv=5, scoring='accuracy')
lr_cv_scores = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')

print("Naive Bayes Cross-Validation Scores:", nb_cv_scores)
print("Logistic Regression Cross-Validation Scores:", lr_cv_scores)

print("Average Naive Bayes Cross-Validation Score:", np.mean(nb_cv_scores))
print("Average Logistic Regression Cross-Validation Score:", np.mean(lr_cv_scores))

# Refit on the full training set, then score the held-out test set.
nb_model.fit(X_train, y_train)
lr_model.fit(X_train, y_train)

nb_test_predictions = nb_model.predict(X_test)
lr_test_predictions = lr_model.predict(X_test)

nb_test_accuracy = accuracy_score(y_test, nb_test_predictions)
lr_test_accuracy = accuracy_score(y_test, lr_test_predictions)

# Precision and F1 use scikit-learn's default positive label (1), i.e. they
# describe how well the medical class is recognised.
nb_precision = precision_score(y_test, nb_test_predictions)
nb_f1_score = f1_score(y_test, nb_test_predictions)

lr_precision = precision_score(y_test, lr_test_predictions)
lr_f1_score = f1_score(y_test, lr_test_predictions)

# Side-by-side summary of the test-set metrics.
results_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'F1 Score'],
    'Naive Bayes': [nb_test_accuracy, nb_precision, nb_f1_score],
    'Logistic Regression': [lr_test_accuracy, lr_precision, lr_f1_score]
})

print(results_df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Abel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                Text  Label
0  medic literatur scientif literatur medicin art...      1
1  list list journal book recommend small hospit ...      1
2  medicin case report detail report symptom sign...      1
3  counterblast tobacco treatis written king jame...      1
4  cross qualiti chasm new health system centuri ...      1
Naive Bayes Cross-Validation Scores: [0.91419142 0.92739274 0.93069307 0.92409241 0.91721854]
Logistic Regression Cross-Validation Scores: [0.90759076 0.95379538 0.93069307 0.92079208 0.9205298 ]
Average Naive Bayes Cross-Validation Score: 0.9227176360020108
Average Logistic Regression Cross-Validation Score: 0.9266802176906431
      Metric  Naive Bayes  Logistic Regression
0   Accuracy     0.918206             0.936675
1  Precision     0.929104             0.964706
2   F1 Score     0.941399             0.953488
