<a href="https://colab.research.google.com/github/Amirhatamian/NLP/blob/main/The_Classification_of_Texts_using_Wikipedia.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install wikipedia nltk scikit-learn PyPDF2




In [2]:
pip install Wikipedia-API



In [3]:
#Import Libraries and Download NLTK Data

import re
import string
import numpy as np
import wikipedia
import wikipediaapi
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import PyPDF2
import warnings

# Suppress specific warnings from a library
warnings.filterwarnings("ignore", category=UserWarning, module='wikipedia')

# Download NLTK data
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Initialize the Wikipedia API for the English language
wiki_wiki = wikipediaapi.Wikipedia('english')

# Function to retrieve text from a Wikipedia page
def get_wikipedia_text(page_title):
    # Retrieve the page object for the given title
    page = wiki_wiki.page(page_title)

    # Check if the page exists
    if not page.exists():
        return None

    # Return the text content of the page
    return page.text


In [5]:
#POS Tag Converter Function
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to noun if unknown

In [6]:
def extract_keywords(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    filtered_words = [
        lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
        for word, pos in tagged_words
        if word.isalnum() and word.lower() not in stop_words
    ]
    return filtered_words


In [7]:
def extract_nouns(text):
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    nouns = [
        lemmatizer.lemmatize(word.lower(), get_wordnet_pos(pos))
        for word, pos in tagged_words
        if pos.startswith('N') and word.lower() not in stop_words and word.isalnum()
    ]
    return nouns


In [8]:
def get_wikipedia_text(topic):
    try:
        # Get the summary of the Wikipedia page for the topic
        summary = wikipedia.summary(topic)
        return summary
    except wikipedia.exceptions.DisambiguationError as e:
        # Handle disambiguation errors by choosing the first option
        return wikipedia.summary(e.options[0])
    except wikipedia.exceptions.PageError:
        # Handle page errors (e.g., page not found)
        return None

In [9]:
def extract_top_nouns(topics, num_top_nouns=10):
    all_nouns = []

    for topic in topics:
        text = get_wikipedia_text(topic)
        if text:
            all_nouns.extend(extract_nouns(text))

    # Remove common stopwords from the list of nouns
    stop_words = set(stopwords.words('english'))
    filtered_nouns = [noun for noun in all_nouns if noun.lower() not in stop_words]

    # Create a frequency distribution of the nouns
    nouns_freq_dist = FreqDist(filtered_nouns)
    # Get the most common nouns
    top_nouns = [word for word, _ in nouns_freq_dist.most_common(num_top_nouns)]

    return top_nouns

In [10]:
# Sample annotated keywords for geographic and non_geographic topics

geographic_topics = ['New York', 'Mount Everest', 'Sahara Desert', 'Amazon River', 'Paris']
non_geographic_topics = ['Quantum Mechanics', 'Artificial Intelligence', 'Shakespeare', 'Modern Art', 'Jazz Music']


In [11]:
top_geographic_keywords = extract_top_nouns(geographic_topics, num_top_nouns=10)
top_non_geographic_keywords = extract_top_nouns(non_geographic_topics, num_top_nouns=10)

print("Top geographic keywords:", top_geographic_keywords)
print("Top non-geographic keywords:", top_non_geographic_keywords)

Top geographic keywords: ['state', 'river', 'new', 'city', 'world', 'york', 'paris', 'desert', 'area', 'region']
Top non-geographic keywords: ['ai', 'jazz', 'art', 'quantum', 'shakespeare', 'work', 'theory', 'system', 'intelligence', 'physic']


In [12]:
# Combine all topics
all_topics = geographic_topics + non_geographic_topics
all_docs = []
all_labels = []

# Fetch text and extract nouns for each topic
for topic in all_topics:
    text = get_wikipedia_text(topic)
    if text:
        nouns = extract_nouns(text)
        all_docs.append(" ".join(nouns))
        # Label as 1 for geographic and 0 for non-geographic
        all_labels.append(1 if topic in geographic_topics else 0)

# Combine top keywords from both categories
all_top_keywords = top_geographic_keywords + top_non_geographic_keywords

# Vectorize the documents using the top keywords
vectorizer = CountVectorizer(vocabulary=all_top_keywords)
X = vectorizer.transform(all_docs)
y = all_labels

print("Feature matrix shape:", X.shape)
print("Labels:", y)

Feature matrix shape: (9, 20)
Labels: [1, 1, 1, 1, 0, 0, 0, 0, 0]


In [13]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Majority class naive classifier
class NaiveClassifier:
    def __init__(self):
        self.majority_class = None

    def fit(self, X, y):
        unique_classes, counts = np.unique(y, return_counts=True)
        self.majority_class = unique_classes[np.argmax(counts)]

    def predict(self, X):
        return np.full(X.shape[0], self.majority_class)

# Instantiate and train the naive classifier
naive_classifier = NaiveClassifier()
naive_classifier.fit(X_train, y_train)

# Predict on the test set
naive_predictions = naive_classifier.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, naive_predictions)
report = classification_report(y_test, naive_predictions, target_names=['non-geographic', 'geographic'], zero_division=0)

print(f'Accuracy: {accuracy * 100:.2f}%')
print('Classification Report:')
print(report)


Accuracy: 50.00%
Classification Report:
                precision    recall  f1-score   support

non-geographic       0.50      1.00      0.67         1
    geographic       0.00      0.00      0.00         1

      accuracy                           0.50         2
     macro avg       0.25      0.50      0.33         2
  weighted avg       0.25      0.50      0.33         2



In [15]:

# Instantiate the logistic regression model with a specific random state for reproducibility
logistic_model = LogisticRegression(random_state=42)

# Train the logistic regression model on the training data
logistic_model.fit(X_train, y_train)

# Make predictions on the test data
logistic_predictions = logistic_model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, logistic_predictions)
report = classification_report(y_test, logistic_predictions, zero_division=0)

# Print performance metrics
print("Logistic Regression Performance:")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Logistic Regression Performance:
Accuracy: 50.00%
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



In [23]:
def classify_pdf(pdf_file_path, vectorizer, classifier):
    # Try-except block for error handling
    try:
        pdf_text = ''  # Initialize pdf_text to accumulate text from all pages
        with open(pdf_file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                text = page.extract_text()
                if text:
                    pdf_text += text
            print("Extracted text from PDF:", pdf_text)  # Debugging print statement

        # Check if pdf_text is not empty
        if not pdf_text:
            return "No extractable text found in the PDF."

        # Preprocess and vectorize the PDF text
        document_vectorized = vectorizer.transform([pdf_text])  # Ensure text is in a list format

        # Predict the class of the PDF document
        prediction = classifier.predict(document_vectorized)
        return 'The document is classified as geographic.' if prediction[0] == 1 else 'The document is classified as non-geographic.'
    except Exception as e:
        return f"An error occurred: {e}"