In [None]:
!pip install wikipedia

In [2]:
import wikipedia
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import random
import warnings
from bs4 import GuessedAtParserWarning

# Suppress the specific BeautifulSoup warning
warnings.filterwarnings('ignore', category=GuessedAtParserWarning)

In [6]:
# Download all required NLTK data
def download_nltk_resources():
    """Download the NLTK data packages used by this notebook.

    Each package is fetched quietly with ``nltk.download``. A failure for one
    package does not stop the remaining downloads; a warning line is printed
    for it instead.

    ``nltk.download`` normally reports a failed download through its boolean
    return value rather than by raising, so the return value is checked in
    addition to catching exceptions.

    Returns:
        None
    """
    resources = (
        'punkt',
        'stopwords',
        'wordnet',
        'averaged_perceptron_tagger',
        'punkt_tab',
    )
    for resource in resources:
        try:
            downloaded = nltk.download(resource, quiet=True)
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit
            # still interrupt the cell.
            downloaded = False
        if not downloaded:
            print(f"Warning: Could not download {resource}")

# Fetch the NLTK data as soon as this cell runs, i.e. before WikiTextClassifier
# (which calls stopwords.words and word_tokenize) is instantiated or used.
download_nltk_resources()

In [7]:
class WikiTextClassifier:
    """Binary text classifier trained on Wikipedia article content.

    Articles are labelled 1 ("Geographic") or 0 ("Non-Geographic"). The class
    bundles data collection, NLTK-based preprocessing, training/evaluation of
    a scikit-learn model, and prediction on new text.
    """

    def __init__(self):
        """Create the stemmer, lemmatizer and English stop-word set."""
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def collect_wikipedia_data(self, categories, samples_per_category=100):
        """
        Collect training data from Wikipedia.

        Args:
            categories: Mapping of search term -> truthy if the articles found
                for that term should be labelled geographic (1), else 0.
            samples_per_category: Maximum number of search results requested
                per search term.

        Returns:
            Tuple ``(data, labels)``: a list of article texts and a parallel
            list of 0/1 labels. Articles that cannot be loaded are skipped, so
            fewer than ``samples_per_category`` texts per term may be returned.
        """
        data = []
        labels = []

        # Rate limiting is a module-wide switch; enabling it once is enough
        # (follows API etiquette).
        wikipedia.set_rate_limiting(True)

        for category, is_geographic in categories.items():
            label = 1 if is_geographic else 0
            articles = wikipedia.search(category, results=samples_per_category)

            for article in articles:
                try:
                    # NOTE(review): wikipedia.page() may apply title
                    # auto-suggestion to `article`; confirm the pages returned
                    # are the ones found by the search.
                    content = wikipedia.page(article).content
                except Exception:
                    # Best effort: skip pages that fail to load (e.g.
                    # disambiguation or missing pages). Exception rather than
                    # a bare except so Ctrl-C can still stop a long crawl.
                    continue
                data.append(content)
                labels.append(label)

        return data, labels

    def preprocess_text(self, text, use_stemming=False, use_lemmatization=False):
        """
        Preprocess text with optional stemming and lemmatization.

        The text is lower-cased, tokenized with ``word_tokenize`` and stripped
        of English stop words. If ``use_stemming`` is truthy the tokens are
        stemmed; otherwise, if ``use_lemmatization`` is truthy, they are
        lemmatized. Stemming takes precedence when both flags are set.

        Returns:
            The remaining tokens joined by single spaces.
        """
        # Tokenization
        tokens = word_tokenize(text.lower())

        # Remove stopwords
        tokens = [token for token in tokens if token not in self.stop_words]

        # Apply stemming or lemmatization
        if use_stemming:
            tokens = [self.stemmer.stem(token) for token in tokens]
        elif use_lemmatization:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        return ' '.join(tokens)

    def train_classifier(self, X, y, model_type='naive_bayes',
                        use_stemming=False, use_lemmatization=False):
        """
        Train and evaluate a classifier with the specified options.

        Args:
            X: Iterable of raw text documents.
            y: Labels parallel to ``X``.
            model_type: ``'naive_bayes'`` selects CountVectorizer +
                MultinomialNB; any other value selects TfidfVectorizer +
                LogisticRegression(max_iter=1000).
            use_stemming: Passed through to ``preprocess_text``.
            use_lemmatization: Passed through to ``preprocess_text``.

        Returns:
            Dict with the fitted ``'model'`` and ``'vectorizer'``, the held-out
            ``'accuracy'`` and text ``'report'``, plus the ``'use_stemming'``
            and ``'use_lemmatization'`` flags used for training so that
            ``predict`` can apply the same preprocessing.
        """
        # Preprocess all texts
        processed_texts = [
            self.preprocess_text(text, use_stemming, use_lemmatization)
            for text in X
        ]

        # Split the texts BEFORE vectorizing: fitting the vectorizer on the
        # whole corpus would leak test-set vocabulary / IDF statistics into
        # training and bias the reported scores.
        train_texts, test_texts, y_train, y_test = train_test_split(
            processed_texts, y, test_size=0.2, random_state=42
        )

        # Create vectorizer and fit it on the training portion only
        vectorizer = CountVectorizer() if model_type == 'naive_bayes' else TfidfVectorizer()
        X_train = vectorizer.fit_transform(train_texts)
        X_test = vectorizer.transform(test_texts)

        # Select and train model
        if model_type == 'naive_bayes':
            model = MultinomialNB()
        else:
            model = LogisticRegression(max_iter=1000)

        model.fit(X_train, y_train)

        # Evaluate on the held-out 20%
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)

        return {
            'model': model,
            'vectorizer': vectorizer,
            'accuracy': accuracy,
            'report': report,
            'use_stemming': use_stemming,
            'use_lemmatization': use_lemmatization,
        }

    def predict(self, text, model_dict, use_stemming=None, use_lemmatization=None):
        """
        Predict the class of a new text.

        Args:
            text: Raw text to classify.
            model_dict: Dict holding at least ``'model'`` and ``'vectorizer'``,
                as returned by ``train_classifier``.
            use_stemming: Preprocessing flag; see below.
            use_lemmatization: Preprocessing flag; see below.

        When neither flag is given, the flags stored in ``model_dict`` are
        used so the text is preprocessed the same way as the training data
        (falling back to no stemming/lemmatization if the dict has none).
        When either flag is given explicitly, the given values are used and
        an omitted flag counts as False.

        Returns:
            ``"Geographic"`` if the model predicts label 1, otherwise
            ``"Non-Geographic"``.
        """
        if use_stemming is None and use_lemmatization is None:
            use_stemming = model_dict.get('use_stemming', False)
            use_lemmatization = model_dict.get('use_lemmatization', False)

        processed_text = self.preprocess_text(
            text, use_stemming, use_lemmatization
        )
        vectorized_text = model_dict['vectorizer'].transform([processed_text])
        prediction = model_dict['model'].predict(vectorized_text)
        return "Geographic" if prediction[0] == 1 else "Non-Geographic"

In [12]:
# Initialize classifier
classifier = WikiTextClassifier()

# Wikipedia search terms used for training, mapped to whether the articles
# found for that term are labelled geographic (True) or not (False).
categories = {
    'Geography': True,
    'Mountains': True,
    'Cities': True,
    'Countries': True,
    'Technology': False,
    'Science': False,
    'Art': False,
    'Music': False
}

print("Collecting Wikipedia data...")
X, y = classifier.collect_wikipedia_data(categories, samples_per_category=70)

# Train different models and compare results: for each model type, the
# preprocessing keyword arguments passed to train_classifier per variant.
models = {
    'naive_bayes': {
        'basic': {},
        'stemming': {'use_stemming': True},
        'lemmatization': {'use_lemmatization': True}
    },
    'logistic_regression': {
        'basic': {},
        'stemming': {'use_stemming': True},
        'lemmatization': {'use_lemmatization': True}
    }
}

results = {}
# Preprocessing flags each run was trained with, keyed like `results`, so the
# winning model can be fed text preprocessed the same way as its training data.
preprocessing_by_run = {}

for model_type, variants in models.items():
    for variant, params in variants.items():
        print(f"\nTraining {model_type} with {variant} configuration...")
        result = classifier.train_classifier(
            X, y, model_type=model_type, **params
        )
        run_name = f"{model_type}_{variant}"
        results[run_name] = result
        preprocessing_by_run[run_name] = params
        print(f"Accuracy: {result['accuracy']:.4f}")
        print("Classification Report:")
        print(result['report'])

# Test the best model with some example texts.
# best_model is a (run name, result dict) pair; on ties max() keeps the first run.
best_model = max(results.items(), key=lambda x: x[1]['accuracy'])
print(f"\nBest model: {best_model[0]}")

# Test examples
test_texts = [
    "Mount Everest is the highest mountain above sea level located in the Himalayas.",
    "The theory of relativity was proposed by Albert Einstein in 1915.",
]

for text in test_texts:
    # Pass the winning run's own preprocessing flags: a model trained on
    # stemmed/lemmatized text must not be queried with raw tokens.
    prediction = classifier.predict(
        text, best_model[1], **preprocessing_by_run[best_model[0]]
    )
    print(f"\nText: {text}")
    print(f"Prediction: {prediction}")



Collecting Wikipedia data...

Training naive_bayes with basic configuration...
Accuracy: 0.9583
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        49
           1       0.98      0.94      0.96        47

    accuracy                           0.96        96
   macro avg       0.96      0.96      0.96        96
weighted avg       0.96      0.96      0.96        96


Training naive_bayes with stemming configuration...
Accuracy: 0.9583
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        49
           1       0.98      0.94      0.96        47

    accuracy                           0.96        96
   macro avg       0.96      0.96      0.96        96
weighted avg       0.96      0.96      0.96        96


Training naive_bayes with lemmatization configuration...
Accuracy: 0.9583
Classification Report:
              precision    recall  f