In [11]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def preprocess_documents(documents):
    """Coerce every document to a string so it can be tokenized or vectorized.

    Missing values (``None`` and float NaN) become empty strings rather than
    the literal tokens ``"None"`` / ``"nan"`` that a bare ``str()`` call would
    produce and that would otherwise end up in the vocabulary.

    Args:
        documents: Iterable of documents; items may be of any type.

    Returns:
        A list with one string per input item, in the original order.
    """
    def _to_text(doc):
        # NaN is the only float value that does not compare equal to itself.
        if doc is None or (isinstance(doc, float) and doc != doc):
            return ""
        return str(doc)

    return [_to_text(doc) for doc in documents]

class Word2VecVectorizer:
    """Represent each document as the mean of its Word2Vec word vectors.

    Provides ``fit`` / ``transform`` so an instance can be used as the
    feature-extraction step of a scikit-learn ``Pipeline``.

    NOTE: training runs with several worker threads by default, so the
    learned vectors may differ between runs (see the gensim Word2Vec
    documentation on reproducibility).
    """

    def __init__(self, dimension=100, window=5, min_count=1, workers=4):
        """Store the Word2Vec hyper-parameters; no training happens here.

        Args:
            dimension: Size of the word vectors and of every document vector.
            window: Forwarded to ``Word2Vec`` as ``window``.
            min_count: Forwarded to ``Word2Vec`` as ``min_count``.
            workers: Forwarded to ``Word2Vec`` as ``workers``.
        """
        self.model = None  # Trained Word2Vec model; populated by fit().
        self.dimension = dimension  # Dimensionality of Word2Vec vectors.
        self.window = window
        self.min_count = min_count
        self.workers = workers

    @staticmethod
    def _tokenize(document):
        """Lower-case *document* and split it into tokens."""
        return word_tokenize(document.lower())

    def fit(self, X, y=None):
        """Train a Word2Vec model on the tokenized documents in *X*.

        Args:
            X: Iterable of documents; each is coerced to ``str`` first.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            This vectorizer, so calls can be chained.
        """
        sentences = [self._tokenize(document) for document in preprocess_documents(X)]
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=self.dimension,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers,
        )
        return self

    def transform(self, X, y=None):
        """Convert each document in *X* into one averaged word vector.

        Tokens unknown to the trained model are ignored. A document with no
        known tokens maps to an all-zero vector.

        Args:
            X: Iterable of documents; each is coerced to ``str`` first.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            Array of shape ``(n_documents, dimension)``.

        Raises:
            AttributeError: If called before ``fit``.
        """
        if self.model is None:
            # Same exception type the unfitted model used to trigger, but with
            # a message that says what went wrong.
            raise AttributeError(
                "Word2VecVectorizer is not fitted yet; call fit() before transform()."
            )

        word_vectors = self.model.wv

        def document_vector(doc):
            known = [word_vectors[token] for token in self._tokenize(doc) if token in word_vectors]
            if known:
                return np.mean(known, axis=0)
            return np.zeros(self.dimension)

        vectors = [document_vector(document) for document in preprocess_documents(X)]
        if not vectors:
            # Keep the result two-dimensional even when there are no documents.
            return np.zeros((0, self.dimension))
        return np.array(vectors)

# Load dataset
# Headers, footers and quoted replies are removed from every post.
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
texts = newsgroups_data.data
labels = newsgroups_data.target

# One seed shared by every seeded step so repeated runs are comparable.
RANDOM_STATE = 42

# Preprocess and split dataset (80% train / 20% test)
texts_preprocessed = preprocess_documents(texts)
X_train, X_test, y_train, y_test = train_test_split(
    texts_preprocessed, labels, test_size=0.2, random_state=RANDOM_STATE
)

# Define feature extraction methods and algorithms
feature_extractors = {
    'CountVectorizer': CountVectorizer(),
    'Word2Vec': Word2VecVectorizer(),
}

algorithms = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machines': SVC(),
    # Seeded like the split: tree building permutes features randomly, so an
    # unseeded tree can score differently on identical data.
    'Decision Trees': DecisionTreeClassifier(random_state=RANDOM_STATE),
}

# Initialize results list
results = []

# Benchmarking: score every classifier on every feature representation.
for fe_name, fe in feature_extractors.items():
    # The extracted features do not depend on the classifier, so fit the
    # extractor on the training split once and reuse its output for every
    # algorithm instead of refitting it for each combination.
    train_features = fe.fit(X_train, y_train).transform(X_train)
    test_features = fe.transform(X_test)

    for algo_name, algo in algorithms.items():
        if fe_name == 'Word2Vec' and algo_name == 'Multinomial Naive Bayes':
            # MultinomialNB needs non-negative features, while averaged
            # Word2Vec vectors can contain negative components.
            continue  # Skip MultinomialNB when using Word2Vec

        print(f"Processing {fe_name} with {algo_name}")

        algo.fit(train_features, y_train)
        y_pred = algo.predict(test_features)

        accuracy = accuracy_score(y_test, y_pred)
        results.append({
            'Feature Extractor': fe_name,
            'Algorithm': algo_name,
            'Accuracy': accuracy
        })

# Gather the per-run records into a DataFrame (one row per combination)
results_df = pd.DataFrame(results)

# Select the row with the highest accuracy
best_result = results_df.loc[results_df['Accuracy'].idxmax()]

# Report the winning configuration
print("\nBest Configuration:", best_result, sep="\n")

# Write every result, best first, as a tab-separated table in a .txt file
results_df.sort_values('Accuracy', ascending=False).to_csv(
    'benchmark_results.txt',
    sep='\t',
    index=False,
)

print("Results saved to benchmark_results.txt.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing CountVectorizer with Multinomial Naive Bayes
Processing CountVectorizer with Logistic Regression
Processing CountVectorizer with Support Vector Machines
Processing CountVectorizer with Decision Trees
Processing Word2Vec with Logistic Regression
Processing Word2Vec with Support Vector Machines
Processing Word2Vec with Decision Trees

Best Configuration:
Feature Extractor        CountVectorizer
Algorithm            Logistic Regression
Accuracy                        0.667905
Name: 1, dtype: object
Results saved to benchmark_results.txt.
