### Importing Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec


### Loading the dataset

In [2]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

In [3]:
print(f"Number of documents: {len(newsgroups_data.data)}")
print(f"Number of categories: {len(newsgroups_data.target_names)}")
print("Categories:", newsgroups_data.target_names)

Number of documents: 2034
Number of categories: 4
Categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']


### Spliting the Datasets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(newsgroups_data.data, newsgroups_data.target, test_size=0.2, random_state=42)

### CountVectorizer

In [None]:
algorithms = {
    'Multinomial Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Support Vector Machine': SVC(),
    'Decision Tree': DecisionTreeClassifier()
}

results = []
for name, model in algorithms.items():
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, 'CountVectorizer', accuracy, f1))
    print(f"{name} with CountVectorizer - Accuracy: {accuracy:.3f}, F1-score: {f1:.3f}")



Multinomial Naive Bayes with CountVectorizer - Accuracy: 0.801, F1-score: 0.801
Logistic Regression with CountVectorizer - Accuracy: 0.771, F1-score: 0.768
Support Vector Machine with CountVectorizer - Accuracy: 0.501, F1-score: 0.470
Decision Tree with CountVectorizer - Accuracy: 0.600, F1-score: 0.604


### TfidTransformer

In [None]:
for name, model in algorithms.items():
    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, 'TfidfTransformer', accuracy, f1))
    print(f"{name} with TfidfTransformer - Accuracy: {accuracy:.3f}, F1-score: {f1:.3f}")


Multinomial Naive Bayes with TfidfTransformer - Accuracy: 0.749, F1-score: 0.695
Logistic Regression with TfidfTransformer - Accuracy: 0.811, F1-score: 0.803
Support Vector Machine with TfidfTransformer - Accuracy: 0.779, F1-score: 0.772
Decision Tree with TfidfTransformer - Accuracy: 0.582, F1-score: 0.573


### Word2Vec

In [None]:
X_train_tokenized = [doc.split() for doc in X_train]
X_test_tokenized = [doc.split() for doc in X_test]


word2vec_model = Word2Vec(sentences=X_train_tokenized, vector_size=100, window=5, min_count=2, workers=4)

def get_avg_word2vec_vectors(doc, model):
    vectors = [model.wv[word] for word in doc if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

X_train_w2v = np.array([get_avg_word2vec_vectors(doc, word2vec_model) for doc in X_train_tokenized])
X_test_w2v = np.array([get_avg_word2vec_vectors(doc, word2vec_model) for doc in X_test_tokenized])


for name, model in algorithms.items():
    if name == 'Multinomial Naive Bayes':
        continue  
    model.fit(X_train_w2v, y_train)
    y_pred = model.predict(X_test_w2v)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, 'Word2Vec', accuracy, f1))
    print(f"{name} with Word2Vec - Accuracy: {accuracy:.3f}, F1-score: {f1:.3f}")


Logistic Regression with Word2Vec - Accuracy: 0.484, F1-score: 0.437
Support Vector Machine with Word2Vec - Accuracy: 0.428, F1-score: 0.382
Decision Tree with Word2Vec - Accuracy: 0.373, F1-score: 0.372


### Doc2Vec

In [None]:

X_train_tagged = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(X_train)]
X_test_tagged = [TaggedDocument(words=doc.split(), tags=[i]) for i, doc in enumerate(X_test)]

doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=2, workers=4, epochs=20)
doc2vec_model.build_vocab(X_train_tagged)
doc2vec_model.train(X_train_tagged, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

X_train_d2v = np.array([doc2vec_model.infer_vector(doc.words) for doc in X_train_tagged])
X_test_d2v = np.array([doc2vec_model.infer_vector(doc.words) for doc in X_test_tagged])

for name, model in algorithms.items():
    if name == 'Multinomial Naive Bayes':
        continue 
    model.fit(X_train_d2v, y_train)
    y_pred = model.predict(X_test_d2v)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    results.append((name, 'Doc2Vec', accuracy, f1))
    print(f"{name} with Doc2Vec - Accuracy: {accuracy:.3f}, F1-score: {f1:.3f}")

Logistic Regression with Doc2Vec - Accuracy: 0.676, F1-score: 0.669
Support Vector Machine with Doc2Vec - Accuracy: 0.654, F1-score: 0.631
Decision Tree with Doc2Vec - Accuracy: 0.501, F1-score: 0.497
