In [62]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy
import string
import re
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")




In [63]:
# Load 20 Newsgroups dataset
categories = None  # Set to None to load all categories
dataset = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Split into train (60%), validation (20%), and test (20%)
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset.data, dataset.target, test_size=0.4, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)


In [64]:
def preprocess_text(text):
    """Preprocess text by tokenizing, lemmatizing, and removing stopwords using SpaCy."""
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return ' '.join(tokens)

# Apply preprocessing
train_texts = [preprocess_text(text) for text in train_texts]
val_texts = [preprocess_text(text) for text in val_texts]
test_texts = [preprocess_text(text) for text in test_texts]


In [65]:
# Convert text to TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_texts)
X_val = vectorizer.transform(val_texts)
X_test = vectorizer.transform(test_texts)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)


In [66]:
# Train Naïve Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

# Train Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

# Train ANN model
ann_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, activation='relu', solver='adam')
ann_model.fit(X_train, y_train)


In [67]:
# Evaluate models
def evaluate_model(model, X, y_true, dataset_name):
    y_pred = model.predict(X)
    print(f"{dataset_name} Metrics for {model.__class__.__name__}:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.4f}\n")

# Report results for all models
models = {
    "Naïve Bayes": nb_model,
    "SVM": svm_model,
    "Logistic Regression": logreg_model,
    "ANN": ann_model
}

In [68]:
for name, model in models.items():
    evaluate_model(model, X_train, y_train, f"Training ({name})")
    evaluate_model(model, X_val, y_val, f"Validation ({name})")
    evaluate_model(model, X_test, y_test, f"Test ({name})")

print("All models trained and evaluated successfully!")


Training (Naïve Bayes) Metrics for MultinomialNB:
Accuracy: 0.7951
Precision: 0.8210
Recall: 0.7951
F1 Score: 0.7889

Validation (Naïve Bayes) Metrics for MultinomialNB:
Accuracy: 0.6811
Precision: 0.7092
Recall: 0.6811
F1 Score: 0.6739

Test (Naïve Bayes) Metrics for MultinomialNB:
Accuracy: 0.6923
Precision: 0.7019
Recall: 0.6923
F1 Score: 0.6826

Training (SVM) Metrics for SVC:
Accuracy: 0.9062
Precision: 0.9170
Recall: 0.9062
F1 Score: 0.9086

Validation (SVM) Metrics for SVC:
Accuracy: 0.6718
Precision: 0.6901
Recall: 0.6718
F1 Score: 0.6763

Test (SVM) Metrics for SVC:
Accuracy: 0.6761
Precision: 0.6892
Recall: 0.6761
F1 Score: 0.6784

Training (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.8527
Precision: 0.8629
Recall: 0.8527
F1 Score: 0.8540

Validation (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.6885
Precision: 0.7009
Recall: 0.6885
F1 Score: 0.6892

Test (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.6944
Precision:

### Model Performance Comparison (TfIdf)

| Model                  | Dataset     | Accuracy | Precision | Recall | F1 Score |
|------------------------|------------|----------|-----------|--------|----------|
| **Naïve Bayes (MultinomialNB)** | Training   | 0.7951   | 0.8210    | 0.7951 | 0.7889   |
|                        | Validation | 0.6811   | 0.7092    | 0.6811 | 0.6739   |
|                        | Test       | 0.6923   | 0.7019    | 0.6923 | 0.6826   |
| **SVM (SVC)**          | Training   | 0.9062   | 0.9170    | 0.9062 | 0.9086   |
|                        | Validation | 0.6718   | 0.6901    | 0.6718 | 0.6763   |
|                        | Test       | 0.6761   | 0.6892    | 0.6761 | 0.6784   |
| **Logistic Regression**| Training   | 0.8527   | 0.8629    | 0.8527 | 0.8540   |
|                        | Validation | 0.6885   | 0.7009    | 0.6885 | 0.6892   |
|                        | Test       | 0.6944   | 0.7023    | 0.6944 | 0.6941   |
| **ANN (MLPClassifier)**| Training   | 0.9710   | 0.9809    | 0.9710 | 0.9738   |
|                        | Validation | 0.6588   | 0.6717    | 0.6588 | 0.6628   |
|                        | Test       | 0.6642   | 0.6746    | 0.6642 | 0.6667   |



## Word2Vec

In [55]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import spacy
import string
import re
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

# Load SpaCy model
nlp = spacy.load("en_core_web_sm")



In [13]:
# Load 20 Newsgroups dataset
categories = None  # Set to None to load all categories
dataset = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

In [14]:
# Split into train (60%), validation (20%), and test (20%)
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset.data, dataset.target, test_size=0.4, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(test_texts, test_labels, test_size=0.5, random_state=42)


In [41]:
def preprocess_text(text):
    """Preprocess text by tokenizing, lemmatizing, and removing stopwords using SpaCy."""
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop]
    return tokens

In [44]:

# Apply preprocessing
train_tokens = [preprocess_text(text) for text in train_texts]
val_tokens = [preprocess_text(text) for text in val_texts]
test_tokens = [preprocess_text(text) for text in test_texts]

In [45]:
# Train Word2Vec model
w2v_model = Word2Vec(sentences=train_tokens, vector_size=100, window=5, min_count=2, workers=4)

In [46]:
def get_average_word2vec(tokens, model, vector_size):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(vectors, axis=0)

In [47]:
# Convert text to Word2Vec embeddings
X_train = np.array([get_average_word2vec(tokens, w2v_model, 100) for tokens in train_tokens])
X_val = np.array([get_average_word2vec(tokens, w2v_model, 100) for tokens in val_tokens])
X_test = np.array([get_average_word2vec(tokens, w2v_model, 100) for tokens in test_tokens])


In [48]:
# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

y_train = np.array(train_labels)
y_val = np.array(val_labels)
y_test = np.array(test_labels)


In [56]:
# Train Naïve Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)


In [57]:
# Train SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

In [58]:
# Train Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000)
logreg_model.fit(X_train, y_train)

In [59]:
# Train ANN model
ann_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, activation='relu', solver='adam')
ann_model.fit(X_train, y_train)

In [60]:
# Evaluate models
def evaluate_model(model, X, y_true, dataset_name):
    y_pred = model.predict(X)
    print(f"{dataset_name} Metrics for {model.__class__.__name__}:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred, average='weighted'):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred, average='weighted'):.4f}\n")

# Report results for all models
models = {
    "Naïve Bayes": nb_model,
    "SVM": svm_model,
    "Logistic Regression": logreg_model,
    "ANN": ann_model
}

In [61]:
for name, model in models.items():
    evaluate_model(model, X_train, y_train, f"Training ({name})")
    evaluate_model(model, X_val, y_val, f"Validation ({name})")
    evaluate_model(model, X_test, y_test, f"Test ({name})")

print("All models trained and evaluated successfully!")

Training (Naïve Bayes) Metrics for GaussianNB:
Accuracy: 0.3624
Precision: 0.3670
Recall: 0.3624
F1 Score: 0.3532

Validation (Naïve Bayes) Metrics for GaussianNB:
Accuracy: 0.3534
Precision: 0.3587
Recall: 0.3534
F1 Score: 0.3458

Test (Naïve Bayes) Metrics for GaussianNB:
Accuracy: 0.3589
Precision: 0.3640
Recall: 0.3589
F1 Score: 0.3511

Training (SVM) Metrics for SVC:
Accuracy: 0.5701
Precision: 0.5675
Recall: 0.5701
F1 Score: 0.5638

Validation (SVM) Metrics for SVC:
Accuracy: 0.5161
Precision: 0.5153
Recall: 0.5161
F1 Score: 0.5109

Test (SVM) Metrics for SVC:
Accuracy: 0.5095
Precision: 0.5090
Recall: 0.5095
F1 Score: 0.5054

Training (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.5583
Precision: 0.5544
Recall: 0.5583
F1 Score: 0.5509

Validation (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.5179
Precision: 0.5147
Recall: 0.5179
F1 Score: 0.5105

Test (Logistic Regression) Metrics for LogisticRegression:
Accuracy: 0.5122
Precision: 0.5090
R

### Model Performance Comparison (Word2Vec)

| Model                  | Dataset     | Accuracy | Precision | Recall | F1 Score |
|------------------------|------------|----------|-----------|--------|----------|
| **Naïve Bayes (GaussianNB)** | Training   | 0.3624   | 0.3670    | 0.3624 | 0.3532   |
|                        | Validation | 0.3534   | 0.3587    | 0.3534 | 0.3458   |
|                        | Test       | 0.3589   | 0.3640    | 0.3589 | 0.3511   |
| **SVM (SVC)**          | Training   | 0.5701   | 0.5675    | 0.5701 | 0.5638   |
|                        | Validation | 0.5161   | 0.5153    | 0.5161 | 0.5109   |
|                        | Test       | 0.5095   | 0.5090    | 0.5095 | 0.5054   |
| **Logistic Regression**| Training   | 0.5583   | 0.5544    | 0.5583 | 0.5509   |
|                        | Validation | 0.5179   | 0.5147    | 0.5179 | 0.5105   |
|                        | Test       | 0.5122   | 0.5090    | 0.5122 | 0.5063   |
| **ANN (MLPClassifier)**| Training   | 0.6757   | 0.6784    | 0.6757 | 0.6730   |
|                        | Validation | 0.4879   | 0.4892    | 0.4879 | 0.4848   |
|                        | Test       | 0.4812   | 0.4836    | 0.4812 | 0.4782   |

