In [73]:
import numpy as np
import pandas as pd
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

In [75]:
## Load 20 Newsgroups Dataset

data = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [77]:
#Feature Extraction
vectorizers = {
    "CountVectorizer": CountVectorizer(),
    "TF-IDF": TfidfVectorizer()
}


In [79]:
models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier()
}


In [81]:
# Benchmarking Models with Different Feature Extractors
results = []

for vect_name, vectorizer in vectorizers.items():
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    for model_name, model in models.items():
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)
        acc = accuracy_score(y_test, y_pred)
        results.append({
            "Feature Extractor": vect_name,
            "Model": model_name,
            "Accuracy": round(acc * 100, 2)
        })


In [82]:
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
print(df_results.to_string(index=False))


Feature Extractor               Model  Accuracy
           TF-IDF                 SVM     90.64
           TF-IDF Logistic Regression     89.39
  CountVectorizer Logistic Regression     88.54
  CountVectorizer         Naive Bayes     85.12
           TF-IDF         Naive Bayes     84.75
  CountVectorizer       Decision Tree     64.40
           TF-IDF       Decision Tree     61.80
  CountVectorizer                 SVM     35.89


In [93]:
#Load 20 Newsgroups Dataset
from sklearn.datasets import fetch_20newsgroups

# Load the training and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

X_train_raw = newsgroups_train.data
X_test_raw = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [97]:

# Feature Extraction with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(stop_words='english', max_features=5000)

X_train_count = count_vectorizer.fit_transform(X_train_raw)
X_test_count = count_vectorizer.transform(X_test_raw)

In [99]:
from gensim.models import Word2Vec
import numpy as np

# Tokenize documents
def tokenize(doc):
    return [word for word in doc.lower().split() if len(word) > 2]

tokenized_docs_train = [tokenize(doc) for doc in X_train_raw]
tokenized_docs_test = [tokenize(doc) for doc in X_test_raw]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_docs_train, vector_size=100, window=5, min_count=5)

# Function to get document vector (mean of word vectors)
def document_vector(doc):
    vectors = [w2v_model.wv[word] for word in doc if word in w2v_model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

X_train_w2v = np.array([document_vector(doc) for doc in tokenized_docs_train])
X_test_w2v = np.array([document_vector(doc) for doc in tokenized_docs_test])

In [101]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Tag documents for Doc2Vec
train_tagged = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs_train)]
test_tagged = [TaggedDocument(words=doc, tags=[i]) for i, doc in enumerate(tokenized_docs_test)]

# Train Doc2Vec model
d2v_model = Doc2Vec(vector_size=100, window=5, min_count=5, epochs=20)
d2v_model.build_vocab(train_tagged)
d2v_model.train(train_tagged, total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

# Get vectors
X_train_d2v = np.array([d2v_model.dv[i] for i in range(len(train_tagged))])
X_test_d2v = np.array([d2v_model.infer_vector(doc.words) for doc in test_tagged])

In [103]:
##train classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

results = []

# Define a function to train and evaluate
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name, feature_name):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    results.append((feature_name, model_name, acc))

In [109]:
#For CountVectorizer:
evaluate_model(MultinomialNB(), X_train_count, y_train, X_test_count, y_test, 'Naive Bayes', 'CountVectorizer')

evaluate_model(LogisticRegression(max_iter=3000, solver='saga'), X_train_count, y_train, X_test_count, y_test, 'Logistic Regression', 'CountVectorizer')

evaluate_model(LinearSVC(max_iter=3000), X_train_count, y_train, X_test_count, y_test, 'SVM', 'CountVectorizer')

evaluate_model(DecisionTreeClassifier(), X_train_count, y_train, X_test_count, y_test, 'Decision Tree', 'CountVectorizer')



In [117]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_raw)
X_test_tfidf = tfidf_vectorizer.transform(X_test_raw)
# For TF-IDF:
evaluate_model(MultinomialNB(), X_train_tfidf, y_train, X_test_tfidf, y_test, 'Naive Bayes', 'TF-IDF')
evaluate_model(LogisticRegression(max_iter=1000), X_train_tfidf, y_train, X_test_tfidf, y_test, 'Logistic Regression', 'TF-IDF')
evaluate_model(LinearSVC(), X_train_tfidf, y_train, X_test_tfidf, y_test, 'SVM', 'TF-IDF')
evaluate_model(DecisionTreeClassifier(), X_train_tfidf, y_train, X_test_tfidf, y_test, 'Decision Tree', 'TF-IDF')

In [119]:
#For Word2Vec & Doc2Vec (No Naive Bayes because it's not compatible with negative values)
for feature_name, X_train_feat, X_test_feat in [('Word2Vec', X_train_w2v, X_test_w2v), ('Doc2Vec', X_train_d2v, X_test_d2v)]:
    evaluate_model(LogisticRegression(max_iter=1000), X_train_feat, y_train, X_test_feat, y_test, 'Logistic Regression', feature_name)
    evaluate_model(LinearSVC(), X_train_feat, y_train, X_test_feat, y_test, 'SVM', feature_name)
    evaluate_model(DecisionTreeClassifier(), X_train_feat, y_train, X_test_feat, y_test, 'Decision Tree', feature_name)

In [121]:
#creat table
import pandas as pd

df_results = pd.DataFrame(results, columns=['Feature', 'Model', 'Accuracy'])
print(df_results)

            Feature                Model  Accuracy
0   CountVectorizer          Naive Bayes  0.594663
1   CountVectorizer  Logistic Regression  0.433617
2   CountVectorizer                  SVM  0.552974
3   CountVectorizer        Decision Tree  0.436272
4   CountVectorizer          Naive Bayes  0.594663
5   CountVectorizer  Logistic Regression  0.433617
6   CountVectorizer                  SVM  0.552974
7   CountVectorizer        Decision Tree  0.442379
8            TF-IDF          Naive Bayes  0.650558
9            TF-IDF  Logistic Regression  0.647637
10           TF-IDF                  SVM  0.627058
11           TF-IDF        Decision Tree  0.434413
12         Word2Vec  Logistic Regression  0.317578
13         Word2Vec                  SVM  0.338821
14         Word2Vec        Decision Tree  0.169012
15          Doc2Vec  Logistic Regression  0.467207
16          Doc2Vec                  SVM  0.420738
17          Doc2Vec        Decision Tree  0.177111


In [142]:
df_results.to_csv("ElhamSaeidi_Task0_Text_Classification.txt", index=False, sep="\t")


In [144]:
!pip install python-docx
from docx import Document
doc = Document()
doc.add_heading('Text Classification Benchmark Results', 0)
table = doc.add_table(rows=1, cols=3)
table.style = 'Table Grid'
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Feature'
hdr_cells[1].text = 'Model'
hdr_cells[2].text = 'Accuracy (%)'
for index, row in df_results.iterrows():
    row_cells = table.add_row().cells
    row_cells[0].text = str(row['Feature'])
    row_cells[1].text = str(row['Model'])
    row_cells[2].text = f"{row['Accuracy']:.2f}"
doc.save("ElhamSaeidi2_Task0_Text_Classification.docx")


