In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from gensim import corpora, models

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

In [2]:
from sklearn.svm import SVC

from sklearn.naive_bayes import GaussianNB

from sklearn import tree

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier

from sklearn.neural_network import MLPClassifier

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

In [4]:
import numpy as np
import pandas as pd

In [5]:
svc = SVC()
gnb = GaussianNB()
dec_tree = tree.DecisionTreeClassifier()
rfc = RandomForestClassifier(max_depth=5, random_state=42)
abc = AdaBoostClassifier(n_estimators=100, random_state=42)
mlpc = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(100,), random_state=42)

In [6]:
def preprocess_text(text):
    tokens = word_tokenize(text)
    
    tokens = [word.lower() for word in tokens if word.isalnum()]
    
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words ]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [7]:
def filter_text_by_pos(text, pos_to_keep):
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    
    filtered_tokens = [token for token, pos in tagged_tokens if pos in pos_to_keep]
    
    return filtered_tokens

In [8]:
def tfidf_vectorize(data):
    if type(data[0]) == list:
        data = [' '.join(d) for d in data]
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    
    return tfidf_vectorizer.fit_transform(data).toarray()

In [9]:
def word2vec_vectorize(data):
    model = Word2Vec(sentences=data, vector_size=100, window=5, min_count=1, workers=4)

    def document_vector(model, tokenized_document):
        vectors = [model.wv[word] for word in tokenized_document if word in model.wv]
        return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)
    
    return np.array([document_vector(model, doc) for doc in data])

In [10]:
def LDA_vectorize(data):
    num_topics = 10
    text_vectors = []

    dictionary = corpora.Dictionary(data)
    bow_corpus = [dictionary.doc2bow(doc) for doc in data]

    model = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=num_topics, passes=15)

    for doc_bow in bow_corpus:
        document_topics = model.get_document_topics(doc_bow, minimum_probability=0.0)
        document_topic_vector = [topic_prob for _, topic_prob in document_topics]
        text_vectors.append(document_topic_vector)

    return np.array(text_vectors)

In [11]:
def classify(model, train_data, test_data, train_target, test_target):
    model.fit(train_data, train_target)

    predictions = model.predict(test_data)

    f1 = f1_score(test_target, predictions, average='weighted')

    res = f'F1 Score: {f1:.4f}'
    print(res)
    return res

In [13]:
def main(dataset, tokinizers, classifiers):
    df = pd.DataFrame("", index=[tokinizer for tokinizer in tokinizers], columns=[classifier for classifier in classifiers])

    preprocessed_data = [preprocess_text(text) for text in dataset.data]
    
    n = len(preprocessed_data)
    print(n)

    noun_adj_data = [filter_text_by_pos(text, pos_to_keep=['NN', 'NNS', 'JJ', 'JJR', 'JJS']) for text in preprocessed_data]

    preprocessed_data = [text.split(' ') for text in preprocessed_data]
    
    list_of_data = {"ALL": preprocessed_data, "NOUNS and ADJ": noun_adj_data}

    for classifier in classifiers:
        print(classifier)

        for name, data in list_of_data.items():
            print("start calculate " + name)

            for tokinizer in tokinizers:
                print(tokinizer)
                vectors = tokinizers[tokinizer](data)

                train_data, test_data, train_target, test_target = train_test_split(
                        vectors, dataset.target, test_size=0.2, random_state=42)

                print(train_data.shape, test_data.shape)

            
                df.loc[tokinizer, classifier] = df.loc[tokinizer, classifier] + name + ": " + classify(classifiers[classifier], train_data, test_data, train_target, test_target) + "\n"

    return df

In [14]:
tokinizers = {
    "tfidf": tfidf_vectorize,
    "LDA": LDA_vectorize,
    "word2vec": word2vec_vectorize
}

classifiers = {
    "SVM": svc,
    "Naive Bayes": gnb,
    "Decision Trees": dec_tree,
    "Random Forest": rfc,
    "Ada Boost classifier": abc,
    "MLP": mlpc
}


In [15]:
categories_light = ["comp.graphics", "rec.autos", "sci.med",  "talk.politics.mideast"]
categories_hard = ["talk.politics.guns", "talk.politics.mideast",  "talk.politics.misc"]
categories = {"light": fetch_20newsgroups(subset="all",
                                remove=("headers", "footers", "quotes"),
                                categories=categories_light), 
              "hard": fetch_20newsgroups(subset="all",
                                remove=("headers", "footers", "quotes"),
                                categories=categories_hard)}

In [17]:
folder = "./20news_classifier/"

for data in categories:
    print(data)
    df = main(categories[data], tokinizers, classifiers)
    df.to_excel(folder + data + ".xlsx")

light
3893
SVM
start calculate ALL
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.8847
LDA
(3114, 10) (779, 10)
F1 Score: 0.7824
word2vec
(3114, 100) (779, 100)
F1 Score: 0.6786
start calculate NOUNS and ADJ
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.8822
LDA
(3114, 10) (779, 10)
F1 Score: 0.8135
word2vec
(3114, 100) (779, 100)
F1 Score: 0.7444
Naive Bayes
start calculate ALL
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.8488
LDA
(3114, 10) (779, 10)
F1 Score: 0.7954
word2vec
(3114, 100) (779, 100)
F1 Score: 0.6180
start calculate NOUNS and ADJ
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.8542
LDA
(3114, 10) (779, 10)
F1 Score: 0.8129
word2vec
(3114, 100) (779, 100)
F1 Score: 0.6149
Decision Trees
start calculate ALL
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.7381
LDA
(3114, 10) (779, 10)
F1 Score: 0.7702
word2vec
(3114, 100) (779, 100)
F1 Score: 0.6634
start calculate NOUNS and ADJ
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.7412
LDA
(3114, 10) (779, 10)
F1 Score: 0.7015
word2vec
(3114, 100) (7



F1 Score: 0.8269
word2vec
(3114, 100) (779, 100)




F1 Score: 0.7543
start calculate NOUNS and ADJ
tfidf
(3114, 5000) (779, 5000)
F1 Score: 0.8771
LDA
(3114, 10) (779, 10)




F1 Score: 0.8130
word2vec
(3114, 100) (779, 100)




F1 Score: 0.7920
hard
2625
SVM
start calculate ALL
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.8271
LDA
(2100, 10) (525, 10)
F1 Score: 0.6413
word2vec
(2100, 100) (525, 100)
F1 Score: 0.4385
start calculate NOUNS and ADJ
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.8262
LDA
(2100, 10) (525, 10)
F1 Score: 0.6447
word2vec
(2100, 100) (525, 100)
F1 Score: 0.4694
Naive Bayes
start calculate ALL
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.7408
LDA
(2100, 10) (525, 10)
F1 Score: 0.6178
word2vec
(2100, 100) (525, 100)
F1 Score: 0.5048
start calculate NOUNS and ADJ
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.7354
LDA
(2100, 10) (525, 10)
F1 Score: 0.6546
word2vec
(2100, 100) (525, 100)
F1 Score: 0.5761
Decision Trees
start calculate ALL
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.7058
LDA
(2100, 10) (525, 10)
F1 Score: 0.5770
word2vec
(2100, 100) (525, 100)
F1 Score: 0.5674
start calculate NOUNS and ADJ
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.6732
LDA
(2100, 10) (525, 10)
F1 Score: 0.5520
word2ve



F1 Score: 0.6105
word2vec
(2100, 100) (525, 100)
F1 Score: 0.6355
start calculate NOUNS and ADJ
tfidf
(2100, 5000) (525, 5000)
F1 Score: 0.8105
LDA
(2100, 10) (525, 10)




F1 Score: 0.6624
word2vec
(2100, 100) (525, 100)
F1 Score: 0.6723


