In [44]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

import warnings
warnings.filterwarnings("ignore")

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [39]:
def get_res(typename, dataset, solver="adam",
            hidden_layer_sizes=(10, 10),
            activation="tanh", learning_rate="constant", max_iter=1000):

    x_train, x_test, y_train, y_test = split_data(dataset)

    clf = MLPClassifier(random_state=42,
                        solver=solver,
                        hidden_layer_sizes=hidden_layer_sizes,
                        max_iter=max_iter,
                        learning_rate=learning_rate,
                        activation=activation)

    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")

    if typename != "TFIDF":
        print(f"    {typename}:     {round(acc_test, 3)}")
    else:
        print(f"    {typename}:   {round(acc_test, 3)}")

In [4]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = lemmatizer.lemmatize(word.lower())
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None\
            and not any(sym in word for sym in special_sym):
            res += [word]
    return res

In [9]:
def get_bow_var(bow, dictn):
    res = []
    for i in range(len(bow)):
        doc = [0 for j in range(len(dictn))]
        for val in bow[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res


def get_tfidf_var(tfidf, dictn):
    res = []
    for i in range(len(tfidf)):
        doc = [0 for j in range(len(dictn))]
        for val in tfidf[i]:
            doc[val[0]] = val[1]

        res += [doc]
    return res

In [10]:
def get_lsi(corpus, dictn):
    lsi_model = models.LsiModel(corpus=corpus, num_topics=20, id2word=dictn)
    lsi_res = []
    for i in range(len(corpus)):
        lsi_res += [[val[1] for val in lsi_model[corpus[i]]]]
    return lsi_res


def get_lda(corpus, dictn, alpha, bbeta):
    lda_model = models.LdaModel(corpus=corpus, num_topics=20, id2word=dictn, passes=10, alpha=alpha, eta=bbeta)
    lda_res = []
    for i in range(len(corpus)):
        lda_res += [[val[1] for val in lda_model.get_document_topics(corpus[i], minimum_probability=0.0)]]
    return lda_res

In [5]:
tokenize_data = [delete_stopword_and_lemmatize(nltk.word_tokenize(newsgroups_train.data[i]))
                 for i in range(len(newsgroups_train.data))]

In [13]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)

bow = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(bow)
tfidf = [model[doc] for doc in bow]

In [14]:
lsi = get_lsi(tfidf, dictn)
lda = get_lda(bow, dictn, 'symmetric', None)

In [30]:
embed = [get_bow_var(bow, dictn), get_tfidf_var(tfidf, dictn), lsi, lda]
names = ["BoW", "TFIDF", "LSI", "LDA"]

In [32]:
vec_hidden_layer_sizes = [(10, ), (30, ), (100, ), (10, 10)]

for hidden_layer in vec_hidden_layer_sizes:
    print(f"MLPClassifier {hidden_layer}:")
    for i in range(len(embed)):
        get_res(f"{names[i]}", embed[i], hidden_layer_sizes=hidden_layer)
    print("--------------------------------")

MLPClassifier (10,):
    BoW:     0.909
    TFIDF:   0.914
    LSI:     0.932
    LDA:     0.887
--------------------------------
MLPClassifier (30,):
    BoW:     0.914
    TFIDF:   0.91
    LSI:     0.939
    LDA:     0.887
--------------------------------
MLPClassifier (100,):
    BoW:     0.91
    TFIDF:   0.917




    LSI:     0.936




    LDA:     0.885
--------------------------------
MLPClassifier (10, 10):
    BoW:     0.901
    TFIDF:   0.908
    LSI:     0.937
    LDA:     0.888
--------------------------------


In [35]:
vec_activation = ["identity", "logistic", "tanh", "relu"]

for elem in vec_activation:
    print(f"MLPClassifier {elem}:")
    for i in range(len(embed)):
        get_res(f"{names[i]}", embed[i], activation=elem)
    print("--------------------------------")

MLPClassifier identity:
    BoW:     0.899
    TFIDF:   0.907
    LSI:     0.93
    LDA:     0.886
--------------------------------
MLPClassifier logistic:
    BoW:     0.906
    TFIDF:   0.909




    LSI:     0.936
    LDA:     0.887
--------------------------------
MLPClassifier tanh:
    BoW:     0.91
    TFIDF:   0.908
    LSI:     0.934




    LDA:     0.89
--------------------------------
MLPClassifier relu:
    BoW:     0.901
    TFIDF:   0.908
    LSI:     0.937
    LDA:     0.888
--------------------------------


In [46]:
vec_solver = ["adam", "sgd", "lbfgs"]

for elem in vec_solver:
    print(f"MLPClassifier {elem}:")
    for i in range(len(embed)):
        get_res(f"{names[i]}", embed[i], solver=elem)
    print("--------------------------------")

MLPClassifier adam:
    BoW:     0.91
    TFIDF:   0.908
    LSI:     0.934
    LDA:     0.89
--------------------------------
MLPClassifier sgd:
    BoW:     0.923
    TFIDF:   0.925
    LSI:     0.923
    LDA:     0.888
--------------------------------
MLPClassifier lbfgs:
    BoW:     0.886
    TFIDF:   0.916
    LSI:     0.909
    LDA:     0.852
--------------------------------


In [45]:
vec_learning_rate = ["constant", "invscaling", "adaptive"]

for elem in vec_learning_rate:
    print(f"MLPClassifier {elem}:")
    for i in range(len(embed)):
        get_res(f"{names[i]}", embed[i], learning_rate=elem, solver="sgd", max_iter=200)
    print("--------------------------------")

MLPClassifier constant:
    BoW:     0.93
    TFIDF:   0.883
    LSI:     0.834
    LDA:     0.87
--------------------------------
MLPClassifier invscaling:
    BoW:     0.303
    TFIDF:   0.236
    LSI:     0.231
    LDA:     0.231
--------------------------------
MLPClassifier adaptive:
    BoW:     0.93
    TFIDF:   0.883
    LSI:     0.834
    LDA:     0.87
--------------------------------
