In [83]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = word.lower()
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None and not any(sym in word for sym in special_sym):
            res += [lemmatizer.lemmatize(word)]
    return res

In [3]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [87]:
def get_res(typename, dataset):
    x_train, x_test, y_train, y_test = split_data(dataset)
    print(typename)

    clf = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Random Forest:     {round(acc_test, 3)}")

    clf = GradientBoostingClassifier(random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Gradient Boosting: {round(acc_test, 3)}")

    clf = AdaBoostClassifier(random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Ada Boost:         {round(acc_test, 3)}")

In [4]:
tokenize_data = [nltk.word_tokenize(newsgroups_train.data[i])
                 for i in range(len(newsgroups_train.data))]

In [5]:
tokenize_data = [delete_stopword_and_lemmatize(tokenize_data[i])
                 for i in range(len(tokenize_data))]

In [89]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)
corpus = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(corpus)
tfidf_corpus = [model[doc] for doc in corpus]

In [92]:
lda_model = models.LdaModel(corpus=corpus, num_topics=10, id2word=dictn, passes=10)
dataset_lda = []
for i in range(len(tfidf_corpus)):
    dataset_lda += [[val[1] for val in lda_model.get_document_topics(tfidf_corpus[i], minimum_probability=0.0)]]
print(dataset_lda[10])

[0.01722332, 0.017223189, 0.017223632, 0.8449727, 0.017221857, 0.017222995, 0.01722346, 0.017223883, 0.017242787, 0.017222123]


In [94]:
lsi_model = models.LsiModel(corpus=tfidf_corpus, num_topics=20, id2word=dictn)
dataset_lsi = []
for i in range(len(tfidf_corpus)):
    dataset_lsi += [[val[1] for val in lsi_model[tfidf_corpus[i]]]]
print(dataset_lsi[10])

[0.18258500554530416, -0.1626441078055576, -0.2989941904448959, 0.02267313331031795, 0.14564292892960867, -0.03224879738798151, 0.2619429494278902, 0.07529082087413883, 0.15428767187113832, -0.005328632934410347, 0.027066969908637717, 0.11543915515531687, 0.11872725307238176, -0.09935099997329994, -0.040898144702486254, -0.09061539463671245, -0.06694165086890495, 0.03795901031733001, -0.04709598245349276, 0.1508209879416794]


In [90]:
dataset_tfidf = []
len_corpus = len(corpus)
for i in range(len(tfidf_corpus)):
    doc = [0 for j in range(len(dictn))]
    for val in tfidf_corpus[i]:
        doc[val[0]] = val[1]

    dataset_tfidf += [doc]
# print(dataset_tfidf[1])

In [95]:
get_res("LDA:", dataset_lda)
print("--------------------------------")
get_res("TF-IDF:", dataset_tfidf)
print("--------------------------------")
get_res("LSI:", dataset_lsi)

LDA:
Random Forest:     0.872
Gradient Boosting: 0.862
Ada Boost:         0.841
--------------------------------
TF-IDF:
Random Forest:     0.889
Gradient Boosting: 0.86
Ada Boost:         0.766
--------------------------------
LSI:
Random Forest:     0.924
Gradient Boosting: 0.91
Ada Boost:         0.888
