In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = word.lower()
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None and not any(sym in word for sym in special_sym):
            res += [lemmatizer.lemmatize(word)]
    return res

In [3]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [18]:
def get_res(typename, dataset):
    x_train, x_test, y_train, y_test = split_data(dataset)
    print(typename)

    clf = RandomForestClassifier(random_state=42, n_estimators=20, max_depth=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Random Forest:     {round(acc_test, 3)}")

    clf = GradientBoostingClassifier(random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Gradient Boosting: {round(acc_test, 3)}")

    clf = AdaBoostClassifier(algorithm='SAMME', random_state=42, n_estimators=20)
    clf.fit(x_train, y_train)
    acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
    print(f"Ada Boost:         {round(acc_test, 3)}")

In [5]:
tokenize_data = [nltk.word_tokenize(newsgroups_train.data[i])
                 for i in range(len(newsgroups_train.data))]

In [6]:
tokenize_data = [delete_stopword_and_lemmatize(tokenize_data[i])
                 for i in range(len(tokenize_data))]

In [28]:
dictn = corpora.Dictionary(tokenize_data)
dictn.filter_extremes(keep_n=1000)
corpus = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(corpus)
tfidf_corpus = [model[doc] for doc in corpus]

In [29]:
lda_model = models.LdaModel(corpus=corpus, num_topics=10, id2word=dictn, passes=10)
dataset_lda = []
for i in range(len(tfidf_corpus)):
    dataset_lda += [[val[1] for val in lda_model.get_document_topics(tfidf_corpus[i], minimum_probability=0.0)]]
print(dataset_lda[10])

[0.84501094, 0.017220367, 0.017221307, 0.017228091, 0.017219014, 0.017220123, 0.01722001, 0.017220618, 0.017219543, 0.017220045]


In [30]:
lsi_model = models.LsiModel(corpus=tfidf_corpus, num_topics=20, id2word=dictn)
dataset_lsi = []
for i in range(len(tfidf_corpus)):
    dataset_lsi += [[val[1] for val in lsi_model[tfidf_corpus[i]]]]
print(dataset_lsi[10])

[0.18259740416324924, -0.16277818225677726, 0.299710423266668, -0.02248283168371573, 0.14680910113075696, 0.03212949840495036, 0.2624441683535463, 0.06985811031617419, -0.15731505811568872, 0.017216645719953568, -0.018507433469920422, -0.1266067954797261, -0.11536093945849427, 0.10548113612751003, 0.04817964610728727, -0.1000407152544677, -0.07744382514678375, -0.04855047541183852, 0.04724870766508907, 0.1378512395467678]


In [31]:
dataset_tfidf = []
len_dict = len(dictn)
for i in range(len(tfidf_corpus)):
    doc = [0 for j in range(len_dict)]
    for val in tfidf_corpus[i]:
        doc[val[0]] = val[1]

    dataset_tfidf += [doc]
# print(dataset_tfidf[1])

In [32]:
get_res("LDA:", dataset_lda)
print("--------------------------------")
get_res("TF-IDF:", dataset_tfidf)
print("--------------------------------")
get_res("LSI:", dataset_lsi)

LDA:
Random Forest:     0.864
Gradient Boosting: 0.882
Ada Boost:         0.847
--------------------------------
TF-IDF:
Random Forest:     0.889
Gradient Boosting: 0.86
Ada Boost:         0.709
--------------------------------
LSI:
Random Forest:     0.932
Gradient Boosting: 0.915
Ada Boost:         0.886
