In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

lemmatizer = WordNetLemmatizer()
newsgroups_train = fetch_20newsgroups(subset='all',
                                      categories=['alt.atheism',
                                                  # 'comp.graphics',
                                                  # 'comp.os.ms-windows.misc',
                                                  'comp.sys.ibm.pc.hardware',
                                                  # 'comp.sys.mac.hardware',
                                                  # 'comp.windows.x',
                                                  'misc.forsale',
                                                  'rec.autos',
                                                  # 'rec.motorcycles',
                                                  'rec.sport.baseball'],
                                      remove=("header",))

In [2]:
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/", "^"
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#", "="]
stop_words = stopwords.words('english') + special_sym


def delete_stopword_and_lemmatize(listw):
    res = []
    for word in listw:
        word = word.lower()
        if word not in stop_words and re.fullmatch(num_reg_exp, word) is None and not any(sym in word for sym in special_sym):
            res += [lemmatizer.lemmatize(word)]
    return res

In [3]:
def split_data(x):
    return train_test_split(x, newsgroups_train.target, test_size=0.2, random_state=42)

In [4]:
tokenize_data = [nltk.word_tokenize(newsgroups_train.data[i])
                 for i in range(len(newsgroups_train.data))]

In [5]:
tokenize_data = [delete_stopword_and_lemmatize(tokenize_data[i])
                 for i in range(len(tokenize_data))]

In [6]:
dictn = corpora.Dictionary(tokenize_data)
corpus = [dictn.doc2bow(doc) for doc in tokenize_data]

model = TfidfModel(corpus)
tfidf_corpus = [model[doc] for doc in corpus]
lda_model = models.LdaModel(corpus=tfidf_corpus, num_topics=100, id2word=dictn, passes=15)

In [7]:
lda_topics = lda_model.print_topics(num_words=10)
for topic in lda_topics:
    print(topic)

(96, '0.025*"callison" + 0.013*"james" + 0.012*"v6" + 0.011*"sc" + 0.011*"oklahoma" + 0.008*"hell" + 0.008*"gajarsky" + 0.008*"disc" + 0.007*"norman" + 0.007*"sho"')
(62, '0.019*"probe" + 0.012*"gt" + 0.006*"horn" + 0.005*"chuck" + 0.003*"ford" + 0.000*"suspension" + 0.000*"dealer" + 0.000*"definitely" + 0.000*"back" + 0.000*"kesler"')
(18, '0.019*"sony" + 0.002*"optical" + 0.000*"cd" + 0.000*"flopticals" + 0.000*"md" + 0.000*"drive" + 0.000*"floppy" + 0.000*"mb" + 0.000*"product" + 0.000*"21mb"')
(9, '0.000*"ascertained" + 0.000*"auctioned" + 0.000*"muhammad" + 0.000*"regulating" + 0.000*"breadth" + 0.000*"hindsight" + 0.000*"preconceived" + 0.000*"interviewed" + 0.000*"dinged" + 0.000*"dink"')
(38, '0.016*"boot" + 0.006*"processor" + 0.006*"pentium" + 0.006*"diskette" + 0.005*"linux" + 0.002*"gryphon" + 0.001*"president" + 0.001*"technology" + 0.001*"demo" + 0.000*"intel"')
(31, '0.016*"rice" + 0.014*"darice" + 0.014*"fred" + 0.007*"chevy" + 0.005*"leave" + 0.004*"marriage" + 0.004*"

In [43]:
dataset = []
for i in range(len(tfidf_corpus)):
    dataset += [[val[1] if val[1] > 0 else 0 for val in lda_model.get_document_topics(tfidf_corpus[i], minimum_probability=0.0)]]
print(dataset[10])

[0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.1002154, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.07837382, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.05689781, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884, 0.0017468884

In [46]:
x_train, x_test, y_train, y_test = split_data(dataset)
clf = DecisionTreeClassifier(random_state=42, max_depth=10)
clf.fit(x_train, y_train)
acc_test = f1_score(y_true=y_test, y_pred=clf.predict(x_test), average="micro")
print(round(acc_test, 3))

0.537
