In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

# Single WordNet lemmatizer instance; it is reused for every token further down.
lemmatizer = WordNetLemmatizer()

# Load five of the twenty newsgroups.  Left out on purpose for now:
# comp.graphics, comp.os.ms-windows.misc, comp.sys.mac.hardware,
# comp.windows.x and rec.motorcycles.
# NOTE: despite the "_train" suffix, subset='all' requests the complete
# corpus; the train/test split is made later by split_data().
newsgroups_train = fetch_20newsgroups(
    subset='all',
    categories=[
        'alt.atheism',
        'comp.sys.ibm.pc.hardware',
        'misc.forsale',
        'rec.autos',
        'rec.sport.baseball',
    ],
)

In [16]:
# Regex for a numeric literal: optional sign, then either digits with an
# optional fractional part or a leading-dot fraction, then an optional
# exponent.  delete_stopword() applies it with re.fullmatch, so only tokens
# that are *entirely* a number are dropped.
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'
# Punctuation fragments.  delete_stopword() drops any token that *contains*
# one of these as a substring, not only tokens equal to one of them.
# "``" and "''" are presumably the quote tokens emitted by
# nltk.word_tokenize -- TODO confirm.
special_sym = ["(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/",
               "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#"]
# NLTK's English stop-word list extended with the symbols above.  This is a
# plain list, so every `in` test against it is a linear scan.
stop_words = stopwords.words('english') + special_sym


def delete_stopword(listw):
    """Lower-case the tokens of one document and drop the uninformative ones.

    A token is discarded when, after lower-casing, it
      * is listed in the module-level ``stop_words``,
      * is entirely a number according to ``num_reg_exp``, or
      * contains any entry of ``special_sym`` as a substring.

    Args:
        listw: Iterable of string tokens.

    Returns:
        A new list with the surviving tokens, lower-cased, in input order.
    """
    # ``stop_words`` is a list; build a set once per call so the per-token
    # membership test is O(1) instead of a scan over the whole list.
    blocked = set(stop_words)

    kept = []
    for token in listw:
        token = token.lower()
        if token in blocked:
            continue
        if re.fullmatch(num_reg_exp, token) is not None:
            continue
        # Substring test: also rejects e-mail addresses, hyphenated words,
        # contractions etc. that merely contain one of the symbols.
        if any(sym in token for sym in special_sym):
            continue
        kept.append(token)
    return kept

In [24]:
def split_data(x, y=None, test_size=0.2, random_state=42):
    """Split feature rows and their labels into a train and a test part.

    Args:
        x: Feature rows, one per document.
        y: Labels aligned with ``x``.  When omitted, the labels of the loaded
            corpus (``newsgroups_train.target``) are used.
        test_size: Forwarded to ``train_test_split``; defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split``; defaults to 42
            so repeated runs produce the same split.

    Returns:
        The result of ``train_test_split(x, y, ...)``; callers in this
        notebook unpack it as ``x_train, x_test, y_train, y_test``.
    """
    if y is None:
        y = newsgroups_train.target
    return train_test_split(x, y, test_size=test_size, random_state=random_state)

In [33]:
# Tokenise every raw post with NLTK: one token list per document, in the same
# order as newsgroups_train.data.
tokenize_data = [nltk.word_tokenize(raw_post) for raw_post in newsgroups_train.data]

In [34]:
# Lower-case each document's tokens and filter out stop words, numbers and
# tokens containing punctuation (see delete_stopword).
tokenize_data = list(map(delete_stopword, tokenize_data))

In [35]:
# Lemmatise every remaining token with the shared WordNet lemmatizer
# (default part of speech).
tokenize_data = [[lemmatizer.lemmatize(token) for token in doc_tokens]
                 for doc_tokens in tokenize_data]

In [36]:
# Build the token -> id dictionary, then encode each document as a
# bag-of-words vector.
dictn = corpora.Dictionary(tokenize_data)
corpus = list(map(dictn.doc2bow, tokenize_data))

# Re-weight the bag-of-words vectors with TF-IDF ...
model = TfidfModel(corpus)
tfidf_corpus = [model[bow] for bow in corpus]

# ... and fit a 20-topic LSI model on the weighted vectors.
lsi_model = models.LsiModel(
    corpus=tfidf_corpus,
    id2word=dictn,
    num_topics=20,
)

In [37]:
# Print 5 of the LSI topics, each summarised by 10 weighted words.
lsi_topics = lsi_model.print_topics(num_topics=5, num_words=10)
for topic in lsi_topics:
    print(topic)

(0, '0.149*"drive" + 0.126*"car" + 0.107*"would" + 0.107*"game" + 0.105*"scsi" + 0.099*"one" + 0.089*"year" + 0.087*"think" + 0.086*"ide" + 0.086*"god"')
(1, '0.362*"drive" + 0.355*"scsi" + 0.282*"ide" + 0.182*"controller" + 0.141*"card" + 0.140*"disk" + 0.126*"bus" + -0.115*"god" + 0.110*"hard" + 0.091*"pc"')
(2, '-0.239*"game" + 0.224*"god" + 0.176*"keith" + -0.172*"team" + 0.159*"atheist" + 0.154*"livesey" + 0.139*"moral" + 0.133*"morality" + -0.120*"player" + 0.111*"religion"')
(3, '-0.372*"car" + 0.259*"scsi" + 0.189*"ide" + 0.167*"game" + 0.152*"team" + -0.123*"engine" + 0.123*"drive" + -0.105*"sale" + -0.101*"price" + 0.098*"controller"')
(4, '-0.304*"car" + -0.285*"scsi" + -0.195*"ide" + 0.188*"card" + 0.179*"sale" + 0.160*"modem" + 0.154*"monitor" + -0.144*"drive" + 0.124*"port" + -0.115*"engine"')


In [38]:
# Turn every document into a fixed-width row of LSI topic weights.
#
# lsi_model[doc] yields a *sparse* list of (topic_id, weight) pairs: gensim
# omits topics whose weight is numerically zero, and a document with no
# in-vocabulary tokens yields an empty list.  Reading the weights purely by
# position could therefore produce rows of different lengths or put a weight
# into the wrong column, so each weight is written to the column named by its
# own topic id and absent topics stay at 0.0.
n_topics = lsi_model.num_topics
dataset = []
for doc in tfidf_corpus:
    row = [0.0] * n_topics
    for topic_id, weight in lsi_model[doc]:
        row[topic_id] = weight
    dataset.append(row)
print(dataset[0])

[0.15143992853693206, 0.11363973659065095, -0.008078373834524312, -0.03973219747617656, 0.09728297231650411, -0.005544668444605429, -0.07259687093127483, 0.0020017166154886966, -0.014715945161779986, -0.004783723507454937, 0.0374402021042733, 0.005652878179057426, -0.003220218812507732, 0.01623850501452431, -0.004758589204285226, -0.030105179190181967, 0.009188308712651172, -0.01024088570539909, 0.0049509812510017065, -0.03728570470725917]


In [39]:
# Hold out part of the documents (split_data), fit a depth-limited decision
# tree on the LSI features and score it on the held-out part.
x_train, x_test, y_train, y_test = split_data(dataset)
clf = DecisionTreeClassifier(max_depth=20, random_state=42).fit(x_train, y_train)
# Micro-averaged F1; for single-label multiclass data this coincides with
# accuracy, which presumably explains the variable name.
acc_test = f1_score(y_test, clf.predict(x_test), average="micro")
print(round(acc_test, 3))

0.898


In [142]:
# Find the document with the largest weight on one LSI topic and show it.
#
# lsi_model[doc] is a sparse list of (topic_id, weight) pairs, so the entry
# at list position 4 is not guaranteed to be topic 4 (and may not exist at
# all).  Look the weight up by topic id and treat an absent topic as 0.0.
topic_id = 4
idx_doc = 0
max_val = float("-inf")  # lower than any real weight, unlike a fixed sentinel
for i, doc in enumerate(tfidf_corpus):
    value = dict(lsi_model[doc]).get(topic_id, 0.0)
    if value > max_val:
        max_val = value
        idx_doc = i

print(idx_doc, max_val)
print(newsgroups_train.data[idx_doc])

3996 0.20318114629433662
From: ralf@iqsc.COM (Ralf)
Subject: Items For Sale
Organization: IQ Software Corp.
Lines: 24

  ITEMS FOR SALE - PRICE LISTED OR BEST OFFER!!!!!!!

KFC SVGA Monitor 1024X768 .28DP Non-interlaced 14" 
Screen, still under warranty! (Brand New)          $ 290.00

1200 Baud Compuadd Modem Box/docs/software         $  20.00

CGA Monitor with Cga/Parallel Card                 $  30.00

SCO UNIX V3.2.2 Unlimited User OS, has the base
and extended Utilities, and UUCP                   $ 150.00

Turbo C/Turbo C++ The complete reference book.     $  15.00

Serial I/O Card  1 serial Port                     $  10.00

Joystick, three button                             $  10.00

IDE Controller with 2 Serial, 1 parallel and one
Game port  2 available.                            $  10.00Each

(210)545-4741, ask for Ralf



