In [126]:
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora, models
from gensim.models import TfidfModel
import re

# WordNet-based lemmatizer, applied to the tokens in a later cell.
lemmatizer = WordNetLemmatizer()

# Load both the train and test splits (subset='all') of 20 Newsgroups,
# restricted to five topically distinct categories.
newsgroups_train = fetch_20newsgroups(
    subset='all',
    categories=[
        'alt.atheism',
        'comp.sys.ibm.pc.hardware',
        'misc.forsale',
        'rec.autos',
        'rec.sport.baseball',
    ],
)

In [89]:
# Matches a complete numeric literal: optional sign, integer or decimal
# digits, optional exponent (e.g. "42", "-3.5", ".5", "1e-3").
num_reg_exp = r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?'

# Punctuation marks and tokenizer artefacts that should never survive cleaning.
special_sym = [
    "(", ")", ":", "@", "?", ",", "|", ">", "<", "]", "\'", "{", "/",
    "[", ".", "``", "\'\'", "--", "!", "-", "*", "..", "$", "}", "#",
]

# NLTK's English stop-word list extended with the symbols above.
stop_words = [*stopwords.words('english'), *special_sym]


def delete_stopword(listw):
    """Lower-case tokens and drop stop words, numbers and punctuation.

    A token is kept only if, after lower-casing, it

    * is not listed in ``stop_words``,
    * is not entirely a numeric literal matching ``num_reg_exp``, and
    * contains none of the substrings in ``special_sym``.

    Args:
        listw: iterable of string tokens.

    Returns:
        list: the surviving tokens, lower-cased, in their original order.
    """
    # Build the lookup helpers once per call: set membership is O(1), whereas
    # scanning the ``stop_words`` list for every single token is linear.
    stop_set = set(stop_words)
    is_number = re.compile(num_reg_exp).fullmatch

    res = []
    for word in listw:
        word = word.lower()
        if word in stop_set or is_number(word) is not None:
            continue
        # Substring test: also rejects tokens such as "n't" or "e-mail".
        if any(sym in word for sym in special_sym):
            continue
        res.append(word)
    return res

In [128]:
# Split every raw post into word and punctuation tokens with NLTK.
tokenize_data = [nltk.word_tokenize(text) for text in newsgroups_train.data]

In [129]:
# Lower-case each document's tokens and strip stop words, numbers and symbols.
tokenize_data = list(map(delete_stopword, tokenize_data))

In [130]:
# Reduce every remaining token to its WordNet lemma, document by document.
tokenize_data = [[lemmatizer.lemmatize(token) for token in doc]
                 for doc in tokenize_data]

In [131]:
# Map every distinct token to an integer id, then encode each document as a
# bag-of-words vector.
dictn = corpora.Dictionary(tokenize_data)
corpus = list(map(dictn.doc2bow, tokenize_data))

# Re-weight the bag-of-words vectors with TF-IDF, then fit a 5-topic LSI
# model on the weighted corpus.
model = TfidfModel(corpus)
tfidf_corpus = [model[bow] for bow in corpus]
lsi_model = models.LsiModel(tfidf_corpus, id2word=dictn, num_topics=5)

In [132]:
# Print the ten highest-weighted words of each of the five LSI topics.
lsi_topics = lsi_model.print_topics(num_topics=5, num_words=10)
for topic in lsi_topics:
    print(topic)

(0, '0.149*"drive" + 0.126*"car" + 0.107*"would" + 0.107*"game" + 0.105*"scsi" + 0.098*"one" + 0.089*"year" + 0.087*"think" + 0.086*"ide" + 0.086*"get"')
(1, '-0.363*"drive" + -0.354*"scsi" + -0.282*"ide" + -0.182*"controller" + -0.140*"disk" + -0.140*"card" + -0.127*"bus" + 0.115*"god" + -0.110*"hard" + -0.092*"pc"')
(2, '0.237*"game" + -0.223*"god" + -0.175*"keith" + 0.173*"team" + -0.160*"atheist" + -0.154*"livesey" + -0.139*"moral" + -0.134*"morality" + 0.121*"player" + -0.111*"religion"')
(3, '-0.370*"car" + 0.259*"scsi" + 0.193*"ide" + 0.169*"game" + 0.151*"team" + -0.122*"engine" + 0.122*"drive" + -0.103*"sale" + -0.100*"price" + 0.098*"controller"')
(4, '-0.308*"car" + -0.287*"scsi" + -0.189*"ide" + 0.182*"card" + 0.180*"sale" + 0.170*"modem" + 0.159*"monitor" + -0.149*"drive" + 0.125*"port" + -0.114*"engine"')


In [133]:
# Show the LSI topic-space projection of the first ten documents.
for doc_idx in range(10):
    topic_vector = lsi_model[tfidf_corpus[doc_idx]]
    print(f"Doc Index: {doc_idx}, Topic Vector: {topic_vector}")

Doc Index: 0, Topic Vector: [(0, 0.15151735935051247), (1, -0.11383056800484564), (2, 0.005119569201244371), (3, -0.039857019829875676), (4, 0.08761025527287739)]
Doc Index: 1, Topic Vector: [(0, 0.09930162345411232), (1, -0.023165984274095896), (2, 0.010791567144904816), (3, -0.03464169589832715), (4, 0.03584506590689165)]
Doc Index: 2, Topic Vector: [(0, 0.0953593973574975), (1, -0.07449306503570832), (2, 0.01665577923638875), (3, -0.0535742003332249), (4, 0.12819760869894087)]
Doc Index: 3, Topic Vector: [(0, 0.1367595326312493), (1, 0.02655179285240384), (2, 0.013351244956937252), (3, -0.21846570308518384), (4, -0.16438919222017676)]
Doc Index: 4, Topic Vector: [(0, 0.06027885431560701), (1, -0.009641001020987897), (2, 0.025999668450678984), (3, -0.047063559756884095), (4, 0.06053267689842014)]
Doc Index: 5, Topic Vector: [(0, 0.12518648568456606), (1, -0.1223388750426365), (2, -0.004568951829133662), (3, 0.0018349773645327501), (4, -0.027324303295748455)]
Doc Index: 6, Topic Vecto

In [142]:
# Find the document that loads most strongly on one LSI topic, then print its
# index, its weight on that topic, and its raw text.
topic_id = 4  # id of the topic to inspect (the last of the five)

idx_doc = 0
max_val = float("-inf")  # lower than any real weight, unlike a fixed -100
for i, doc in enumerate(tfidf_corpus):
    # The model returns a sparse list of (topic_id, weight) pairs. Gensim
    # leaves out near-zero weights, and an empty document yields [], so the
    # weight is looked up by topic id rather than by list position; a missing
    # entry counts as 0.0.
    value = dict(lsi_model[doc]).get(topic_id, 0.0)
    if value > max_val:
        max_val = value
        idx_doc = i

print(idx_doc, max_val)
print(newsgroups_train.data[idx_doc])

3996 0.20318114629433662
From: ralf@iqsc.COM (Ralf)
Subject: Items For Sale
Organization: IQ Software Corp.
Lines: 24

  ITEMS FOR SALE - PRICE LISTED OR BEST OFFER!!!!!!!

KFC SVGA Monitor 1024X768 .28DP Non-interlaced 14" 
Screen, still under warranty! (Brand New)          $ 290.00

1200 Baud Compuadd Modem Box/docs/software         $  20.00

CGA Monitor with Cga/Parallel Card                 $  30.00

SCO UNIX V3.2.2 Unlimited User OS, has the base
and extended Utilities, and UUCP                   $ 150.00

Turbo C/Turbo C++ The complete reference book.     $  15.00

Serial I/O Card  1 serial Port                     $  10.00

Joystick, three button                             $  10.00

IDE Controller with 2 Serial, 1 parallel and one
Game port  2 available.                            $  10.00Each

(210)545-4741, ask for Ralf



