### Parse dataset ###

In [1]:
import glob
import os
import re
import string

def get_file_content(fpath, enc='utf-8'):
    file = open(fpath, "r") #, encoding=enc
    content = file.read()
    file.close()
    return content.split('\n')

def spit(fpath, content):
    path = fpath
    file = open(fpath, "w", encoding='utf-8')
    file.write(content)
    file.close()

filenames = []
news_regex = r"^([^\s\r\n]+)\n\d+\n[^\s\r\n]+\n\n(#\w+\s)+\n=+\n.*$"
for filename in glob.iglob('/Users/defake/Documents/LentaDS/lenta_news_wo_author/**/*.txt', recursive=True):
    if os.path.isfile(filename):
        filenames.append(filename)

In [2]:
def parse_file_content(content):
    if content == '' or content == ['']:
            return None
    name = content[0].strip()
    #number = content[1]
    #time = content[2]
    tags = re.findall(r"#(\w+)", content[4].lower())
    text = content[6].lower().strip()
    return {'name': name,
            'tags': tags,
            'text': text}

parsed = [parse_file_content(get_file_content(f)) for f in filenames]
parsed = [x for x in parsed if x != None]

In [28]:
# calculate number of topics
topics = set()
for news in parsed:
    for t in news['tags']:
        if t not in topics:
            topics.add(t)

topics_n = len(topics)
topics_n

120

In [4]:
import nltk
import pymorphy2

stopwords = set(nltk.corpus.stopwords.words('russian'))
morph = pymorphy2.MorphAnalyzer()
tokenized_documents = [[morph.parse(word)[0].normal_form for word in re.findall(r'\w+', news['text']) if word not in stopwords] for news in parsed[:-1000]]
del filenames
del topics
del parsed
tokenized_documents[0][:10]

['сегодня',
 'областной',
 'центр',
 'сахалин',
 'курить',
 'получить',
 'статус',
 'очаг',
 'распространение',
 'холера']

In [5]:
from gensim.corpora import Dictionary

dictionary = Dictionary(tokenized_documents)
corpus = [dictionary.doc2bow(document) for document in tokenized_documents]

del tokenized_documents
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 3),
 (41, 1),
 (42, 1),
 (43, 2),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 1),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 2),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 3),
 (62, 3),
 (63, 2),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 1),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 2),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 2),
 (84, 1),
 (85, 2),
 (86, 3)]

In [6]:
import pickle

# Write the corpus and the dictionary to a file for a case...

with open('./bow_corpus', 'wb') as f:
    pickle.dump(corpus, f)
    
dictionary.save('./news_dict')

In [2]:
import pickle
from gensim.corpora import Dictionary

with open('./bow_corpus', 'rb') as f:
    corpus = pickle.load(f)

dictionary = Dictionary.load('./news_dict')

In [16]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 1),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 1),
 (13, 1),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 1),
 (23, 1),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 1),
 (33, 1),
 (34, 1),
 (35, 1),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 1),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 1),
 (47, 3),
 (48, 1),
 (49, 1),
 (50, 1),
 (51, 3),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 2),
 (65, 2),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 4),
 (70, 3),
 (71, 2),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 1),
 (76, 1),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 2),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 3),
 (92, 1),
 (93, 2),
 (94, 4)]

### Lda ###

In [14]:
def print_keywords(topic_terms):
    print("Keywords for topic:")
    for kwd, p in topic_terms:
        print(dictionary.id2token[int(kwd)])

def get_topic_for_text(model, text):
    doc = [morph.parse(word)[0].normal_form for word in re.findall(r'\w+', text.lower()) if word not in stopwords]
    bow = dictionary.doc2bow(doc)
    topics = model[bow]
    topic_num = max(topics, key=lambda x: x[1])
    return topic_num[0]

In [27]:
from gensim.models.ldamodel import LdaModel
lda = LdaModel(corpus, num_topics=topics_n, chunksize=2000, passes=1)

In [22]:
test_doc = get_file_content('./test_news1.txt')
print(test_doc)
test_doc = ''.join(get_file_content('./test_news1.txt'))
prob_topic = get_topic_for_text(lda, test_doc)
print_keywords(lda.get_topic_terms(prob_topic))

['во вторник верховный суд якутии возобновил процесс по делу об отмене регистрации кандидатом в президенты действующего главы республики михаила николаева, передает агентство "интерфакс" со ссылкой на источник в верховном суде. как сообщалось ранее, процесс был инициирован двумя жителями республики, однако приостанавливался в связи с запросом представителей николаева в конституционный суд рф.']
Keywords for topic:
россия
президент
год
новость
который
риа
владимир
правительство
глава
депутат


In [26]:
from gensim.models import LsiModel
lsi = LsiModel(corpus, num_topics=topics_n, chunksize=2000)

In [25]:
prob_topic = get_topic_for_text(lsi, test_doc)
print_keywords(lda.get_topic_terms(prob_topic))

Keywords for topic:
который
президент
заявить
сообщать
израильский
сша
израиль
палестинский
министр
страна
