In [1]:
import re
import math
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from nltk.corpus import stopwords
from collections import Counter


## Создание модели

In [2]:
# NLTK Stop words
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [3]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

In [4]:
# Convert to list
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]


In [5]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data_words = list(sent_to_words(data))

In [8]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)



In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [11]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [12]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

Mallet упорно не работал ни в колабе, ни локально.

In [13]:
from pathlib import Path

In [14]:
filename = Path("mallet-2.0.8/mallet-2.0.8/bin/mallet")

In [15]:
mallet_path = str(filename) # update this path
import os
from gensim.models.wrappers import LdaMallet

#os.environ['MALLET_HOME'] = 'C:\\Users\\User\\mallet-2.0.8'
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


CalledProcessError: ignored

## Функция, считающая оптимальное количество топиков по coherence 

In [16]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [17]:
%%time
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=8)


CPU times: user 21min 32s, sys: 4min 52s, total: 26min 25s
Wall time: 20min 49s


In [18]:
# Print the coherence scores
limit=40; start=2; step=8;
x = range(start, limit, step)
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

Num Topics = 2  has Coherence Value of 0.4204
Num Topics = 10  has Coherence Value of 0.4973
Num Topics = 18  has Coherence Value of 0.4697
Num Topics = 26  has Coherence Value of 0.4281
Num Topics = 34  has Coherence Value of 0.4211


Получилось, что оптимально 10 топиков.

## Модель с "оптимальным" количеством топиков

In [19]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [20]:
pprint(lda_model.print_topics())

[(0,
  '0.028*"team" + 0.027*"year" + 0.026*"game" + 0.020*"play" + 0.018*"win" + '
  '0.014*"player" + 0.010*"run" + 0.010*"last" + 0.009*"good" + 0.009*"hit"'),
 (1,
  '0.022*"go" + 0.015*"time" + 0.013*"day" + 0.012*"come" + 0.011*"take" + '
  '0.011*"back" + 0.011*"get" + 0.009*"say" + 0.008*"see" + 0.008*"first"'),
 (2,
  '0.014*"space" + 0.008*"cost" + 0.008*"year" + 0.007*"high" + '
  '0.007*"research" + 0.007*"low" + 0.006*"item" + 0.006*"also" + 0.006*"test" '
  '+ 0.005*"large"'),
 (3,
  '0.033*"car" + 0.021*"drive" + 0.013*"bike" + 0.011*"power" + 0.011*"wire" + '
  '0.011*"slave" + 0.010*"reality" + 0.009*"speed" + 0.009*"engine" + '
  '0.009*"light"'),
 (4,
  '0.090*"ax" + 0.077*"max" + 0.018*"di_di" + 0.015*"tumor" + '
  '0.012*"homosexual" + 0.011*"gay" + 0.009*"taste" + 0.008*"liar" + '
  '0.007*"marry" + 0.006*"homosexuality"'),
 (5,
  '0.017*"government" + 0.013*"people" + 0.012*"gun" + 0.011*"state" + '
  '0.010*"kill" + 0.008*"year" + 0.007*"public" + 0.007*"attack"

## Выделение групп текстов по топикам

In [22]:
x=lda_model.show_topics(num_topics=10, num_words=15, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

In [23]:
def make_groups_by_topic(texts, topics_words):
    groups_by_topic = {}
    text_num = 0
    for text in texts:
        text_num += 1
        topics = {}
  
        for topic in topics_words:
            score = 0
            groups_by_topic.setdefault(topic[0], [])
            for word in text:
                if word in topic[1]:
                    score += 2
    
            topics.update({topic[0] : score})

        for topic, score in topics.items():
            if score == max(topics.values()):
                best_topic = topic

        groups_by_topic[best_topic].append(text)
    return groups_by_topic

In [24]:
groups_by_topic = make_groups_by_topic(texts, topics_words)

## Выделение уникальных словарей по топику, подсчет tf_idf

In [25]:
for topic,group in groups_by_topic.items():
  unique_words = set()
  for text in group:
    uni = unique_words.union(set(text))
  group.insert(0, uni)

In [26]:
def computeIDF(documents,unique_words):

    N = len(documents)
    
    idfDict = dict.fromkeys(unique_words, 0)
    for document in documents:
        for word in document:
          if word in idfDict.keys():
              idfDict[word] += 1
          else:
            idfDict.setdefault(word, 1)
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [27]:
def computeTF(unique_words, text):
    tfDict = dict.fromkeys(unique_words,0)
    text_words = Counter(text)
    text_count = len(text)
    for word, count in text_words.items():
        if word not in tfDict.keys():
          tfDict.setdefault(word,1)
        tfDict[word] = count / float(text_count)

    return tfDict

In [28]:
def computeTFIDF(tfs, idfs):
    tfidf = {}
    for word, val in tfs.items():
        tfidf[word] = val * idfs[word]
    return tfidf

## Создание датафрейма с номером текста, номером топика, 5-ю словами с самыми высокими tf_idf

In [38]:
def tf_idf_and_top5(groups_by_topic):
    results = []
    text_num = 0
    for topic, group in groups_by_topic.items(): 
        word_set = group[0]
        idf_dict = computeIDF(group, word_set)
        for text in group[1:]:
            text_num += 1
            tfs = computeTF(word_set, text)
            tf_idf = computeTFIDF(tfs, idf_dict)
            top5 = list(dict(Counter(tf_idf).most_common(5)))
            results.append({'Text': text_num,'Topic': topic, '5 most tf_idf' : top5})
    return results

In [39]:
results = tf_idf_and_top5(groups_by_topic)
df = pd.DataFrame(results)
df.set_index('Text')

Unnamed: 0_level_0,Topic,5 most tf_idf
Text,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,"[bullpen, fetter, yesterday, crush, mockery]"
2,0,"[cin, snd, pitcher, atl, flo]"
3,0,"[perhaps, rookie, catcher, veteran, old]"
4,0,"[ranger, kravchuk, nagging, sather, scrooge]"
5,0,"[abbot, clobber, youngster, severalrs, herot]"
...,...,...
11310,9,"[leo, talent, suncd, programme, tool]"
11311,9,"[fake, jammer, duidelijke, mededele, deze]"
11312,9,"[oriole, night, castelletto, promotion, tor]"
11313,9,"[enviroleague, youth, adult, team, environment..."
