#### https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
#### https://blog.naver.com/mage7th/221394123886
참조

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
from gensim import corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
stop_words = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt

import logging
# Layout of every log line: timestamp, severity, then the message.
# The field name must be `asctime`; a misspelled field makes every emitted
# record fail to format.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
# Only records at ERROR level and above are emitted.
logging.basicConfig(format=LOG_FORMAT, level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Import Data

# Load the newsgroups dataset; fetched over HTTPS rather than plain HTTP so
# the payload is not transferred unencrypted.
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

In [3]:
print(df.columns)
print("/////////////////")
print(df.target_names.unique())
df.head()

Index(['content', 'target', 'target_names'], dtype='object')
/////////////////
['rec.autos' 'comp.sys.mac.hardware' 'rec.motorcycles' 'misc.forsale'
 'comp.os.ms-windows.misc' 'alt.atheism' 'comp.graphics'
 'rec.sport.baseball' 'rec.sport.hockey' 'sci.electronics' 'sci.space'
 'talk.politics.misc' 'sci.med' 'talk.politics.mideast'
 'soc.religion.christian' 'comp.windows.x' 'comp.sys.ibm.pc.hardware'
 'talk.politics.guns' 'talk.religion.misc' 'sci.crypt']


Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
10,From: irwin@cmptrc.lonestar.org (Irwin Arnstei...,8,rec.motorcycles
100,From: tchen@magnus.acs.ohio-state.edu (Tsung-K...,6,misc.forsale
1000,From: dabl2@nlm.nih.gov (Don A.B. Lindbergh)\n...,2,comp.os.ms-windows.misc


In [4]:
data = df.content.values

In [5]:
# Patterns are raw strings: `\S` / `\s` inside a normal string literal are
# invalid escape sequences (a warning today, scheduled to become an error).
# The pattern values themselves are unchanged.
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]  # drop tokens containing '@'
data = [re.sub(r'\s+', ' ', sent) for sent in data]  # collapse each whitespace run to one space
data = [re.sub(r"'", '', sent) for sent in data]  # remove single quotes

print(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ']


In [6]:
# The result above still looks noisy, even though the
# <user>@<mailaddress> tokens and redundant whitespace have been removed.

# => tokenize with simple_preprocess from Gensim instead
def sent_to_words(sentences):
    """Yield one token list per item of *sentences*.

    Each item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. This is a
    generator: token lists are produced one at a time as the caller iterates.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens
# `return` sends a single value back to its caller,
# whereas `yield` can produce a sequence of values, one at a time.
# Use `yield` when we want to iterate over a sequence
# but don't want to store the entire sequence in memory.

In [7]:
data_words = list(sent_to_words(data))

In [8]:
print(data_words[:2])

[['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp', 'posting', 'host', 'rac', 'wam', 'umd', 'edu', 'organization', 'university', 'of', 'maryland', 'college', 'park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst'], ['from', 'guy', 'kuo', 'subje

### Build bi-gram / tri-gram

In [9]:
# Learn bigram phrases from the tokenized documents, then learn trigrams by
# feeding the bigram-transformed documents through Phrases a second time.
# NOTE(review): presumably a higher `threshold` yields fewer detected
# phrases — confirm against the gensim Phrases documentation.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)



In [10]:
# Wrap the trained Phrases models in Phraser objects; these are the objects
# the later cells index with a token list to obtain the merged tokens.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [11]:
print(trigram_mod[bigram_mod[data_words[0]]])

['from', 'wheres', 'my', 'thing', 'subject', 'what', 'car', 'is', 'this', 'nntp_posting_host', 'rac_wam_umd_edu', 'organization', 'university', 'of', 'maryland_college_park', 'lines', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'saw', 'the', 'other', 'day', 'it', 'was', 'door', 'sports', 'car', 'looked', 'to', 'be', 'from', 'the', 'late', 'early', 'it', 'was', 'called', 'bricklin', 'the', 'doors', 'were', 'really', 'small', 'in', 'addition', 'the', 'front_bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', 'this', 'is', 'all', 'know', 'if', 'anyone', 'can', 'tellme', 'model', 'name', 'engine', 'specs', 'years', 'of', 'production', 'where', 'this', 'car', 'is', 'made', 'history', 'or', 'whatever', 'info', 'you', 'have', 'on', 'this', 'funky', 'looking', 'car', 'please', 'mail', 'thanks', 'il', 'brought', 'to', 'you', 'by', 'your', 'neighborhood', 'lerxst']


### Drop the stopwords

In [28]:
def remove_stopwords(texts):
    """Return *texts* with every stopword removed.

    Each document is converted with ``str()`` and re-tokenized by
    ``simple_preprocess`` before filtering, so a document that is already a
    list of tokens is tokenized again from its string form.

    Args:
        texts: iterable of documents (strings or token lists).

    Returns:
        A list with one list of non-stopword tokens per input document.
    """
    # Build the lookup set once per call: set membership is O(1), whereas
    # testing against the `stop_words` list scans it for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the bigram phraser to every document and return the results as a list."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the bigram phraser, then the trigram phraser, to every document."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


def do_lemmatization(texts, allowed_postags=('VB', 'JJ', 'NN', 'RB')):
    """Keep only tokens with an allowed POS tag and lemmatize them.

    For each document the tokens are joined with spaces, re-tokenized through
    ``sent_to_words``, tagged with ``nltk.pos_tag`` and filtered by tag. The
    surviving words are passed to ``lemmatizer.lemmatize`` (no POS argument
    is supplied to the lemmatizer).

    Args:
        texts: iterable of documents, each an iterable of string tokens.
        allowed_postags: collection of POS tags to keep. Matching is exact;
            prefix matching (VB.*, JJ.*, ...) would need extra work and is
            deliberately skipped in this tutorial.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    texts_out = []

    for sent in texts:
        # Round-trip through sent_to_words so the tokens are normalised the
        # same way as the rest of the pipeline.
        # NOTE(review): this re-applies simple_preprocess, so presumably its
        # token-length limits apply again to merged phrase tokens — confirm.
        tokens = next(sent_to_words([" ".join(sent)]))
        texts_out.append([
            lemmatizer.lemmatize(word)
            for word, tag in nltk.pos_tag(tokens)
            if tag in allowed_postags
        ])

    return texts_out

In [15]:
# Preprocessing pipeline: strip stopwords first, then merge detected phrases.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
# NOTE(review): the trigram variant is computed here, but the lemmatization
# step in this notebook consumes the bigram variant — confirm which is intended.
data_words_trigrams = make_trigrams(data_words_nostops)

In [29]:
data_lemmarized = do_lemmatization(data_words_bigrams)

In [31]:
# data_words_bigrams[:10]
# data_lemmarized

In [138]:
# text = list(sent_to_words(["They refuse to permit us to obtain the refuse permit"]))
# print(text)
# a = nltk.pos_tag(text[0])
# print(a)

# print(len(a))
# # for test in text:
# #     text
# #     print(nltk.pos_tag(test))
        

In [32]:
# Build the dictionary (id2word) from the lemmatized documents, then encode
# every document as a bag of words via the dictionary's doc2bow.
texts = data_lemmarized
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))

In [33]:
print(corpus[:1])

[[(0, 1), (1, 2), (2, 1), (3, 1), (4, 5), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1)]]


In [34]:
id2word[0]

'addition'

In [35]:
# Human-readable view of the first document's bag of words as (term, count)
# pairs. The loop variable is named `token_id` so the builtin `id` is not shadowed.
[[(id2word[token_id], freq) for token_id, freq in cp] for cp in corpus[:1]]

[[('addition', 1),
  ('anyone', 2),
  ('body', 1),
  ('bricklin', 1),
  ('car', 5),
  ('day', 1),
  ('door', 1),
  ('early', 1),
  ('engine', 1),
  ('enlighten', 1),
  ('front_bumper', 1),
  ('funky', 1),
  ('history', 1),
  ('host', 1),
  ('info', 1),
  ('late', 1),
  ('lerxst', 1),
  ('mail', 1),
  ('model', 1),
  ('name', 1),
  ('neighborhood', 1),
  ('organization', 1),
  ('park', 1),
  ('please', 1),
  ('production', 1),
  ('rac_wam', 1),
  ('really', 1),
  ('rest', 1),
  ('saw', 1),
  ('separate', 1),
  ('small', 1),
  ('spec', 1),
  ('subject', 1),
  ('tellme', 1),
  ('thing', 1),
  ('umd_edu', 1),
  ('university', 1)]]

### Build the Topic Model

In [41]:
# Train an LDA topic model on the bag-of-words corpus built above.
# NOTE(review): update_every / chunksize / passes control how training
# iterates over the corpus, and alpha='auto' presumably lets gensim learn the
# document-topic prior — confirm exact semantics in the LdaModel docs.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=20,  # same count as the distinct newsgroup names in the dataset
    random_state=100,  # fixed seed
    update_every=2,
    chunksize=100,
    passes=15,
    alpha='auto',
    per_word_topics=True
)

In [43]:
pprint(lda_model.print_topics())

[(0,
  '0.094*"subject" + 0.091*"organization" + 0.046*"university" + 0.039*"host" '
  '+ 0.035*"article" + 0.019*"anyone" + 0.017*"edu" + 0.016*"reply" + '
  '0.016*"please" + 0.011*"mail"'),
 (1,
  '0.011*"car" + 0.010*"first" + 0.010*"back" + 0.010*"year" + 0.009*"didnt" + '
  '0.009*"good" + 0.008*"game" + 0.008*"last" + 0.008*"day" + 0.007*"hockey"'),
 (2,
  '0.022*"year" + 0.014*"paul" + 0.013*"team" + 0.012*"season" + '
  '0.009*"baseball" + 0.008*"win" + 0.007*"radio" + 0.006*"last" + '
  '0.006*"league" + 0.006*"matthew"'),
 (3,
  '0.166*"max" + 0.015*"bike" + 0.014*"wire" + 0.010*"neutral" + 0.010*"dod" + '
  '0.009*"ground" + 0.009*"ride" + 0.008*"command" + 0.007*"doug" + '
  '0.007*"motor"'),
 (4,
  '0.020*"mark" + 0.013*"air" + 0.009*"circuit" + 0.009*"service" + '
  '0.009*"insurance" + 0.008*"oil" + 0.008*"tm" + 0.007*"hall" + '
  '0.007*"company" + 0.007*"child"'),
 (5,
  '0.025*"armenian" + 0.016*"turkish" + 0.011*"land" + 0.011*"van" + '
  '0.010*"serdar_argic" + 0.0

In [44]:
# NOTE(review): gensim documents log_perplexity as returning a per-word
# likelihood bound (perplexity being 2^(-bound)), so the number printed under
# the "Perplexity" label is that bound, not the perplexity itself — confirm
# against the gensim docs.
print('\nPerplexity: ' , lda_model.log_perplexity(corpus))


Perplexity:  -8.399559850210089


In [46]:
# Score the trained model's topic coherence with the 'c_v' measure, using the
# lemmatized documents and the same dictionary the model was trained with.
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmarized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.5423593957992933


### Visualize topics and keywords

In [47]:
# Prepare the pyLDAvis view of the model; the bare `vis` expression at the end
# makes the notebook display it.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


## LDA에 대한 최적의 토픽수를 찾는 방법은 무엇인가?

##### 토픽수(k)가 다른 여러 LDA 모델을 작성하고 가장 높은 일관성 값을 제공하는 LDA 모델을 선택하는 것.
##### 여러 주제에서 동일한 키워드가 반복되는 경우 'k' 값이 너무 크다는 것일 수 있다.

## 각 문서에서 지배적인 토픽찾기
#### 해당 문서에서 가장 높은 비율로 기여한 토픽 번호를 찾아야 함.

## 각 토픽 별로 가장 대표적인 문서찾기
#### 토픽 키워드만으로는 토픽이 무엇인가 이해할 수 없을 수도 있다. 따라서 토픽을 이해하는 데 도움이 되도록
#### 주어진 토픽이 가장 많이 기여한 문서를 찾으면 토픽을 추론할 수 있을 것이다.

## 문서 전체적인 토픽 분포
#### 토픽의 양과 분포를 이해하여 토픽이 얼마나 넓게 논의 되었는지를 판단함.