In [1]:
%pylab inline

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import simple_preprocess
from gensim.models import TfidfModel
from pathlib import Path
from tempfile import gettempdir
from pyLDAvis.gensim import prepare as lda_prepare
import kaggle
import gensim
import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import warnings

pd.options.display.float_format = '{:,.4f}'.format
nltk.download('wordnet')
kaggle.api.authenticate()
pyLDAvis.enable_notebook()
warnings.simplefilter('ignore')

Populating the interactive namespace from numpy and matplotlib


[nltk_data] Downloading package wordnet to /home/anderson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data 

In [2]:
dest_path = Path(gettempdir()) / 'news-headlines'
data_name = 'therohk/million-headlines'
kaggle.api.dataset_download_files(data_name, path=dest_path, unzip=True)

data = pd.read_csv(dest_path / 'abcnews-date-text.csv')
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Preprocess

## Stemmer Example

Stem은 단어에서 몇개의 글자들을 제거합니다. <br>
따라서 의미없는 단어나 오타등으로 이어질 수 있습니다. 

In [3]:
stemmer = SnowballStemmer('english')

original_words = ['caresses', 'flies', 'dies', 'mules', 'denied', 'died', 'agreed', 'owned',
                  'humbled', 'sized', 'meeting', 'stating', 'siezing', 'itemization', 'sensational',
                  'traditional', 'reference', 'colonizer', 'plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data={'stemmed': singles}, index=original_words).T

Unnamed: 0,caresses,flies,dies,mules,denied,died,agreed,owned,humbled,sized,meeting,stating,siezing,itemization,sensational,traditional,reference,colonizer,plotted
stemmed,caress,fli,die,mule,deni,die,agre,own,humbl,size,meet,state,siez,item,sensat,tradit,refer,colon,plot


## Lemmatize Example

Context를 이해하고, 단어를 의미있는 base 형태로 변경하며, 변경된 형태를 Lemma 라고 함. 

In [4]:
WordNetLemmatizer().lemmatize('found', pos='v')

'find'

## Tokenization

In [5]:
stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

doc_sample = data[data.index == 4310]['headline_text'].values[0]

print('original  :', doc_sample)
print('preprocess:', preprocess(doc_sample))

original  : ratepayers group wants compulsory local govt voting
preprocess: ['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [6]:
docs = data['headline_text'].apply(preprocess)
docs.head()

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

## Bag of Words

In [7]:
def show_dict(data, n=10):
    i = iter(data.iteritems())

    for _ in range(10):
        k, v = next(i)
        print(k, v)

# To Dictionary
dictionary = gensim.corpora.Dictionary(docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

show_dict(dictionary)

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect


In [8]:
# Bag of Words
# [[(index, 횟수), ....], ...]
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
bow_corpus[0]

[(0, 1), (1, 1), (2, 1), (3, 1)]

In [9]:
def display_bow(bow):
    for idx, cnt in bow:
        word = dictionary[idx]
        print(f'{idx:<6}: {word:15} | count:{cnt}')
    
display_bow(bow_corpus[502])

34    : council         | count:1
1216  : approv          | count:1
1217  : farm            | count:1
1218  : poultri         | count:1


## TF-IDF

In [10]:
tfidf = TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

print('corpus_tfidf size:', len(corpus_tfidf))oups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             retu
corpus_tfidf[0]

corpus_tfidf size: 1186018


[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]

# LDA with Bag of Words

## Model

In [11]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
print('num terms :', lda_model.num_terms)
print('num topics:', lda_model.num_topics)

num terms : 14939
num topics: 10


## Feature Importance

In [12]:
def show_lds_feature_importance(lda_model, dictionary, k=10):
    topic_matrix = lda_model.get_topics()

    # argsort with descending order
    feat_impt_indices = (-topic_matrix).argsort(axis=1)[:, :k]
    feat_impt_words = [[dictionary[w] for w in sentence]
                       for sentence in feat_impt_indices]
    feat_impt = np.vstack([topic_matrix[i, idx]
                           for i, idx in enumerate(feat_impt_indices)])

    n_category = len(feat_impt)
    df = pd.concat([pd.DataFrame([feat_impt_words[i], feat_impt[i]], index=['word', 'impt']) for i in range(n_category)],
                   keys=[i for i in range(n_category)]).T

    vis_df = df.style.format('{:.4}') \
               .applymap(lambda x: 'background-color: #e6fcfc', subset=df.columns.get_loc_level('word', level=1)[0]) \
               .bar(subset=df.columns.get_loc_level('impt', level=1)[0], axis=1, width=100, color='#ee1f5f') \
               .set_properties(subset=df.columns.get_loc_level('impt', level=1)[0], width='0px') \
               .set_properties(subset=df.columns.get_loc_level('word', level=1)[0], width='0px')
    display(vis_df)


show_lds_feature_importance(lda_model, dictionary)

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9
Unnamed: 0_level_1,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt
0,help,0.02062,melb,0.0285,trum,0.04225,kill,0.02906,plan,0.01606,year,0.02842,repo,0.02482,cras,0.02314,worl,0.02918,poli,0.03318
1,fede,0.01815,coas,0.02109,aust,0.02114,adel,0.02354,chan,0.01563,news,0.02283,quee,0.02128,dona,0.02279,sout,0.02225,char,0.02737
2,miss,0.01652,bush,0.01955,open,0.01823,peop,0.01717,heal,0.01385,live,0.02032,leav,0.01672,die,0.02004,mark,0.01891,cour,0.02454
3,fami,0.01549,coun,0.0163,tasm,0.01663,atta,0.01603,wate,0.01334,time,0.01535,say,0.01409,roya,0.01539,prot,0.01732,murd,0.02271
4,tasm,0.01474,gold,0.01392,fina,0.01575,inte,0.01493,rura,0.01164,vict,0.01322,guil,0.01372,inve,0.01429,reco,0.01564,face,0.01609
5,busi,0.01448,farm,0.01284,game,0.01166,elec,0.01456,coun,0.01113,fall,0.01284,test,0.01217,hous,0.01271,aust,0.0156,woma,0.0157
6,life,0.01384,mill,0.01267,retu,0.01097,brea,0.0144,indi,0.01112,nati,0.01197,show,0.01206,comm,0.0124,bank,0.01394,alle,0.01566
7,gove,0.01367,drou,0.0123,forc,0.00945,hoba,0.01107,fund,0.01111,aust,0.01141,poli,0.01134,trai,0.01168,aust,0.01295,shoo,0.01566
8,chil,0.01305,scot,0.01219,aust,0.009392,part,0.01095,spea,0.01083,win,0.01135,pres,0.01107,stor,0.01151,west,0.01235,jail,0.01483
9,abus,0.01288,libe,0.01217,beat,0.009269,chri,0.0104,pric,0.01071,anno,0.01032,morr,0.011,road,0.01063,sydn,0.01107,sydn,0.01415


In [13]:
lda_vis = lda_prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.display(lda_vis)

# LDA with TF-IDF

## Model

In [14]:
oups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'),
                             retulda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

## Feature Importance

In [15]:
show_lds_feature_importance(lda_model_tfidf, dictionary)

Unnamed: 0_level_0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9
Unnamed: 0_level_1,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt,word,impt
0,weat,0.008608,gove,0.00937,kill,0.00762,coun,0.01858,elec,0.01667,trum,0.02628,cras,0.01534,char,0.01802,news,0.02109,dona,0.01635
1,mark,0.006392,heal,0.008337,vide,0.007298,hour,0.01409,drum,0.01306,bush,0.01215,die,0.01059,poli,0.0161,inte,0.01584,roya,0.01146
2,wall,0.006347,stor,0.007944,davi,0.007259,worl,0.008047,tues,0.008283,quee,0.00972,hoba,0.00969,murd,0.0157,rura,0.01259,clim,0.01067
3,shar,0.006042,fund,0.007218,prot,0.006296,aust,0.007774,labo,0.008117,vict,0.008719,fiji,0.00698,woma,0.01147,live,0.008945,comm,0.01061
4,aust,0.005757,budg,0.006093,poli,0.006063,expl,0.00618,libe,0.007505,turn,0.008041,viol,0.006743,cour,0.01129,chri,0.008335,spor,0.008572
5,wild,0.005639,coun,0.005529,atta,0.00593,aust,0.006023,part,0.006444,hill,0.007883,truc,0.006416,jail,0.01059,frid,0.007681,marc,0.007484
6,capi,0.005484,scho,0.005311,sept,0.005594,soci,0.005504,juli,0.006328,wedn,0.007757,age,0.006382,alle,0.01015,mond,0.007574,scot,0.007282
7,harv,0.004994,morr,0.005225,jam,0.005514,wome,0.005498,ener,0.006282,stab,0.007537,dome,0.00634,sent,0.008887,mich,0.007244,chan,0.007008
8,econ,0.004956,plan,0.005062,pris,0.005479,open,0.005224,say,0.006044,nort,0.007455,isla,0.006331,guil,0.008771,mark,0.006532,andr,0.00692
9,food,0.004749,cut,0.004914,bomb,0.005433,alan,0.004937,octo,0.005748,floo,0.007037,paci,0.006179,assa,0.008645,leag,0.006478,hist,0.006427


In [16]:
lda_vis_tfidf = lda_prepare(lda_model_tfidf, bow_corpus, dictionary)
pyLDAvis.display(lda_vis_tfidf)

# Inference

## Inference from Training Set

In [17]:
probs = lda_model_tfidf.inference(bow_corpus[:2])[0]
y_preds = np.argmax(probs, axis=1)
y_scores = probs[arange(len(y_preds)), y_preds]

print('y_preds:', y_preds)
print('y_scores:', y_scores)

y_preds: [6 7]
y_scores: [1.738477  1.0997581]


## Inference from Unseen Document

In [18]:
def inference(lda_model, docs):
    bows = [dictionary.doc2bow(preprocess(doc)) for doc in docs]
    probs = lda_model_tfidf.inference(bows)[0]
    y_preds = np.argmax(probs, axis=1)
    y_scores = probs[arange(len(y_preds)), y_preds]
    y_features = [lda_model.print_topic(y, 5) for y in y_preds]

    print('y_preds:', y_preds)
    print('y_scores:', y_scores)
    
    df = pd.DataFrame({'topic': y_preds, 'score': y_scores, 'feature': y_features})
    vis_df = df.style.set_properties(subset=['feature'], width='500px')
    display(vis_df)

unseen_document = ["Son Heung-min's new Tottenham deal close as Toby Alderweireld reacts to Jose Mourinho decision", 
                   'Heung-min Son thinks £23m Tottenham player is the best at one thing', 
                   'Superstar Son Heung-min is determined to win international ‘A’ match', 
                   'Spurs ace Son Heung-min says he misses £18m man ‘so much’, Mauricio Pochettino sold him']
    
inference(lda_model_tfidf, unseen_document)

y_preds: [4 8 6 8]
y_scores: [2.947288  2.7669137 1.8746536 4.099104 ]


Unnamed: 0,topic,score,feature
0,4,2.947288,"0.017*""elect"" + 0.013*""drum"" + 0.008*""tuesday"" + 0.008*""labor"" + 0.008*""liber"""
1,8,2.766914,"0.021*""news"" + 0.016*""interview"" + 0.013*""rural"" + 0.009*""live"" + 0.008*""christma"""
2,6,1.874654,"0.015*""crash"" + 0.011*""die"" + 0.010*""hobart"" + 0.007*""fiji"" + 0.007*""violenc"""
3,8,4.099104,"0.021*""news"" + 0.016*""interview"" + 0.013*""rural"" + 0.009*""live"" + 0.008*""christma"""
