In [15]:
%pylab inline

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from gensim.models import TfidfModel
from gensim.parsing.preprocessing import STOPWORDS
from pathlib import Path
from tempfile import gettempdir

import kaggle
import nltk
import gensim
import numpy as np
import pandas as pd
import warnings

# Authenticate the Kaggle API client; the dataset download further down uses it.
# NOTE(review): presumably reads credentials from the local Kaggle config or
# environment variables - confirm before running on a new machine.
kaggle.api.authenticate()
# Fetch the WordNet corpus used by WordNetLemmatizer in the preprocessing step.
nltk.download('wordnet')
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used in this notebook.
warnings.simplefilter('ignore')

Populating the interactive namespace from numpy and matplotlib


[nltk_data] Downloading package wordnet to /home/anderson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Data 

In [10]:
# The dataset is unpacked into the system temp directory, so it may be wiped
# between sessions; the guard below re-downloads it whenever it is missing.
dest_path = Path(gettempdir()) / 'news-headlines'
data_name = 'therohk/million-headlines'
csv_path = dest_path / 'abcnews-date-text.csv'

# Only call the Kaggle API when the CSV is not already on disk, so re-running
# this cell (or Restart & Run All) does not download the archive again.
if not csv_path.exists():
    kaggle.api.dataset_download_files(data_name, path=dest_path, unzip=True)

data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Preprocess

## Tokenization

In [16]:
# Shared NLP helpers, created once and reused for every token.
stemmer = SnowballStemmer('english')  # Snowball stemmer configured for English
lemmatizer = WordNetLemmatizer()  # backed by the WordNet corpus downloaded above

def lemmatize_stemming(text):
    """Reduce a single token to its stem.

    The token is first lemmatized as a verb (``pos='v'``); the resulting lemma
    is then passed through the module-level Snowball ``stemmer``.

    Args:
        text: The token to normalize.

    Returns:
        The stemmed form of the verb lemma of ``text``.
    """
    verb_lemma = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Convert raw text into a list of normalized tokens.

    The text is tokenized with ``gensim.utils.simple_preprocess``. Tokens that
    are gensim stop words, or that are 3 characters long or shorter, are
    discarded; every remaining token is passed through ``lemmatize_stemming``.

    Args:
        text: The raw document (here, one headline).

    Returns:
        A list of lemmatized and stemmed tokens, in their original order.
    """
    return [
        lemmatize_stemming(word)
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

# Sanity check: run the preprocessing pipeline on one arbitrary headline.
# A scalar .loc lookup by index label replaces the boolean mask over the whole
# frame followed by .values[0]; it returns the same string for a unique index.
doc_sample = data.loc[4310, 'headline_text']

print('original  :', doc_sample)
print('preprocess:', preprocess(doc_sample))

original  : ratepayers group wants compulsory local govt voting
preprocess: ['ratepay', 'group', 'want', 'compulsori', 'local', 'govt', 'vote']


In [17]:
# Tokenize and normalize every headline; each element of `docs` is the list
# of stems returned by `preprocess` for that row.
# `preprocess` is applied row by row in Python, so expect this cell to take a
# while on the full dataset.
docs = data['headline_text'].apply(preprocess)
docs.head()

0     [decid, communiti, broadcast, licenc]
1                        [wit, awar, defam]
2    [call, infrastructur, protect, summit]
3               [staff, aust, strike, rise]
4      [strike, affect, australian, travel]
Name: headline_text, dtype: object

## Bag of Words

In [18]:
def show_dict(data, n=10):
    """Print the first ``n`` key/value pairs of a mapping, one pair per line.

    Args:
        data: Any mapping exposing ``items()`` (for example a gensim
            ``Dictionary`` or a plain ``dict``).
        n: Maximum number of pairs to print. If the mapping holds fewer than
            ``n`` entries, all of them are printed and no error is raised.
    """
    # zip() stops at the shorter of the two iterables, which both honours `n`
    # and avoids a StopIteration when the mapping is small. range(n) comes
    # first so no extra item is pulled from the mapping once the limit is hit.
    for _, (key, value) in zip(range(n), data.items()):
        print(key, value)

# Build the token <-> integer-id mapping from the tokenized headlines.
dictionary = gensim.corpora.Dictionary(docs)
# Prune the vocabulary: drop tokens that appear in fewer than 15 documents or
# in more than 50% of all documents, then keep at most the 100000 most
# frequent remaining tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Peek at the first few (id, token) pairs of the pruned dictionary.
show_dict(dictionary)

0 broadcast
1 communiti
2 decid
3 licenc
4 awar
5 defam
6 wit
7 call
8 infrastructur
9 protect


In [20]:
# Bag of Words
# Each document becomes a sparse list of (token_id, count) pairs, so the
# corpus has the shape [[(token_id, count), ...], ...].
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

def display_bow(bow):
    """Print one bag-of-words document in human-readable form.

    Args:
        bow: An iterable of ``(token_id, count)`` pairs, as produced by
            ``dictionary.doc2bow``.

    Each pair is printed on its own line as the token id, the token text
    looked up in the module-level ``dictionary``, and the count.
    """
    resolved = ((i, dictionary[i], c) for i, c in bow)
    for idx, word, cnt in resolved:
        print(f'{idx:<6}: {word:15} | count:{cnt}')
    
# Inspect one arbitrary document (position 502) to sanity-check the encoding.
display_bow(bow_corpus[502])

34    : council         | count:1
1216  : approv          | count:1
1217  : farm            | count:1
1218  : poultri         | count:1


## TF-IDF

In [23]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = TfidfModel(bow_corpus)
# Re-weight the corpus with the fitted model.
# NOTE(review): gensim presumably applies the transformation lazily when a
# model is indexed with a whole corpus, so nothing is materialized here -
# confirm if memory or repeated iteration cost matters.
corpus_tfidf = tfidf[bow_corpus]

print('corpus_tfidf size:', len(corpus_tfidf))
# Show the (token_id, weight) pairs of the first document.
corpus_tfidf[0]

corpus_tfidf size: 1186018


[(0, 0.5850076620505259),
 (1, 0.38947256567331934),
 (2, 0.4997099083387053),
 (3, 0.5063271308533074)]

# LDA 