In [3]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import ast
import spacy

nltk.download('stopwords')
nltk.download('wordnet')
stemmer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
article_2015 = pd.read_csv('data/korea_herald_2015_30_ver_preprocessing.csv')
article_2016 = pd.read_csv('data/korea_herald_2016_30_ver_preprocessing.csv')
article_2017 = pd.read_csv('data/korea_herald_2017_30_ver_preprocessing.csv')

In [39]:
def preprocessing(rawtext):
    document = rawtext
    document = re.sub(r'said', '', document)

    dates = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'week', 'next', 'month', 'year']
    for date in dates:
        document = re.sub(r'{}'.format(date), '', document)
    
    document = re.sub(r'korea herald', ' ', document)
    document = re.sub(r'history textbooks', ' ', document)
    document = re.sub(r'history textbook', ' ', document)

    # Remove president name
    document = re.sub(r'south korea', ' ', document)
    document = re.sub(r'north korea', ' ', document)
    document = re.sub(r'seoul', ' ', document)

    document = re.sub(r'president park', ' ', document)
    document = re.sub(r'korean geun', ' ', document)
    document = re.sub(r'geun', ' ', document)
    document = re.sub(r'geun hye', ' ', document)
    document = re.sub(r'president hye', ' ', document)

    document = re.sub(r'president moon', ' ', document)
    document = re.sub(r'moon jae', ' ', document)
    document = re.sub(r'korean jae', ' ', document)
    # document = re.sub(r'geun hye', ' ', document)
    # document = re.sub(r'president hye', ' ', document)

    document = re.sub(r'\s+', ' ', document)
    return document
    
def remove_duplicates(candidates):
    result = []
    for cand in candidates:
        is_duplicate = False
        for cand2 in candidates:
            if cand == cand2:
                continue
            if cand in cand2:
                is_duplicate = True
                break
        if not is_duplicate:
            result.append(cand)
    return result
    
def capital(string):
    string_list = string.split()
    string_list = [s.capitalize() for s in string_list]
    return ' '.join(string_list)

In [6]:
#############################################
# 2015 koreaherald article topic extraction #
#############################################
updated_cluster_2015 = [[0], [1, 13, 16, 19], [2], [3, 5, 11], [4], [6], [7, 23], [8, 9], [10, 12], [14, 28, 29], [15], [17], [18, 24, 26, 27], [20], [21], [22], [25]]

for i in updated_cluster_2015:
    keyword_set = article_2015[article_2015['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:',capital(result[0]))
    print('Docs number:', doc_num)
print('============================')



Topic: China Launched
Docs number: 249
Topic: Nuclear Weapons
Docs number: 1049
Topic: Financial Regulator
Docs number: 179
Topic: Sex Slavery
Docs number: 588
Topic: Former President
Docs number: 268
Topic: Visit Korea
Docs number: 352
Topic: Arrest Warrant
Docs number: 477
Topic: Opposition Party
Docs number: 457
Topic: Fishing Boat
Docs number: 353
Topic: Korean Women
Docs number: 505
Topic: Opposition Lawmakers
Docs number: 186
Topic: Respiratory Syndrome
Docs number: 171
Topic: Military Parade
Docs number: 1150
Topic: Severe Drought
Docs number: 181
Topic: Korean War
Docs number: 362
Topic: Suicide Rate
Docs number: 172
Topic: Korean Government
Docs number: 284


In [40]:
#############################################
# 2015 koreaherald article topic extraction #
#############################################
updated_cluster_2016 = [[0, 1, 3, 6, 11, 13, 23, 28], [2, 17, 22], [4, 7, 20, 29], [5, 8], [9], [10, 18], [12], [14], [15], [16], [19], [21, 24], [24], [25], [26], [27]]

for i in updated_cluster_2016:
    keyword_set = article_2016[article_2016['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:', result[10:30])
    print('Docs number:', doc_num)
print('============================')



Topic: ['korea nuclear', 'nuclear weapons', 'missile launch', 'test nuclear', 'nuclear test nuclear', 'korea latest', 'bomb test', 'recent nuclear test', 'nuclear korea', 'test latest', 'hydrogen bomb', 'test last', 'nuclear envoy', 'missile launches', 'test nuclear test', 'nuclear test latest', 'nuclear test nuclear test', 'test latest nuclear', 'nuclear nuclear', 'test korea']
Docs number: 2230
Topic: ['former presidential', 'park impeachment', 'prosecutors indicted', 'acting president', 'corruption scandal involving', 'impeachment trial', 'president hwang', 'impeachment impeachment', 'presidential aide', 'president impeachment', 'surrounding president', 'presidential office', 'choi scandal', 'scandal surrounding president', 'impeachment president', 'political scandal', 'gangnam murder', 'alleged corruption', 'presidential secretary', 'cabinet meeting']
Docs number: 679
Topic: ['candidate donald trump', 'republican presidential candidate', 'nominee donald', 'presidential nominee', 't

In [41]:
#############################################
# 2017 koreaherald article topic extraction #
#############################################
updated_cluster_2017 = [[0], [1, 5, 8, 11], [2], [3], [4], [6], [7, 9], [10], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29]]

for i in updated_cluster_2017:
    keyword_set = article_2017[article_2017['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:', result[:10])
    print('Docs number:', doc_num)
print('============================')



Topic: ['conservative presidents', 'korean women', 'acting president', 'korean soldier', 'news korean', 'old daughter', 'two conservative', 'slavery died', 'two conservative presidents', 'stealing cash']
Docs number: 252
Topic: ['condemns korea', 'missile test', 'condemns korea missile', 'korea missile', 'latest missile', 'korea nuclear', 'nuclear issue', 'missile launch', 'nuclear test', 'korea condemns']
Docs number: 245
Topic: ['donald trump', 'president donald', 'president donald trump', 'presidential election', 'us president', 'korea korea', 'barack obama', 'winter olympics', 'acting president', 'president barack']
Docs number: 335
Topic: ['scientists develop', 'found dead', 'korean scientists', 'korean government', 'scientists developed', 'presidential contest', 'soldier defected', 'woman found dead', 'korean soldier', 'woman found']
Docs number: 232
Topic: ['former president', 'impeachment trial', 'arrest warrant', 'corruption scandal', 'bribery charges', 'presidential aide', 'p