In [2]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import ast
import spacy

nltk.download('stopwords')
nltk.download('wordnet')
stemmer = WordNetLemmatizer()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
article_2015 = pd.read_csv('data/korea_herald_2015_30_ver_preprocessing.csv')
article_2016 = pd.read_csv('data/korea_herald_2016_30_ver_preprocessing.csv')
article_2017 = pd.read_csv('data/korea_herald_2017_30_ver_preprocessing.csv')

In [93]:
def preprocessing(rawtext):
    document = rawtext
    document = re.sub(r'said', '', document)

    # Remove date
    dates = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'week', 'next', 'month', 'year']
    for date in dates:
        document = re.sub(r'{}'.format(date), '', document)
    
    # Remove publisher
    document = re.sub(r'korea herald', ' ', document)
    document = re.sub(r'history textbooks', ' ', document)
    document = re.sub(r'history textbook', ' ', document)

    # Remove president name
    # document = re.sub(r'president', ' ', document)
    document = re.sub(r'south korea', ' ', document)
    document = re.sub(r'north korea', ' ', document)
    document = re.sub(r'seoul', ' ', document)

    document = re.sub(r'president park', ' ', document)
    document = re.sub(r'korean geun', ' ', document)
    document = re.sub(r'geun', ' ', document)
    document = re.sub(r'geun hye', ' ', document)
    document = re.sub(r'president hye', ' ', document)

    document = re.sub(r'president moon', ' ', document)
    document = re.sub(r'moon jae', ' ', document)
    document = re.sub(r'korean jae', ' ', document)
    document = re.sub(r'korea', ' ', document)
    document = re.sub(r'korean', ' ', document)

    # document = re.sub(r'us donald', ' ', document)
    document = document.replace('us donald', ' ')
    document = document.replace('donald', ' ')
    document = document.replace('trump', ' ')
    document = document.replace('us', ' ')

    document = re.sub(r'kim jong', ' ', document)

    document = re.sub(r'\s+', ' ', document)
    return document
    
def remove_duplicates(candidates):
    result = []
    for cand in candidates:
        is_duplicate = False
        for cand2 in candidates:
            if cand == cand2:
                continue
            if cand in cand2:
                is_duplicate = True
                break
        if not is_duplicate:
            result.append(cand)
    return result
    
def capital(string):
    string_list = string.split()
    string_list = [s.capitalize() for s in string_list]
    return ' '.join(string_list)

In [114]:
#############################################
# 2015 koreaherald article topic extraction #
#############################################
updated_cluster_2015 = [[0], [1, 13, 16, 19], [2], [3, 5, 11], [4], [6], [7, 8, 9, 23], [8, 9], [10, 12], [14, 28, 29], [15], [17], [18, 24, 26, 27], [20], [21], [22], [25]]
top_2015 = dict()

for i in updated_cluster_2015:
    keyword_set = article_2015[article_2015['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:', result[:10])
    print('Docs number:', doc_num)
    top_2015[capital(result[0])] = doc_num
print('============================')



Topic: ['china launched', 'barack obama', 'president barack', 'nuclear envoy', 'stage winter drills', 'president barack obama', 'postpone trip', 'trip postpone trip', 'winter drills', 'trip postpone']
Docs number: 249
Topic: ['nuclear weapons', 'nuclear test', 'military drills', 'ballistic missile', 'nuclear tests', 'new nuclear', 'nuclear envoy', 'nuclear warheads', 'pend nuclear', 'nuclear talks']
Docs number: 1049
Topic: ['financial regulator', 'youth unemployment', 'fourth largest', '17 billion', 'fourth largest economy', 'climate change', 'top 30', '20 million', 'largest economy', 'antitr watchdog']
Docs number: 179
Topic: ['sex slavery', 'sex slaves', 'japan wartime', 'sexual slavery', 'wartime sex', 'wartime sexual', 'wartime sex slavery', 'wartime sexual slavery', 'japanese prime', 'sexual enslavement']
Docs number: 588
Topic: ['former president', 'president kim', 'foreign minister', 'presidential spokesman', 'presidential office', 'late president', 'winter olympics', 'presiden

In [111]:
#############################################
# 2016 koreaherald article topic extraction #
#############################################
updated_cluster_2016 = [[0, 1, 3, 6, 11, 12, 13, 23, 28], [2, 10, 17, 18, 22, 27], [4, 7, 20, 29], [5, 8], [9], [14], [15], [16], [19], [21], [24], [25], [26]]
top_2016 = dict()
for i in updated_cluster_2016:
    keyword_set = article_2016[article_2016['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:', result[:10])
    print('Docs number:', doc_num)
    top_2016[capital(result[0])] = doc_num
print('============================')



Topic: ['nuclear test', 'fourth nuclear', 'fourth nuclear test', 'latest nuclear', 'fifth nuclear', 'fifth nuclear test', 'ballistic missile', 'latest nuclear test', 'rocket launch', 'recent nuclear']
Docs number: 2500
Topic: ['corruption scandal', 'scandal involving', 'prosecutors raided', 'opposition parties', 'involving president', 'scandal surrounding', 'peddling scandal', 'former presidential', 'opposition party', 'scandal involving president']
Docs number: 1269
Topic: ['presidential race', 'presidential election', 'presidential candidate', 'republican presidential', 'barack obama', 'president barack', 'president barack obama', 'race presidential', 'presidential race presidential', 'republican presidential candidate']
Docs number: 734
Topic: ['sex slavery', 'sexual slavery', 'wartime sexual', 'defectors defectors', 'wartime sexual slavery', 'japan wartime', 'sex slaves', 'slavery victims', 'three nobel', 'late president']
Docs number: 567
Topic: ['jail term', 'found dead', 'sexual

In [112]:
#############################################
# 2017 koreaherald article topic extraction #
#############################################
updated_cluster_2017 = [[0], [1, 5, 8, 11, 12, 25, 28], [2, 19], [3], [4, 15], [6], [7, 9, 14, 17, 18, 20, 22, 23, 24], [10], [13], [16], [21], [26], [27], [29]]
top_2017 = dict()

for i in updated_cluster_2017:
    keyword_set = article_2017[article_2017['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    # print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print('Topic:', result[:30])
    print('Docs number:', doc_num)
    top_2017[capital(result[0])] = doc_num
print('============================')



Topic: ['conservative presidents', 'acting president', 'old daughter', 'two conservative', 'slavery died', 'two conservative presidents', 'impeached former', 'stealing cash', 'new astronaut', 'minister vows', 'female pilots', 'presidential race', 'vacation looming', 'ho eplants', 'female assassins', 'energy minister', 'grieving mother', 'army soldier', '11 hamsters', 'presidential front', 'revenge porn', 'bashes obama', 'nephew tired', 'emergency surgery', 'epidemic cows', 'commit suicide', '000 contract', 'new education', 'eplants looking', 'sia trip']
Docs number: 252
Topic: ['nuclear test', 'ballistic missile', 'missile launch', 'latest missile', 'missile test', 'latest nuclear', 'nuclear weapons', 'intercontinental ballistic', 'latest nuclear test', 'military drills', 'test latest', 'sixth nuclear', 'sixth nuclear test', 'latest missile test', 'ballistic missiles', 'intercontinental ballistic missile', 'latest missile launch', 'nuclear missile', 'missile tests', 'nuclear nuclear', 

In [113]:
top10_2015 = sorted(top_2015.items(), reverse=True, key=lambda x: x[1])[:10]
top10_2016 = sorted(top_2016.items(), reverse=True, key=lambda x: x[1])[:10]
top10_2017 = sorted(top_2017.items(), reverse=True, key=lambda x: x[1])[:10]
print('[2015] Top10 Topic')
for topic in top10_2015:
    print(topic, end=' ')
print()
print()
print('[2016] Top10 Topic')
for topic in top10_2016:
    print(topic, end=' ')
print()
print()
print('[2017] Top10 Topic')
for topic in top10_2017:
    print(topic, end=' ')
print()

[2015] Top10 Topic
('Military Parade', 1150) ('Nuclear Weapons', 1049) ('Sex Slavery', 588) ('Female Activists', 505) ('Arrest Warrant', 477) ('Opposition Party', 457) ('2015 World', 362) ('Fishing Boat', 353) ('Foreign Minister', 352) ('Air Force', 284) 

[2016] Top10 Topic
('Nuclear Test', 2500) ('Corruption Scandal', 1269) ('Presidential Race', 734) ('Sex Slavery', 567) ('Chinese Tourists', 245) ('College Student', 206) ('Found Dead', 204) ('Record High', 198) ('Magnitude Earthquake', 196) ('Survey Showed', 186) 

[2017] Top10 Topic
('Presidential Election', 2651) ('Nuclear Test', 2126) ('Winter Olympics', 703) ('Corruption Scandal', 645) ('Top Diplomat', 383) ('Half Brother', 327) ('Sewol Ferry', 287) ('Conservative Presidents', 252) ('Air Pollution', 237) ('Scientists Develop', 232) 
