In [1]:
import pandas as pd
import re
import nltk
import pickle
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import ast
import spacy

nltk.download('stopwords')
nltk.download('wordnet')
stemmer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
target = pd.read_csv('data/korea_herald_2015_30_ver_preprocessing.csv')

In [53]:
def preprocessing(rawtext):
    document = rawtext
    document = re.sub(r'said', '', document)

    dates = ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'week', 'next', 'month', 'year']
    for date in dates:
        document = re.sub(r'{}'.format(date), '', document)
    
    document = re.sub(r'korea herald', ' ', document)
    document = re.sub(r'history textbooks', ' ', document)
    document = re.sub(r'history textbook', ' ', document)

    # Remove president name
    document = re.sub(r'president park', ' ', document)
    document = re.sub(r'south korea', ' ', document)
    document = re.sub(r'north korea', ' ', document)
    document = re.sub(r'seoul', ' ', document)
    document = re.sub(r'korean geun', ' ', document)
    document = re.sub(r'geun', ' ', document)
    document = re.sub(r'geun hye', ' ', document)
    document = re.sub(r'president hye', ' ', document)

    document = re.sub(r'\s+', ' ', document)
    return document
    
def remove_duplicates(candidates):
    result = []
    for cand in candidates:
        is_duplicate = False
        for cand2 in candidates:
            if cand == cand2:
                continue
            if cand in cand2:
                is_duplicate = True
                break
        if not is_duplicate:
            result.append(cand)
    return result
    

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

updated_cluster = [[0], [1, 13, 16, 19], [2], [3, 5, 11], [4], [6], [7, 23], [8, 9], [10, 12], [14, 28, 29], [15], [17], [18, 26, 27], [20], [21], [22], [24], [25]]

for i in updated_cluster:
    keyword_set = target[target['cluster'].isin(i)].keyword
    doc_num = len(keyword_set)
    # keyword_set = [preprocessing(keyword) for keyword_list in keyword_set for keyword in keyword_list]
    # keyword_text = ''.join(keyword_set)
    keyword_set = [preprocessing(keyword.replace(' ,', ' ')) for keyword in keyword_set]

    vectorizer = CountVectorizer(max_features=1500, ngram_range=(2, 5), min_df=1, max_df=doc_num/3, stop_words=stopwords.words('english'))
    X_count = vectorizer.fit_transform(keyword_set).toarray()
    X_tfidf = TfidfTransformer().fit_transform(X_count).toarray()

    result = pd.DataFrame(X_count, columns= vectorizer.get_feature_names())
    print('max frequency', max(result.sum(axis=0)))
    result = list(result.sum(axis=0).sort_values(ascending=False).keys()[:100])

    print('=========={}:{}=========='.format(i, doc_num))
    # print(remove_duplicates(result)[:15])
    print(result[:20])
print('============================')



max frequency 9
['china launched', 'korean war', 'donald trump', 'barack obama', 'president barack', 'korea japan', 'nuclear envoy', 'hye pledged', 'postpone trip', 'trip postpone', 'trip postpone trip', 'foreign minister', 'runner donald', 'runner donald trump', 'stage winter drills', 'president barack obama', 'winter drills', 'stage winter', 'china launched china', 'china china']
max frequency 54
['nuclear weapons', 'nuclear test', 'korea nuclear', 'military drills', 'ballistic missile', 'nuclear tests', 'korea china', 'inter korean', 'nuclear korea', 'new nuclear', 'nuclear warheads', 'nuclear envoy', 'suspend nuclear', 'nuclear talks', 'rocket launch', 'urges korea', 'nuclear program', 'nuclear deal', 'suspend nuclear tests', 'korea korea']
max frequency 10
['financial regulator', 'youth unemployment', 'fourth largest', 'top 30', 'climate change', '20 million', 'korea top', 'largest economy', '17 billion', 'fourth largest economy', 'high youth', '4th highest', 'antitrust watchdog',