In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import ast
import json
import numpy as np
import os
import datetime
from keybert import KeyBERT
import spacy
import re

In [2]:

# match left and right single quotes
single_quote_expr = re.compile(r'[\u2018\u2019]', re.U)
# match every character from U+0080 to U+FFFF (non-ASCII, BMP only)
unicode_chars_expr = re.compile(r'[\u0080-\uffff]', re.U)
# entity labels kept when named entities are extracted from article text
ne_type = ['ORG','GPE','PERSON','NORP']

def cleanse_unicode(s):
    """Normalise curly single quotes and drop the remaining non-ASCII characters.

    Args:
        s: text to clean; any falsy value (``None``, ``""``) is accepted.

    Returns:
        ``""`` for falsy input. Otherwise ``s`` with U+2018/U+2019 replaced by
        an ASCII apostrophe and every other character in U+0080..U+FFFF removed.
    """
    if not s:
        return ""

    # The third positional argument of Pattern.sub is `count`, not `flags`.
    # Passing re.U there (value 32) silently capped each pass at 32
    # replacements; the flag is already baked into the compiled patterns.
    temp = single_quote_expr.sub("'", s)
    temp = unicode_chars_expr.sub("", temp)
    return temp

class data:
    """Handle on a directory: its path plus a snapshot of the entries inside it.

    Attributes:
        path: the directory path exactly as it was passed in.
        file_list: the names ``os.listdir`` returned for that directory when
            the object was created (not refreshed afterwards).
    """

    def __init__(self, pt):
        # Take the listing once, up front, so callers iterate a fixed list.
        self.file_list = os.listdir(pt)
        self.path = pt

# Add "ne" (named entities) and "keyword" columns to every CSV under ./data
# and write each result to the working directory as "<name>_ver_1.csv".

now = datetime.datetime.now()
print(now)
data_path = os.getcwd() + "/data"
Data = data(data_path)
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
sp = spacy.load('en_core_web_sm')

for fp in Data.file_list:
    # The directory listing also contains sub-directories and stray files
    # (.DS_Store, .ipynb_checkpoints, ...); handing those to read_csv would
    # abort the whole run, so only CSV files are processed.
    if not fp.lower().endswith(".csv"):
        continue

    with open(os.path.join(Data.path, fp), 'r') as f:
        df = pd.read_csv(f)

    # Rows missing any of the required fields are dropped before processing.
    df = df[['title', 'author', 'time', 'description', 'body', 'section','month','year']].dropna()
    print(df)

    keywords = []
    nes = []

    for i, (title, description) in enumerate(zip(df['title'], df['description'])):
        # Keywords are extracted from headline + summary with the editorial
        # tags stripped out.
        temp_s = title + ". " + description
        temp_s = temp_s.replace("[Newsmaker]","").replace("[Weekender]","").replace("(Yonhap)","")
        keyword = kw_model.extract_keywords(temp_s, keyphrase_ngram_range=(1,4), stop_words=None, use_mmr=True, diversity=0.1)
        # Only element 0 (the phrase) of each result is stored. The " ,"
        # separator is kept as-is because it defines the saved column's format.
        keywords.append(" ,".join([word[0] for word in keyword]))

        # Keep the lemma of each entity whose label is listed in ne_type.
        nes.append([e.lemma_ for e in sp(description).ents if e.label_ in ne_type])

        # Progress heartbeat every 100 rows.
        if i % 100 == 0:
            now = datetime.datetime.now()
            print(now)
            print(i)

    df['keyword'] = keywords
    df['ne'] = nes
    print(df)
    # splitext removes only the trailing extension, unlike str.replace which
    # would also strip a ".csv" occurring in the middle of the name.
    out_stem = os.path.splitext(fp)[0]
    df.to_csv(out_stem +"_ver_1.csv")



2021-05-20 17:40:50.499658
                                                  title  author  \
0     People's Party members support Ahn's push for ...  Yonhap   
1     [Newsmaker] Panamanian vessel probed over susp...  Yonhap   
2     Hong Kong ship crew questioned in S. Korea for...     AFP   
3     Additional bird flu case confirmed at duck far...  Yonhap   
...                                                 ...     ...   
9121        NK leader set to deliver New Year's address     임정요   
9122  S. Korea's Marine Corps to deploy new guided m...     임정요   
9123  Defense chief stresses new year poses new chal...     임정요   
9124  N. Korea kicks off 2017 with large-scale firew...     임정요   
9125  Acting president stresses security of country ...     임정요   

                     time                                        description  \
0     2017-12-31 16:18:00  The leader of the center-left People's Party g...   
1     2017-12-31 14:55:00  PYEONGTAEK  -- South Korea has seized and insp..

                                                  title author  \
0     Chinese airlines withdraw plans for chartered ...    조정은   
1     Ex-health minister grilled over scandal involv...    조정은   
2     Highly pathogenic strain of bird flu found in ...    조정은   
3     S. Koreans demand president's removal on New Y...    조정은   
4     S. Korea's pension fund chief formally arreste...    조정은   
...                                                 ...    ...   
7480  [Photo News] N.K. leader visits the mortuary o...    정주원   
7481  North Korean leader to deliver New Year’s addr...    정주원   
7482  Korean-American group to campaign for 'comfort...    정주원   
7483  Major shift in N.K. policy on Seoul could poin...    정주원   
7484  JCS Chairman Lee inspects readiness at NLL abo...    정주원   

                     time                                        description  \
0     2016-12-31 16:36:00  Chinese airlines have withdrawn plans to run c...   
1     2016-12-31 16:14:00  South Korea's specia

                                                  title  author  \
0            S. Korea, China establish military hotline     김영원   
2     Presidential office refutes rumors on Seoul-To...  KH디지털2   
3                    Assembly ends with little achieved     이주희   
4           Non-emergency patients to face ER fee hikes     이주희   
5     Former teacher dedicates 30 years to charity work     이주희   
...                                                 ...     ...   
7151  N. Korean leader's speech arouses cautious opt...  KH디지털2   
7152  N. Korean leader open to inter-Korean summit t...  KH디지털2   
7153  Ex-U.S. envoy calls for clearer communication ...  KH디지털2   
7154           U.S. imposes sanctions on N. Korean firm  KH디지털2   
7155  Park calls for military readiness amid tension...  KH디지털2   

                     time                                        description  \
0     2015-12-31 16:52:00  South Korea and China set up a hotline between...   
2     2015-12-31 15:52:00  Cheong W

In [25]:
#functions and models for clustering

def revise_word(a):
    """Clean one word/phrase for use as a vocabulary key.

    Returns ``None`` when the text contains "www" (such entries are
    discarded); otherwise returns the text with every "'s" removed and
    trailing whitespace stripped.
    """
    contains_www = "www" in a
    return None if contains_www else a.replace("'s","").rstrip()

class BoW:
    """Bag-of-words vocabulary mapping cleaned words to tensor positions.

    Every word is normalised with ``revise_word`` first; words for which it
    returns ``None`` are ignored by both methods.
    """

    def __init__(self):
        self.dic = {}    # cleaned word -> index in the count vector
        self.size = 0    # number of distinct words registered so far

    def add_dic(self,words):
        """Register each unseen cleaned word, giving it the next free index."""
        for raw in words:
            token = revise_word(raw)
            if token is None or token in self.dic:
                continue
            self.dic[token] = self.size
            self.size += 1

    def make_vec(self,words):
        """Encode ``words`` as a 1-D tensor of length ``self.size``.

        Each occurrence of a registered word adds 1/10 at that word's index.

        Raises:
            Exception: a cleaned word was never registered through ``add_dic``
                (the message is Korean for "Why isn't it in the dictionary?").
        """
        counts = torch.zeros((self.size,))
        for raw in words:
            token = revise_word(raw)
            if token in self.dic:
                counts[self.dic[token]] += 1/10
                continue
            if token is not None:
                raise Exception("왜 딕셔너리에 없죠?")
        return counts

# Pretrained 'bert-base-uncased' tokenizer and encoder, loaded once at module
# level; make_vector below reads both as globals to embed keyword strings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [26]:
#make vector encoding for clustering
def make_vector(fp):
    """Build one clustering feature vector per row of a ``*_ver_1.csv`` file.

    Each vector is the concatenation of
      * a bag-of-words tensor over the row's named entities (``ne`` column), and
      * the BERT last-hidden-state of the first token of the row's ``keyword``
        string (position 0 of the only sequence in the batch).

    Args:
        fp: path of a CSV with an ``ne`` column (stringified Python lists) and
            a ``keyword`` column.

    Returns:
        ``(data, vectors)``: the DataFrame with an added ``vector`` column of
        NumPy arrays, and the same vectors as a list of torch tensors.
    """
    bow = BoW()
    articles = pd.read_csv(fp)

    # Parse each stringified entity list once; both the vocabulary pass and
    # the encoding pass need it.
    ne_lists = [ast.literal_eval(raw) for raw in articles['ne']]
    for ne in ne_lists:
        bow.add_dic(ne)

    vectors = []
    # Inference only: without no_grad every stored vector would keep its whole
    # autograd graph alive, and memory would grow with the number of rows.
    with torch.no_grad():
        for ne, key in zip(ne_lists, articles['keyword']):
            ne_outputs = bow.make_vec(ne)

            # An empty keyword string comes back from CSV as NaN (a float),
            # which the tokenizer rejects; treat it as empty text.
            if not isinstance(key, str):
                key = ""
            inputs = tokenizer(key, return_tensors='pt')
            key_outputs = model(**inputs).last_hidden_state[0][0]
            vectors.append(torch.cat((ne_outputs, key_outputs), 0))

    articles['vector'] = [v.detach().numpy() for v in vectors]

    return articles,vectors

In [27]:
from sklearn.cluster import KMeans

# Cluster each year's articles and save the labelled table as
# "koreaherald_<year>_ver_2.csv".
year = [2015,2016,2017]

for y in year:
    # NOTE: `data` here rebinds the name of the `data` class defined earlier
    # in the notebook; re-run that cell before instantiating the class again.
    data,vectors = make_vector('koreaherald_{}_ver_1.csv'.format(y))

    print(data)
    vectors = [v.detach().numpy() for v in vectors]
    # Fixed seed so cluster assignments are reproducible between runs.
    kmeans = KMeans(n_clusters=15, random_state=0)
    kmeans.fit(vectors)
    print(kmeans.labels_)

    # Was `kmenas.labels_`, a typo that raised NameError.
    data['cluster'] = kmeans.labels_

    # Save this year's clustered frame. The previous code wrote `df` under
    # `fp`, both leftovers from the keyword-extraction cell, so the cluster
    # labels were never written.
    data.to_csv('koreaherald_{}_ver_2.csv'.format(y))



      Unnamed: 0                                              title  author  \
0              0         S. Korea, China establish military hotline     김영원   
1              2  Presidential office refutes rumors on Seoul-To...  KH디지털2   
2              3                 Assembly ends with little achieved     이주희   
3              4        Non-emergency patients to face ER fee hikes     이주희   
4              5  Former teacher dedicates 30 years to charity work     이주희   
...          ...                                                ...     ...   
6978        7151  N. Korean leader's speech arouses cautious opt...  KH디지털2   
6979        7152  N. Korean leader open to inter-Korean summit t...  KH디지털2   
6980        7153  Ex-U.S. envoy calls for clearer communication ...  KH디지털2   
6981        7154           U.S. imposes sanctions on N. Korean firm  KH디지털2   
6982        7155  Park calls for military readiness amid tension...  KH디지털2   

                     time                          