In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans 
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import ast
import json
import numpy as np
import os
import datetime
from keybert import KeyBERT
import spacy
import re

In [4]:
# GPU_NUM = 1  # enter the desired GPU number
# device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')
# torch.cuda.set_device(device) # change allocation of current GPU
# print ('Current cuda device ', torch.cuda.current_device()) # check

# # Additional Infos
# if device.type == 'cuda':
#     print(torch.cuda.get_device_name(GPU_NUM))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_cached(GPU_NUM)/1024**3,1), 'GB')

In [5]:
#functions and models for clustering

def revise_word(a):
    """Normalise a single word before it is used as a vocabulary key.

    Returns ``None`` when ``a`` contains the substring ``"www"``; otherwise
    returns ``a`` with every ``"'s"`` removed and trailing whitespace stripped.
    """
    looks_like_url = "www" in a
    return None if looks_like_url else a.replace("'s","").rstrip()

class BoW():
    """Bag-of-words vocabulary that turns word lists into count vectors.

    ``dic`` maps each normalised word (see ``revise_word``) to its column
    index; ``size`` is the number of words registered so far.
    """

    def __init__(self):
        self.dic = {}
        self.size = 0

    def add_dic(self,words):
        """Register every not-yet-seen word of ``words`` in the vocabulary.

        Words that ``revise_word`` maps to ``None`` are skipped.
        """
        for raw in words:
            token = revise_word(raw)
            if token is None or token in self.dic:
                continue
            # Next free column index is the current vocabulary size.
            self.dic[token] = self.size
            self.size += 1

    def make_vec(self,words):
        """Return a float64 tensor of length ``size`` counting ``words``.

        Words that ``revise_word`` maps to ``None`` are ignored.

        Raises:
            Exception: if a normalised word was never registered via ``add_dic``.
        """
        counts = torch.zeros((self.size,), dtype=torch.float64)
        with torch.no_grad():
            for raw in words:
                token = revise_word(raw)
                if token is None:
                    continue
                if token not in self.dic:
                    raise Exception("No data in dictionary")
                counts[self.dic[token]] += 1
        return counts

# Shared BERT tokenizer and encoder, loaded once from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

#make vector encoding for clustering
def make_vector(fp):
    """Build one clustering feature vector per row of the CSV at ``fp``.

    The CSV must contain an ``ne`` column whose cells are the string form of
    a Python list of words, and a ``keyword`` column of text.  Each row's
    vector is the concatenation of

    * the bag-of-words count vector of the row's ``ne`` words, over the
      vocabulary collected from every row of the file, and
    * the BERT ``last_hidden_state`` at the first token position for the
      row's ``keyword`` text.

    Args:
        fp: path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        ``(data, vectors)``: the loaded DataFrame with an added ``vector``
        column holding NumPy arrays, and the list of the same vectors as
        torch tensors.

    Raises:
        ValueError / SyntaxError: from ``ast.literal_eval`` when an ``ne``
            cell is not a valid Python literal.
    """
    frame = pd.read_csv(fp)

    # Parse every 'ne' cell once and reuse the result for both passes.
    ne_lists = [ast.literal_eval(cell) for cell in frame['ne']]

    bow = BoW()
    for words in ne_lists:
        # Register the row's whole word list.  The previous ``d[0]`` passed
        # only the first entry, so its characters were registered instead of
        # the words (and an empty list raised IndexError), which made the
        # later make_vec() lookup fail.
        bow.add_dic(words)

    vectors = []
    for words, key in zip(ne_lists, frame['keyword']):
        ne_outputs = bow.make_vec(words)
        inputs = tokenizer(key, return_tensors='pt')
        # Inference only: without no_grad every stored vector would keep the
        # whole autograd graph of its forward pass alive.
        with torch.no_grad():
            key_outputs = model(**inputs).last_hidden_state[0][0]
            vector = torch.cat((ne_outputs,key_outputs),0)
        vectors.append(vector)

    frame['vector'] = [v.detach().numpy() for v in vectors]

    return frame,vectors

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:

# match left and right single quotes
single_quote_expr = re.compile(r'[\u2018\u2019]', re.U)
# match all non-basic latin unicode
unicode_chars_expr = re.compile(r'[\u0080-\uffff]', re.U)
# entity labels that are kept when named entities are extracted
ne_type = ['ORG','GPE','PERSON','NORP']

def cleanse_unicode(s):
    """Return ``s`` with curly single quotes straightened and other
    characters in the U+0080..U+FFFF range removed.

    Args:
        s: input text; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned string.
    """
    if not s:
        return ""

    # The third positional argument of Pattern.sub is ``count``, not
    # ``flags``.  Passing ``re.U`` (== 32) there silently capped each
    # substitution at 32 replacements; the flag is already set at compile time.
    temp = single_quote_expr.sub("'", s)
    temp = unicode_chars_expr.sub("", temp)
    return temp

class data():
    """Handle on a data directory: keeps its path and the entry names in it."""

    def __init__(self, pt):
        # The listing is taken once, at construction time.
        entries = os.listdir(pt)
        self.path = pt
        self.file_list = entries

#add "ne" and "keyword" to data file
#
# For every CSV in ./data: extract KeyBERT keywords from title + description,
# extract the lemmas of the spaCy entities whose label is in ne_type from the
# description, build one vector per article (bag-of-words of the entity lemmas
# concatenated with a BERT embedding of the keywords), cluster the vectors
# with KMeans and save the resulting frame.

now = datetime.datetime.now()
print(now)
data_path = os.getcwd() + "/data"
Data = data(data_path)
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
sp = spacy.load('en_core_web_sm')

for fp in os.listdir(data_path):
    print(fp)
    if fp != '.ipynb_checkpoints':
        with open(data_path + "/" + fp, "r") as f:
            df = pd.read_csv(f)

        # Rows missing any of the required columns are dropped.
        df = df[['title', 'author', 'time', 'description', 'body', 'section','month','year']].dropna()
        print(df)

        keywords = []
        nes = []

        for i, (title, description) in enumerate(zip(df['title'], df['description'])):
            temp_s = title + ". " + description

            keyword = kw_model.extract_keywords(temp_s, keyphrase_ngram_range=(1,4), stop_words=None, use_mmr=True, diversity=0.1)
            # extract_keywords yields (phrase, score) pairs; keep the phrases only.
            keyword = " ,".join([word[0] for word in keyword])
            keywords.append(keyword)

            # Lemmas of the entities whose label is one of ne_type.
            ne = [e.lemma_ for e in sp(description).ents if e.label_ in ne_type]
            nes.append(ne)

            # Progress log every 100 rows.
            if i % 100 == 0:
                now = datetime.datetime.now()
                print(now)
                print(i)
        df['keyword'] = keywords
        df['ne'] = nes

        bow = BoW()
        for d in nes:
            # Register the whole entity list of the row.  The previous ``d[0]``
            # passed only the first lemma (so its characters were registered)
            # and raised IndexError for rows without entities; make_vec()
            # below then failed on the unregistered lemmas.
            bow.add_dic(d)

        vectors = []
        for ne, key in zip(nes, keywords):
            ne_outputs = bow.make_vec(ne)
            inputs = tokenizer(key, return_tensors='pt')
            # Inference only: without no_grad each stored vector would keep
            # the autograd graph of its forward pass alive.
            with torch.no_grad():
                key_outputs = model(**inputs).last_hidden_state[0][0]
                vector = torch.cat((ne_outputs,key_outputs),0)
            vectors.append(vector)

        # Store NumPy arrays (as make_vector does) rather than torch tensors.
        vectors = [v.detach().numpy() for v in vectors]
        df['vector'] = vectors
        kmeans = KMeans(n_clusters=20)
        kmeans.fit(vectors)
        df['cluster'] = kmeans.labels_

        # Save the processed frame.  This used to call ``data.to_csv``, i.e.
        # an attribute of the ``data`` class, which does not exist.
        # NOTE(review): the path has no separator, so the file is written
        # next to the data directory as "data<filename>" instead of inside
        # it.  Left unchanged so the input CSVs are not overwritten; confirm
        # the intended destination.
        df.to_csv(data_path + fp)


2021-06-19 01:16:40.852222
koreaherald_2017.csv
                                                  title  author  \
0     People's Party members support Ahn's push for ...  Yonhap   
1     [Newsmaker] Panamanian vessel probed over susp...  Yonhap   
2     Hong Kong ship crew questioned in S. Korea for...     AFP   
3     Additional bird flu case confirmed at duck far...  Yonhap   
...                                                 ...     ...   
9121        NK leader set to deliver New Year's address     임정요   
9122  S. Korea's Marine Corps to deploy new guided m...     임정요   
9123  Defense chief stresses new year poses new chal...     임정요   
9124  N. Korea kicks off 2017 with large-scale firew...     임정요   
9125  Acting president stresses security of country ...     임정요   

                     time                                        description  \
0     2017-12-31 16:18:00  The leader of the center-left People's Party g...   
1     2017-12-31 14:55:00  PYEONGTAEK  -- South Korea 

ValueError: Length of values (101) does not match length of index (100)