In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import ast
import json
import numpy as np
import os
import datetime
from keybert import KeyBERT
import spacy
import re

In [2]:
GPU_NUM = 1  # index of the GPU to use
# Fall back to the CPU when no CUDA runtime is available.
device = torch.device(f'cuda:{GPU_NUM}' if torch.cuda.is_available() else 'cpu')

# Additional Infos
if device.type == 'cuda':
    # Everything below is CUDA-only: set_device() rejects a CPU device and
    # current_device() fails without a GPU, so it must stay inside this guard.
    torch.cuda.set_device(device) # change allocation of current GPU
    print ('Current cuda device ', torch.cuda.current_device()) # check
    print(torch.cuda.get_device_name(GPU_NUM))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(GPU_NUM)/1024**3,1), 'GB')
    # memory_reserved() replaces the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(GPU_NUM)/1024**3,1), 'GB')
else:
    print('CUDA is not available; running on CPU')

Current cuda device  1
GeForce RTX 2070 SUPER
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


GeForce RTX 3090 with CUDA capability sm_86 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.
If you want to use the GeForce RTX 3090 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



In [13]:
#functions and models for clustering

def revise_word(a):
    """Normalise a single token before it is stored in or looked up from the vocabulary.

    Args:
        a: The raw token (a string).

    Returns:
        None when the token contains "www"; otherwise the token with every
        "'s" occurrence removed and trailing whitespace stripped.
    """
    if "www" not in a:
        without_possessive = a.replace("'s","")
        return without_possessive.rstrip()
    return None

class BoW():
    """Incrementally built bag-of-words vocabulary that turns token lists into count vectors.

    Attributes are documented inline in ``__init__``. Tokens are always passed
    through ``revise_word`` first; tokens it maps to None are ignored.
    """

    def __init__(self):
        # token -> column index in the count vector
        self.dic = {}
        # number of distinct tokens registered so far (also the next free index)
        self.size = 0

    def add_dic(self,words):
        """Register every not-yet-known, normalised token of ``words`` in the vocabulary.

        Args:
            words: Iterable of raw tokens.
        """
        for raw in words:
            token = revise_word(raw)
            # Skip tokens that are already indexed or were rejected by revise_word
            if token in self.dic or token is None:
                continue
            self.dic[token] = self.size
            self.size += 1

    def make_vec(self,words):
        """Count the occurrences of each vocabulary token in ``words``.

        Args:
            words: Iterable of raw tokens.

        Returns:
            A float64 torch tensor of shape ``(self.size,)`` holding the
            per-token counts.

        Raises:
            Exception: If a normalised token is not None and is missing from
                the vocabulary.
        """
        counts = torch.zeros((self.size,), dtype=torch.float64)
        with torch.no_grad():
            for raw in words:
                token = revise_word(raw)
                if token in self.dic:
                    counts[self.dic[token]] += 1
                    continue
                if token is not None:
                    raise Exception("No data in dictionary")
        return counts

# The tokenizer and the encoder are loaded from the same pretrained checkpoint
# so that token ids line up with the model's embedding table.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

# Years of the Korea Herald dumps; later cells iterate over this list as well.
year = [2015,2016,2017]

# Fixed seed so the k-means cluster ids are reproducible across kernel restarts.
KMEANS_SEED = 0

for y in year:
    # make_vector is defined outside this cell. It is expected to return the
    # article DataFrame plus one torch tensor per row -- TODO confirm.
    data,vectors = make_vector('koreaherald_{}_30_ver_1.csv'.format(y))

    # scikit-learn works on numpy arrays, so detach the tensors from autograd first.
    vectors = [v.detach().numpy() for v in vectors]
    kmeans = KMeans(n_clusters=30, random_state=KMEANS_SEED)
    kmeans.fit(vectors)
    print(kmeans.labels_)

    # Store the cluster id of every article next to its row and persist the result.
    data['cluster'] = kmeans.labels_
    data.to_csv("koreaherald_{}".format(y) +"_30_ver_2.csv")



In [65]:
from tqdm import tqdm

# Input folder with one clustered CSV per year, and the output root that
# receives one sub-folder per year with one CSV per cluster.
data_path = os.getcwd() + '/data'
data_path_30 = os.getcwd() + "/data/sub_cluster"
for y in tqdm(year, desc= 'year'):
    fp = data_path + '/koreaherald_{}.csv'.format(y)
    with open(fp, "r") as f:
        df = pd.read_csv(f)
    # Keep only the columns used downstream and drop rows with any missing value.
    df = df[['title', 'description', 'body','keyword','ne','cluster','vector','author','section','month','year']].dropna()

    # to_csv does not create missing folders, so make sure the per-year
    # output directory exists before writing into it.
    out_dir = data_path_30 + "/{}".format(y)
    os.makedirs(out_dir, exist_ok=True)

    # Sorted so files are written in a deterministic order.
    for i in tqdm(sorted(set(df['cluster'].tolist())), desc='cluster'):
        d = df[df.cluster == i]
        d.to_csv(out_dir + '/korea_herald_c_{}.csv'.format(i))

year:   0%|          | 0/3 [00:00<?, ?it/s]
cluster:   0%|          | 0/13 [00:00<?, ?it/s][A
cluster:   8%|▊         | 1/13 [00:00<00:01,  6.60it/s][A
cluster: 100%|██████████| 13/13 [00:00<00:00, 37.26it/s][A
year:  33%|███▎      | 1/3 [00:00<00:01,  1.86it/s]
cluster:   0%|          | 0/18 [00:00<?, ?it/s][A
cluster:   6%|▌         | 1/18 [00:00<00:01,  9.77it/s][A
cluster:  44%|████▍     | 8/18 [00:00<00:00, 41.32it/s][A
cluster: 100%|██████████| 18/18 [00:00<00:00, 56.15it/s][A
year:  67%|██████▋   | 2/3 [00:01<00:00,  1.95it/s]
cluster:   0%|          | 0/15 [00:00<?, ?it/s][A
cluster:  20%|██        | 3/15 [00:00<00:00, 18.96it/s][A
cluster: 100%|██████████| 15/15 [00:00<00:00, 40.39it/s][A
year: 100%|██████████| 3/3 [00:01<00:00,  1.86it/s]


In [68]:
# Re-extract keywords and named entities from the article body of every
# per-cluster CSV and write them back as the columns 'keyword_e' and 'ne_e'.
# kw_model, sp and ne_type are defined outside this cell; kw_model is
# presumably a KeyBERT instance and sp a spaCy pipeline -- TODO confirm.
for y in year:
    p = data_path_30 + '/{}'.format(y)
    for fp in os.listdir(p):
        # The folder can contain non-CSV entries (notebook checkpoints, pickled
        # vectors, ...). Opening those as CSV raises UnicodeDecodeError, so
        # only real CSV files are processed.
        if not fp.endswith('.csv'):
            continue
        print(fp)

        with open(p +"/"+ fp) as f:
            df = pd.read_csv(f)
        df = df[['title', 'description', 'body','keyword','ne','cluster','vector','author','section','month','year']].dropna()

        keywords = []
        nes = []

        for i in range(df.shape[0]):
            a = df.iloc[i]
            temp_s = a['body']

            # Each extracted item is indexed with [0] to get the phrase itself.
            keyword = kw_model.extract_keywords(temp_s, keyphrase_ngram_range=(1,5), stop_words=None, use_mmr=True, diversity=0.1)
            keyword = " ,".join([word[0] for word in keyword])
            keywords.append(keyword)

            # Keep (lemma, label) for every entity whose label is in ne_type.
            doc = sp(temp_s)
            ne = [(e.lemma_, e.label_) for e in doc.ents if e.label_ in ne_type]
            nes.append(ne)

            # Coarse progress log: timestamp and row number every 100 articles.
            if i % 100 == 0:
                now = datetime.datetime.now()
                print(now)
                print(i)

        df['keyword_e'] = keywords
        df['ne_e'] = nes
        # Show a preview only; dumping the whole frame floods the cell output.
        print(df.head())
        # Overwrites the input file with the enriched frame.
        df.to_csv(p+"/"+fp)
        

korea_herald_c_0.csv
2021-06-08 22:47:23.822126
0
2021-06-08 22:47:55.778554
100
2021-06-08 22:48:32.135413
200
2021-06-08 22:49:09.158398
300
2021-06-08 22:49:47.746098
400
2021-06-08 22:50:24.344144
500
2021-06-08 22:51:03.374763
600
2021-06-08 22:51:40.183689
700
2021-06-08 22:52:16.306163
800
2021-06-08 22:52:54.219243
900
2021-06-08 22:53:32.670643
1000
2021-06-08 22:54:11.292297
1100
2021-06-08 22:54:48.206920
1200
2021-06-08 22:55:28.664107
1300
2021-06-08 22:56:05.993895
1400
2021-06-08 22:56:41.986362
1500
2021-06-08 22:57:20.229565
1600
2021-06-08 22:57:54.823927
1700
2021-06-08 22:58:29.632862
1800
2021-06-08 22:59:07.565628
1900
2021-06-08 22:59:43.936230
2000
2021-06-08 23:00:21.986753
2100
2021-06-08 23:00:59.789174
2200
2021-06-08 23:01:40.528509
2300
2021-06-08 23:02:20.542356
2400
2021-06-08 23:03:00.228203
2500
2021-06-08 23:03:37.170879
2600
                                                  title  \
0                     Inter-Korean ties face rocky road   
1        

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x95 in position 11: invalid start byte

In [71]:
import csv

# Second-level clustering: run k-means inside every per-cluster CSV and store
# the resulting sub-cluster id in the column 'cluster_e'.
# The row loop that used to precede the file loop was removed: it evaluated
# bow.make_vec() on whatever `data` was left over from an earlier cell and
# discarded the result, so it only added a dependency on stale kernel state.
for y in year:
    p = data_path_30 + '/{}'.format(y)
    for fp in os.listdir(p):
        # Skip notebook checkpoints, pickled vectors and anything else that
        # is not a CSV file.
        if not fp.endswith('.csv'):
            continue
        print(fp)

        # make_vector is defined outside this cell. It is expected to return
        # the article DataFrame plus one torch tensor per row -- TODO confirm.
        data,vectors = make_vector(p+ '/' + fp)

        vectors = [v.detach().numpy() for v in vectors]
        if len(vectors) == 0:
            # Nothing to cluster in this file.
            continue

        # KMeans needs at least as many samples as clusters, so cap the
        # cluster count for small files. The fixed seed keeps the assignment
        # reproducible between runs.
        kmeans = KMeans(n_clusters=min(20, len(vectors)), random_state=0)
        kmeans.fit(vectors)
        print(kmeans.labels_)

        data['cluster_e'] = kmeans.labels_

        # Overwrites the input file with the extended frame.
        data.to_csv(p + '/' + fp)

korea_herald_c_0.csv
[ 8 17  3 ...  5 19  3]


In [88]:
import pickle
# For every year: build one bag-of-words vocabulary over the extracted named
# entities of all cluster files, then encode each article as
# [entity count vector | BERT embedding of its keyword string] and pickle the
# list of vectors next to the CSV (same name with a .txt extension).
for y in year:
    p = data_path_30 + '/{}'.format(y)
    columns = ['title', 'description', 'body','keyword','ne','cluster','vector','author','section','month','year','ne_e','keyword_e']
    # List the folder once so both passes see exactly the same CSV files.
    csv_files = [name for name in os.listdir(p) if name.endswith('.csv')]

    # Pass 1: collect the vocabulary. It must be complete before any vector is
    # built, because make_vec raises on tokens missing from the dictionary.
    bow = BoW()
    for fp in csv_files:
        print(fp)
        with open(p +"/"+ fp) as f:
            df = pd.read_csv(f)
        df = df[columns].dropna()
        for d in df['ne_e']:
            # 'ne_e' was written to CSV as the string repr of a Python list.
            bow.add_dic(ast.literal_eval(d))

    # Pass 2: encode every article and dump the vectors per file.
    for fp in csv_files:
        print(fp)
        vectors = []
        with open(p +"/"+ fp) as f:
            df = pd.read_csv(f)
        df = df[columns].dropna()
        for i in range(df.shape[0]):
            a = df.iloc[i]

            ne = ast.literal_eval(a["ne_e"])
            ne_outputs = bow.make_vec(ne)

            key = a["keyword"]
            # Truncate so an overly long keyword string cannot exceed the
            # maximum sequence length of the model.
            inputs = tokenizer(key, return_tensors='pt', truncation=True)
            # Inference only: run the forward pass without autograd so no
            # graph is built or kept alive for each article.
            with torch.no_grad():
                # Hidden state of the first token of the only sequence in the batch.
                key_outputs = model(**inputs).last_hidden_state[0][0]
                vector = torch.cat((ne_outputs,key_outputs),0)
            vectors.append(vector)

        # Swap only the extension instead of replacing every 'csv' substring.
        out_name = os.path.splitext(fp)[0] + '.txt'
        with open(p+"/"+out_name , 'wb') as lf:
            pickle.dump(vectors, lf)
        

korea_herald_c_0.csv
korea_herald_c_9.csv
korea_herald_c_7.csv
korea_herald_c_14.csv
korea_herald_c_29.csv
korea_herald_c_1.csv
korea_herald_c_10.csv
korea_herald_c_3.csv
korea_herald_c_8.csv
korea_herald_c_2.csv
korea_herald_c_28.csv
korea_herald_c_17.csv
korea_herald_c_5.csv
korea_herald_c_0.csv
korea_herald_c_9.csv
korea_herald_c_7.csv
korea_herald_c_14.csv
korea_herald_c_29.csv
korea_herald_c_1.csv
korea_herald_c_10.csv
korea_herald_c_3.csv
korea_herald_c_8.csv
korea_herald_c_2.csv
korea_herald_c_28.csv
korea_herald_c_17.csv
korea_herald_c_5.csv
korea_herald_c_26.csv
korea_herald_c_4.csv
korea_herald_c_18.csv
korea_herald_c_0.csv
korea_herald_c_9.csv
korea_herald_c_7.csv
korea_herald_c_14.csv
korea_herald_c_15.csv
korea_herald_c_10.csv
korea_herald_c_27.csv
korea_herald_c_8.csv
korea_herald_c_19.csv
korea_herald_c_2.csv
korea_herald_c_28.csv
korea_herald_c_5.csv
korea_herald_c_21.csv
korea_herald_c_16.csv
korea_herald_c_25.csv
korea_herald_c_26.csv
korea_herald_c_4.csv
korea_herald