In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from transformers import BertTokenizer, BertModel
import torch
import pandas as pd
import ast

In [2]:
import json
import numpy as np
import os
import datetime
from keybert import KeyBERT
import spacy
import re

# match left and right single quotes
single_quote_expr = re.compile(r'[\u2018\u2019]', re.U)
# match all non-basic latin unicode
unicode_chars_expr = re.compile(r'[\u0080-\uffff]', re.U)
ne_type = ['ORG','GPE','PERSON','NORP']

def cleanse_unicode(s):
    if not s:
        return ""

    temp = single_quote_expr.sub("'", s, re.U)
    temp = unicode_chars_expr.sub("", temp, re.U)
    return temp

class data():
    def __init__(self, pt):
        self.path = pt
        self.file_list = os.listdir(pt)



now = datetime.datetime.now()
print(now)
data_path = os.getcwd() + "/data"
Data = data(data_path)
kw_model = KeyBERT('distilbert-base-nli-mean-tokens')
sp = spacy.load('en_core_web_sm')

for fp in Data.file_list:
    with open(Data.path +"/" + fp, 'r') as f:
        df=pd.read_csv(f)
    
    df = df[['title', 'author', 'time', 'description', 'body', 'section','month','year']].dropna()
    print(df)
    
    keywords = []
    nes = []

    for i in range(df.shape[0]):
        a = df.iloc[i]
        temp_s = a['title'] + ". " + a['description']
        temp_s = temp_s.replace("[Newsmaker]","").replace("[Weekender]","").replace("(Yonhap)","")
        # temp = []
        # for j in range(1,6):
        keyword = kw_model.extract_keywords(temp_s, keyphrase_ngram_range=(1,4), stop_words=None, use_mmr=True, diversity=0.1)
        keyword = " ,".join([word[0] for word in keyword])
        # print(keyword)
        # temp = temp + keyword
        keywords.append(keyword)

        ne = [sp(a['description'])]
        ne = [(e.text, e.lemma_, e.label_) for entities in ne for e in entities.ents]
        ne = [n[1] for n in ne if n[2] in ne_type]
        nes.append(ne)

        if i % 100 == 0:
            now = datetime.datetime.now()
            print(now)
            print(i)

    df['keyword'] = keywords
    df['ne'] = nes
    print(df)
    df.to_csv(fp +"_ver_1.csv")




2021-05-20 17:36:45.433180
0       The leader of the center-left People's Party g...
1       PYEONGTAEK  -- South Korea has seized and insp...
2       The crew of a Hong Kong-registered ship have b...
3       South Korea has confirmed a fresh case of avia...
                              ...                        
9121    North Korean leader Kim Jong-un is set to deli...
9122    South Korea's Marine Corps will get new guided...
9123    South Korea's defense chief stressed Sunday th...
9124    North Korea kicked off the new year with a lar...
9125    South Korea's acting President and Prime Minis...
Name: description, Length: 9123, dtype: object


KeyError: 'title'

In [2]:

def revise_word(a):
    if "www" in a:
        return None
    return a.replace("'s","").rstrip()

class BoW():
    def __init__(self):
        self.dic = {}
        self.size = 0

    def add_dic(self,words):
        for word in words:
            word = revise_word(word)
            if word in self.dic or word is None:
                pass
            else:
                self.dic[word] = self.size
                self.size = self.size + 1

    def make_vec(self,words):
        shape = (self.size,)
        zeros_tensors = torch.zeros(shape)
        for word in words:
            word = revise_word(word)
            if word in self.dic:
                zeros_tensors[self.dic[word]] += 1
            elif word is None:
                pass
            else:
                raise Exception("왜 딕셔너리에 없죠?")
        return zeros_tensors


In [4]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
bow = BoW()

data = pd.read_csv("./Data0_ver_1.csv")

dumps = data[:]['ne']
dic = []
for d in dumps:
    d = ast.literal_eval(d)
    bow.add_dic(d)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
vectors = []
for i in range(data.shape[0]):
    a = data.iloc[i]

    ne = ast.literal_eval(a["ne"])
    ne_outputs = bow.make_vec(ne)


    key = a["keyword"]
    inputs = tokenizer(key, return_tensors='pt')
    key_outputs = model(**inputs).last_hidden_state[0][0]
    vector = torch.cat((ne_outputs,key_outputs),0)
    vectors.append(vector)

data['vector'] = vectors

In [6]:
print(data)

      Unnamed: 0                                              title  \
0              0      A snapshot of multiculturalism in South Korea   
1              1                   [Weekender] Korea’s dynamic 2017   
2              2  People's Party members support Ahn's push for ...   
3              3  [Newsmaker] Panamanian vessel probed over susp...   
4              4  Hong Kong ship crew questioned in S. Korea for...   
...          ...                                                ...   
2995        2995                 Eyes on validity of Samsung merger   
2996        2996  Number of visitors to overpass-turned-park top...   
2997        2997  N. Korean propaganda leaflets on ‘missile succ...   
2998        2998  Row over smoking indoors ends in attempted murder   
2999        2999  [Newsmaker] Ahn elected new People's Party leader   

             author                 time  \
0     Lee Sun-young  2018-01-01 17:07:00   
1       Choi He-suk  2018-01-01 13:22:00   
2            Yo

In [61]:
import numpy as np
from kmeans_pytorch import kmeans

cluster_ids_x, cluster_centers = kmeans(X=torch.stack(vectors),num_clusters = 10, distance='euclidean',device=torch.device('cuda:1'))

[running kmeans]: 0it [00:00, ?it/s, center_shift=3608.985596, iteration=1, tol=0.000100]

running k-means on cuda:1..


[running kmeans]: 28it [00:02, 10.44it/s, center_shift=0.000000, iteration=29, tol=0.000100]

In [9]:
data['cluster'] = cluster_ids_x

NameError: name 'cluster_ids_x' is not defined

In [63]:
print(data)

      Unnamed: 0                                              title  \
0              0      A snapshot of multiculturalism in South Korea   
1              1                   [Weekender] Korea’s dynamic 2017   
2              2  People's Party members support Ahn's push for ...   
3              3  [Newsmaker] Panamanian vessel probed over susp...   
4              4  Hong Kong ship crew questioned in S. Korea for...   
...          ...                                                ...   
2995        2995                 Eyes on validity of Samsung merger   
2996        2996  Number of visitors to overpass-turned-park top...   
2997        2997  N. Korean propaganda leaflets on ‘missile succ...   
2998        2998  Row over smoking indoors ends in attempted murder   
2999        2999  [Newsmaker] Ahn elected new People's Party leader   

             author                 time  \
0     Lee Sun-young  2018-01-01 17:07:00   
1       Choi He-suk  2018-01-01 13:22:00   
2            Yo

In [8]:

for k in range(10):
    df_0 = data.loc[data['cluster'] == k]

    df_des = list(df_0[' description'][:])

    vocab = {}

    n = 3

    for doc in df_des:
        if type(doc) == type('str'):
            a = [w.replace(".","").replace(",","").replace("!","").replace("?","").replace("'","").replace('"',"").replace("\n","") for w in doc.split()]
            for i in range(len(a)-n+1):
                if n != 1:
                    word = " ".join(a[i:i+n])
                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1

                    
    vocab = vocab.items()

    vocab = sorted(vocab,key=lambda x:x[1], reverse=True)
    
    print(vocab[0:5])
    
    n = 5
    vocab = {}
    for doc in df_des:
        if type(doc) == type('str'):
            a = [w.replace(".","").replace(",","").replace("!","").replace("?","").replace("'","").replace('"',"").replace("\n","") for w in doc.split()]
            for i in range(len(a)-n+1):
                if n != 1:
                    word = " ".join(a[i:i+n])
                if word in vocab:
                    vocab[word] += 1
                else:
                    vocab[word] = 1
                    
    vocab = vocab.items()

    vocab = sorted(vocab,key=lambda x:x[1], reverse=True)

    print(vocab[0:5])
    print("------------------------------")
 


AttributeError: type object 'data' has no attribute 'loc'