In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from konlpy.tag import Komoran

import os

from keybert import KeyBERT
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the crawled SBS articles and keep only the rows from position 4282
# onwards (the first retained row in the saved output is dated 2022.10.01).
data = pd.read_csv('./Data/SBS.csv').iloc[4282:]

data.head(1)

Unnamed: 0,title,headline,date,link,content,category,site
4282,"슈팅 0개로 침묵한 손흥민, '북런던 더비' 5∼6점대 평점",잉글랜드 프리미어리그 '북런던 더비'에서 골 침묵을 지킨 손흥민이 현지 매체로부터 ...,2022.10.01,https://news.sbs.co.kr//news/endPage.do?news_i...,잉글랜드 프리미어리그 '북런던 더비'에서 골 침묵을 지킨 손흥민(30·토트넘)이...,스포츠,sbs


In [3]:
# Read the stopword table and flatten its `noun` column into a plain list.
stopwords_frame = pd.read_csv('./Data/stopwords.csv')
stopwords = stopwords_frame['noun'].tolist()

len(stopwords)

9584

In [4]:
# Replace each 'YYYY.MM.DD' date string with its month part ('MM').
# NOTE(review): this overwrites the column in place, so running the cell a
# second time splits the already-shortened value and no longer finds index 1.
data['date'] = data['date'].str.split('.').str.get(1)

In [5]:
# Select the articles whose month string is '10' (October).
articles_by_month = data.groupby('date')
data_month = articles_by_month.get_group('10')
len(data_month)

6552

In [6]:
# Distinct category labels present in the selected month, in order of first appearance.
categories = data_month['category'].unique()

len(categories)
# Labels seen (not in this order): society, world, sports, life & culture,
# economy, politics, entertainment

7

In [7]:
# Create the KoNLPy Komoran tagger once; a later cell calls komoran.pos() on every article body.
komoran = Komoran()

In [8]:
# Collect the article bodies of each category, in the order of `categories`.
# The frame is grouped once up front instead of once per category.
content_by_category = data_month.groupby('category')['content']

datas = []
category_found = []

for category in categories:
    try:
        datas.append(content_by_category.get_group(category))
    except KeyError:
        # The label has no group (e.g. a missing/NaN category, which groupby
        # leaves out by default), so there is nothing to collect for it.
        category_found.append(False)
    else:
        category_found.append(True)

# Later cells pair keywords with labels purely by position, so drop every
# skipped label here; otherwise one skip would shift all following labels.
categories = categories[np.array(category_found, dtype=bool)]

In [9]:
# Build, for every category, one string holding the proper nouns of all its
# articles, each noun followed by a single space.

# Membership is tested once per token; a set makes that a hash lookup instead
# of a scan over the whole stopword list.
stopword_set = set(stopwords)

datas_nouns = []

for contents in datas:
    kept_nouns = []

    for text in contents:
        # A missing article body is not a string (pandas reads it as NaN) and
        # a blank one has nothing to tag, so neither is sent to the tagger.
        if not isinstance(text, str) or not text.strip():
            continue

        for pos in komoran.pos(text):
            # pos[0] is the token text and pos[-1] its tag.
            # Tokens shorter than three characters are ignored.
            if len(pos[0]) < 3:
                continue

            # Keep only proper nouns ('NNP') that are not stopwords.
            if pos[-1] == 'NNP' and pos[0] not in stopword_set:
                kept_nouns.append(pos[0])

    # Joined once at the end rather than growing a string token by token;
    # the trailing space after every noun is kept as before.
    datas_nouns.append(''.join(noun + ' ' for noun in kept_nouns))

In [10]:
# Keyword extractor backed by the 'bert-base-nli-mean-tokens' checkpoint.
# Alternative checkpoint left over from experimentation: 'distilbert-base-nli-mean-tokens'.
# NOTE(review): the checkpoint name suggests English NLI training while the
# corpus is Korean; confirm whether a multilingual checkpoint fits better.

model = KeyBERT('bert-base-nli-mean-tokens')

In [11]:
# Extract up to 20 single-word keywords per category text, with MMR enabled.
# Each entry is the list that extract_keywords returns for one category.
datas_keywords = [
    model.extract_keywords(
        category_text,
        keyphrase_ngram_range=(1, 1),
        top_n=20,
        use_mmr=True,
    )
    for category_text in datas_nouns
]

In [12]:
# Flatten the per-category keyword lists into rows of
# (month, category, keyword, importance). Categories and keyword lists are
# paired by position, and the month is fixed to '10'.
result_keywords = [
    ('10', category_label) + keyword
    for category_label, category_keywords in zip(categories, datas_keywords)
    for keyword in category_keywords
]

In [13]:
# Turn the flat keyword rows into a table with one named column per tuple field.
result_columns = ['month', 'category', 'keyword', 'importance']
result = pd.DataFrame(data=result_keywords, columns=result_columns)

result

Unnamed: 0,month,category,keyword,importance
0,10,스포츠,한국프로축구연맹,0.8569
1,10,스포츠,사우스캐롤라이나,0.8564
2,10,스포츠,고척스카이돔,0.8490
3,10,스포츠,챔피언스리그,0.8480
4,10,스포츠,펜실베이니아주,0.8472
...,...,...,...,...
135,10,연예,라스베이거스,0.8131
136,10,연예,손해배상,0.7969
137,10,연예,블랙박스,0.7887
138,10,연예,대주주,0.7808


In [14]:
# Save the keyword table; index=False keeps the row index out of the file.
result.to_csv('./Data/keywords(month).csv', index=False)

In [15]:
import pymysql

In [16]:
# Reload the saved keyword table from disk; this is the frame inserted into the database below.
keywords = pd.read_csv('./Data/keywords(month).csv')

keywords

Unnamed: 0,month,category,keyword,importance
0,10,스포츠,한국프로축구연맹,0.8569
1,10,스포츠,사우스캐롤라이나,0.8564
2,10,스포츠,고척스카이돔,0.8490
3,10,스포츠,챔피언스리그,0.8480
4,10,스포츠,펜실베이니아주,0.8472
...,...,...,...,...
135,10,연예,라스베이거스,0.8131
136,10,연예,손해배상,0.7969
137,10,연예,블랙박스,0.7887
138,10,연예,대주주,0.7808


In [17]:
# Parameterised INSERT for the keywords_month table. The four %s placeholders
# are filled from the parameter tuple passed to cursor.execute, not by string formatting.
sql_insert = 'INSERT INTO keywords_month (month, category, keyword, importance) VALUES (%s, %s, %s, %s)'

In [18]:
# Insert every keyword row into MySQL in a single transaction.
#
# Connection settings can be supplied through the DINS_DB_* environment
# variables so credentials need not live in the notebook. The literal
# fallbacks only preserve the previous behaviour when nothing is set.
# NOTE(review): re-running this cell inserts the same rows again unless the
# table enforces uniqueness; confirm against the schema.
conn = pymysql.connect(
    host=os.environ.get('DINS_DB_HOST', 'localhost'),
    user=os.environ.get('DINS_DB_USER', 'root'),
    password=os.environ.get('DINS_DB_PASSWORD', '1234'),
    db=os.environ.get('DINS_DB_NAME', 'dins'),
    charset='utf8',
)

# One parameter tuple per row, in the column order of sql_insert.
insert_rows = [
    (row.month, row.category, row.keyword, row.importance)
    for index, row in keywords.iterrows()
]

try:
    with conn.cursor() as cursor:
        # A single batched call instead of one execute per row.
        cursor.executemany(sql_insert, insert_rows)
    conn.commit()
except Exception:
    # Leave the table untouched if any row fails, then surface the error.
    conn.rollback()
    raise
finally:
    # Always release the connection, including when the insert fails.
    conn.close()

In [106]:
# keywords = model.extract_keywords(contents_nouns, keyphrase_ngram_range=(1,1), top_n=15, use_mmr=True)

# keywords