In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from konlpy.tag import Okt

import os

from keybert import KeyBERT
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [64]:
data = pd.read_csv('./Data/SBS.csv')

data.head(1)

Unnamed: 0,title,headline,date,link,content,category,site
0,"이란 남부 원유시설서 화재…업체 대표 ""정체불명 세력이 방화""",이란 남부 원유시설에서 불이 나 한때 생산이 중단됐다고 현지 언론이 13일 보도했습...,2022.09.13,https://news.sbs.co.kr//news/endPage.do?news_i...,이란 남부 원유시설에서 불이 나 한때 생산이 중단됐다고 현지 언론이 13일(현지...,국제,sbs


In [65]:
dates = ['2022.10.24', '2022.10.25', '2022.10.26', '2022.10.27',
         '2022.10.28', '2022.10.29', '2022.10.30']

In [66]:
data_date= []

for date in dates:
    data_date.append(data.groupby('date').get_group(date))

In [67]:
data_week = pd.concat(data_date, axis=0)

In [68]:
len(data_week)

1694

In [70]:
categories = data_week.category.unique()

len(categories)
# '사회', '국제', '스포츠', '생활·문화', '경제', '정치', '연예'

7

In [71]:
okt = Okt()

In [72]:
datas = []

for category in categories:
    try:
        datas.append(data_week.groupby('category').get_group(category).content)
    except (KeyError):
        pass

In [77]:
datas_nouns = []

for contents in datas:
    content_nouns = ''
    
    for text in contents:
        text_nouns = okt.nouns(text)
        
        for noun in text_nouns:
            content_nouns += noun + ' '
            
    datas_nouns.append(content_nouns)

In [78]:
model = KeyBERT('distilbert-base-nli-mean-tokens')

In [79]:
datas_keywords = []

for content_nouns in datas_nouns:
    datas_keywords.append(model.extract_keywords(content_nouns, keyphrase_ngram_range=(1,1), top_n=20, use_mmr=True))

In [80]:
result_keywords = []

category_idx = 0

for keywords in datas_keywords:
    for keyword in keywords:
        result_keywords.append(('2022.10.24', '2022.10.30', categories[category_idx], ) + keyword) # (date_start, date_end, category, keyword, importance)

    category_idx += 1

In [82]:
result = pd.DataFrame(result_keywords, columns=['date_start', 'date_end', 'category', 'keyword', 'importance'])

result

Unnamed: 0,date_start,date_end,category,keyword,importance
0,2022.10.24,2022.10.30,국제,대륙간탄도미사일,0.8705
1,2022.10.24,2022.10.30,국제,원자력발전소,0.8624
2,2022.10.24,2022.10.30,국제,소셜네트워크서비스,0.8542
3,2022.10.24,2022.10.30,국제,독립국가연합,0.8463
4,2022.10.24,2022.10.30,국제,로스앤젤레스,0.8378
...,...,...,...,...,...
135,2022.10.24,2022.10.30,연예,쇼트프로그램,0.7943
136,2022.10.24,2022.10.30,연예,엠씨더맥스,0.7932
137,2022.10.24,2022.10.30,연예,스타크래프트,0.7892
138,2022.10.24,2022.10.30,연예,도로교통법,0.7780


In [83]:
result.to_csv('./Data/keywords(week).csv', index=False)

In [84]:
import pymysql

In [85]:
keywords = pd.read_csv('./Data/keywords(week).csv')

keywords

Unnamed: 0,date_start,date_end,category,keyword,importance
0,2022.10.24,2022.10.30,국제,대륙간탄도미사일,0.8705
1,2022.10.24,2022.10.30,국제,원자력발전소,0.8624
2,2022.10.24,2022.10.30,국제,소셜네트워크서비스,0.8542
3,2022.10.24,2022.10.30,국제,독립국가연합,0.8463
4,2022.10.24,2022.10.30,국제,로스앤젤레스,0.8378
...,...,...,...,...,...
135,2022.10.24,2022.10.30,연예,쇼트프로그램,0.7943
136,2022.10.24,2022.10.30,연예,엠씨더맥스,0.7932
137,2022.10.24,2022.10.30,연예,스타크래프트,0.7892
138,2022.10.24,2022.10.30,연예,도로교통법,0.7780


In [88]:
sql_insert = 'INSERT INTO keywords_week (date_start, date_end, category, keyword, importance) VALUES (%s, %s, %s, %s, %s)'

In [89]:
conn = pymysql.connect(host='localhost', user='root', password='1234', db='dins', charset='utf8')
cursor = conn.cursor()


for index, row in keywords.iterrows():
    cursor.execute(sql_insert, (row.date_start, row.date_end, row.category, row.keyword, row.importance))

conn.commit()
conn.close()

In [106]:
# keywords = model.extract_keywords(contents_nouns, keyphrase_ngram_range=(1,1), top_n=15, use_mmr=True)

# keywords