In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from konlpy.tag import Komoran

import os

from keybert import KeyBERT
import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the crawled SBS articles and keep only the rows from position 4282
# onward (the first kept row is dated 2022.10.01 in the preview below).
data = pd.read_csv('./Data/SBS.csv').iloc[4282:]

data.head(1)

Unnamed: 0,title,headline,date,link,content,category,site
4282,"슈팅 0개로 침묵한 손흥민, '북런던 더비' 5∼6점대 평점",잉글랜드 프리미어리그 '북런던 더비'에서 골 침묵을 지킨 손흥민이 현지 매체로부터 ...,2022.10.01,https://news.sbs.co.kr//news/endPage.do?news_i...,잉글랜드 프리미어리그 '북런던 더비'에서 골 침묵을 지킨 손흥민(30·토트넘)이...,스포츠,sbs


In [5]:
# Stopword nouns are stored in the `noun` column of the stopword CSV.
stopwords = pd.read_csv('./Data/stopwords.csv')['noun'].tolist()

len(stopwords)

9584

In [3]:
# The seven consecutive days (24-30 October 2022) that make up the analysed
# week, written in the same 'YYYY.MM.DD' form as the `date` column.
dates = [f'2022.10.{day}' for day in range(24, 31)]

In [4]:
# Collect one DataFrame per day of the target week, in the order of `dates`.
# The frame is grouped once up front instead of being re-grouped on every
# iteration. `get_group` raises KeyError when a date has no rows, which makes
# a gap in the source data visible immediately.
articles_by_date = data.groupby('date')
data_date = [articles_by_date.get_group(date) for date in dates]

In [6]:
# Stack the per-day frames row-wise into a single frame covering the week.
data_week = pd.concat(data_date)

In [7]:
# Number of rows selected for the week.
data_week.shape[0]

1694

In [8]:
# Distinct category labels present in the week's rows. Missing values are
# dropped first: `groupby('category')` excludes NaN keys by default, so a NaN
# label kept here could never be matched to a group and would break the
# position-based pairing of categories with their keyword lists later on.
categories = data_week.category.dropna().unique()

len(categories)
# Observed labels: '사회', '국제', '스포츠', '생활·문화', '경제', '정치', '연예'

7

In [9]:
# Instantiate the KoNLPy Komoran tagger; its `pos` method is used further down
# to part-of-speech tag every article body.
komoran = Komoran()

In [10]:
# Build one Series of article bodies per category, in the same order as
# `categories`. A boolean mask replaces `groupby(...).get_group(...)` wrapped
# in try/except: a label with no matching rows now yields an empty Series
# instead of being silently skipped, so `datas[i]` always corresponds to
# `categories[i]` (later cells pair the two by position). It also avoids
# rebuilding the groupby on every iteration.
datas = [
    data_week.loc[data_week.category == category, 'content']
    for category in categories
]

In [11]:
# For every category, tag each article with Komoran and keep the tokens tagged
# 'NNP' that are at least three characters long and are not stopwords. The
# kept nouns are emitted as one string per category, each noun followed by a
# single space.
stopword_set = set(stopwords)  # constant-time membership instead of a list scan

datas_nouns = []

for contents in datas:
    kept_nouns = []

    for text in contents:
        if not isinstance(text, str):
            # Skip missing bodies (pandas reads empty CSV cells as float NaN);
            # they cannot be passed to the tagger.
            continue

        for word, tag in komoran.pos(text):
            if len(word) >= 3 and tag == 'NNP' and word not in stopword_set:
                kept_nouns.append(word)

    # Build the string once instead of growing it inside the inner loop; the
    # trailing space after every noun is kept so the text is unchanged.
    datas_nouns.append(''.join(noun + ' ' for noun in kept_nouns))

In [12]:
# Alternative checkpoint, currently disabled:
# model = KeyBERT('distilbert-base-nli-mean-tokens')

# Keyword extractor backed by the 'bert-base-nli-mean-tokens' checkpoint.
# NOTE(review): the text fed to this model is Korean; confirm that this
# checkpoint embeds Korean adequately, or evaluate a multilingual one.
model = KeyBERT('bert-base-nli-mean-tokens')

In [13]:
# Run KeyBERT once per category text: unigram keyphrases only, top 20
# candidates, with MMR enabled.
datas_keywords = [
    model.extract_keywords(
        category_text,
        keyphrase_ngram_range=(1, 1),
        top_n=20,
        use_mmr=True,
    )
    for category_text in datas_nouns
]

In [14]:
# Flatten the per-category keyword lists into rows of
# (date_start, date_end, category, keyword, importance).
# The week boundaries are taken from `dates` rather than re-typed as literals,
# so the rows cannot drift from the dates used to select the articles.
date_start, date_end = dates[0], dates[-1]

result_keywords = []

for category_idx, keywords in enumerate(datas_keywords):
    for keyword in keywords:
        # `keyword` is a tuple appended after the three leading fields; the
        # column list used when building the DataFrame names its two items
        # (keyword, importance).
        result_keywords.append((date_start, date_end, categories[category_idx]) + keyword)

In [15]:
# Turn the flattened rows into a labelled table and display it.
result = pd.DataFrame(
    data=result_keywords,
    columns=[
        'date_start',
        'date_end',
        'category',
        'keyword',
        'importance',
    ],
)

result

Unnamed: 0,date_start,date_end,category,keyword,importance
0,2022.10.24,2022.10.30,국제,국제원자력기구,0.9002
1,2022.10.24,2022.10.30,국제,한국수력원자력,0.8972
2,2022.10.24,2022.10.30,국제,연방준비제도,0.8864
3,2022.10.24,2022.10.30,국제,국제통화기금,0.8850
4,2022.10.24,2022.10.30,국제,석유수출국기구,0.8791
...,...,...,...,...,...
135,2022.10.24,2022.10.30,연예,이상민,0.7982
136,2022.10.24,2022.10.30,연예,스트리밍,0.7916
137,2022.10.24,2022.10.30,연예,항공사,0.7906
138,2022.10.24,2022.10.30,연예,초등학교,0.7891


In [16]:
# Persist the weekly keyword table; `index=False` leaves the row index out so
# the CSV holds only the five named columns.
result.to_csv('./Data/keywords(week).csv', index=False)

In [17]:
import pymysql

In [18]:
# Read the saved weekly keyword CSV back in; this frame is the source of the
# rows inserted into the database below.
keywords = pd.read_csv('./Data/keywords(week).csv')

keywords

Unnamed: 0,date_start,date_end,category,keyword,importance
0,2022.10.24,2022.10.30,국제,국제원자력기구,0.9002
1,2022.10.24,2022.10.30,국제,한국수력원자력,0.8972
2,2022.10.24,2022.10.30,국제,연방준비제도,0.8864
3,2022.10.24,2022.10.30,국제,국제통화기금,0.8850
4,2022.10.24,2022.10.30,국제,석유수출국기구,0.8791
...,...,...,...,...,...
135,2022.10.24,2022.10.30,연예,이상민,0.7982
136,2022.10.24,2022.10.30,연예,스트리밍,0.7916
137,2022.10.24,2022.10.30,연예,항공사,0.7906
138,2022.10.24,2022.10.30,연예,초등학교,0.7891


In [19]:
# Parameterised INSERT for one keyword row; the five %s placeholders are
# filled in by the database driver at execute time.
sql_insert = (
    'INSERT INTO keywords_week '
    '(date_start, date_end, category, keyword, importance) '
    'VALUES (%s, %s, %s, %s, %s)'
)

In [20]:
# Insert every keyword row into MySQL in a single transaction.
#
# Connection settings can be supplied through DINS_DB_* environment variables;
# the fallbacks are the values this notebook previously hard-coded, so the
# cell behaves as before when the variables are unset.
# NOTE(review): drop the password fallback once the environment is configured;
# credentials should not live in the notebook.
# NOTE(review): re-running this cell inserts the same rows again unless the
# table enforces uniqueness (schema not visible here).
conn = pymysql.connect(
    host=os.environ.get('DINS_DB_HOST', 'localhost'),
    user=os.environ.get('DINS_DB_USER', 'root'),
    password=os.environ.get('DINS_DB_PASSWORD', '1234'),
    db=os.environ.get('DINS_DB_NAME', 'dins'),
    charset='utf8',
)

try:
    insert_columns = ['date_start', 'date_end', 'category', 'keyword', 'importance']
    # Plain Python tuples, one per row, in the same order as the placeholders.
    insert_rows = list(keywords[insert_columns].itertuples(index=False, name=None))

    with conn.cursor() as cursor:
        cursor.executemany(sql_insert, insert_rows)

    # Commit only after every row was accepted; on failure nothing is committed.
    conn.commit()
finally:
    # Always release the connection, even when an insert raises.
    conn.close()

In [106]:
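# Leftover scratch cell (commented out, not part of the pipeline above).
# NOTE: `contents_nouns` is not defined in any cell visible in this notebook.
# keywords = model.extract_keywords(contents_nouns, keyphrase_ngram_range=(1,1), top_n=15, use_mmr=True)

# keywords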