#### import

In [1]:
import os
%matplotlib inline
import sys
from collections import Counter
from typing import List
import csv
import re
# TextRank
from gensim.summarization.summarizer import summarize
# kiwi: Tokenizer
from kiwipiepy import Kiwi
# keyBert
from keybert import KeyBERT
from transformers import BertModel
from tqdm.notebook import tqdm

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Make the parent of the current working directory, and its `crawling`
# sub-directory, importable (the `crawling` package is imported further down).
# os.path handles the platform's separator; splitting on a literal backslash
# only worked on Windows and produced an empty path everywhere else.
module_path = os.path.dirname(os.getcwd())
sys.path.append(module_path)
sys.path.append(os.path.join(module_path, 'crawling'))

### data 전처리

#### Tokenize

In [None]:
# Shared Kiwi analyzer instance used by noun_extractor.
kiwi = Kiwi()


def noun_extractor(text):
    """Return the token strings of *text* that pass the noun / 'SL' tag filter.

    Only the first candidate of ``kiwi.analyze(text)`` is used (``[0][0]`` is
    its token sequence). A token is kept when either

    * its tag starts with ``'N'`` and its form is not exactly one character, or
    * its tag starts with ``'SL'`` (presumably Kiwi's tag for Latin-alphabet
      text - confirm against the Kiwi tag table), whatever its length.

    NOTE(review): ``and`` binds tighter than ``or``, so the length check does
    not apply to 'SL' tokens. The parentheses below only make the existing
    behaviour explicit; confirm that is the intended filter.
    """
    best_tokens = kiwi.analyze(text)[0][0]
    return [
        form
        for form, tag, _, _ in best_tokens
        if (len(form) != 1 and tag.startswith('N')) or tag.startswith('SL')
    ]

#### 불용어(stopwords) 제거

##### stopwords: list

In [None]:
# Base stop-word list.
# NOTE(review): in the cells shown here nothing consumes this list
# (keyword_extractor passes stop_words=None) - confirm where it is applied.
stopwords = [
    # Channel entries: these appear to be blog/company channel names, mostly
    # paired with an ASCII identifier for the same channel (assumption based
    # on the entries themselves - verify against the crawler's channel list).
    "11번가",
    "11st-tech",
    "29CM",
    "29cm",
    "AWS",
    "amazon",
    "class101",
    "GS리테일",
    "gsretail",
    "NHN",
    "nhncloud",
    "ToastUI",
    "네이버D2",
    "d2",
    "naver",
    "네이버클라우드",
    "naver-cloud-platform",
    "네이버플레이스",
    "naver-place-dev",
    "넷마블",
    "netmarble",
    "다나와",
    "danawalab",
    "당근마켓",
    "daangn",
    "데보션",
    "devocean",
    "sk",
    "데브시스터즈",
    "devsisters",
    "드라마앤컴퍼니",
    "dramancompany",
    "라인",
    "linecorp",
    "레모네이드",
    "lemonade-engineering",
    "롯데on",
    "lotteon",
    "루닛",
    "lunit",
    "마이리얼트립",
    "myrealtrip-product",
    "메가존클라우드",
    "ctc-mzc",
    "무신사",
    "musinsa",
    "버즈빌",
    "buzzvil",
    "브랜디",
    "brandi",
    "사람인",
    "saramin",
    "숨고",
    "soomgo",
    "스마일게이트AI",
    "smilegate",
    "스케터랩",
    "scatterlab",
    "스타일쉐어",
    "styleshare",
    "쏘카",
    "socarcorp",
    "아이디어스",
    "idus",
    "야놀자",
    "yanolja",
    "야놀자클라우드",
    "yanoljacloud-tech",
    "엔라이즈",
    "nrise",
    "여기어때",
    "gccompany",
    "오일나우",
    "왓챠",
    "watcha",
    "요기요",
    "yogiyo",
    "우아한형제들",
    "woowahan",
    "원티드",
    "wantedjobs",
    "지마켓",
    "gmarket",
    "직방",
    "zigbang",
    "카카오",
    "kakao",
    "카카오엔터프라이즈",
    "kakaoenterprise",
    "카카오페이",
    "kakaopay",
    "컬리",
    "kurly",
    "코인원",
    "coinone",
    "쿠팡",
    "coupang-engineering",
    "크몽",
    "kmong",
    "클라우드메이트",
    "cloudmt",
    "테이블링",
    "tabling",
    "토스",
    "toss",
    "포스타입",
    "postype",
    "하이퍼커넥트",
    "hyperconnect",
    "헤이딜러",
    "prnd",
    "화해",
    "hwahae",
]

# Append additional generic stop words to the base list in place.
# NOTE(review): '버즈빌' is also present in the base list, so it ends up in
# `stopwords` twice.
stopwords += [
    '사용', '개발자', '사용자', '기반', '프로젝트', '이용', '코드', '기술',
    '서비스', '활용', '적용', '개발', '소개', '실행', '안녕하세요', 'line',
    '진행', '관리', '과정', 'developer', '비즈니스', '프로그래밍', '회사',
    '소프트웨어', '유저', '프로그램', 'user', 'service', '포스팅', 'tech',
    '다운로드', '저장소', 'google', '동료', 'project', '방법', 'japanese',
    'code', '효율', '도구', '공유', '코딩', '관련', '제품', '이미지', '화면',
    '얘기', '접근', '광고', '리멤버', '대화', '조직', '링크', '컴퍼니', '오픈',
    '엔지니어', '문서', 'post', '태그', 'hwang', '예산', '가시', '바탕',
    'part', '자료', '버즈빌', '스타트업',
]

### keyword 추출

In [None]:
# KeyBERT wrappers keyed by checkpoint name, so the BERT weights are loaded
# once per checkpoint instead of once per document.
_KEYBERT_CACHE = {}


def _load_keybert(bert: str):
    """Return the KeyBERT wrapper for checkpoint *bert*, building it on first use."""
    kw_model = _KEYBERT_CACHE.get(bert)
    if kw_model is None:
        # Local import keeps this cell self-contained; the model returned by
        # BertModel.from_pretrained already depends on torch for `.cuda()`.
        import torch

        model = BertModel.from_pretrained(bert)
        # Only move to the GPU when one exists; an unconditional .cuda()
        # raises on CPU-only machines.
        if torch.cuda.is_available():
            model = model.cuda()
        # NOTE(review): confirm that this KeyBERT version accepts a raw
        # transformers BertModel as its embedding backend rather than
        # substituting a default model for unrecognised objects.
        kw_model = KeyBERT(model)
        _KEYBERT_CACHE[bert] = kw_model
    return kw_model


def keyword_extractor(bert:str, documents:List[str]): # -> (List[Tuple[str, float]] | List[List[Tuple[str, float]]]):
    """Extract up to 20 single-word keywords from *documents* with KeyBERT.

    Args:
        bert: Checkpoint name or path handed to ``BertModel.from_pretrained``.
        documents: Text to analyse. ``extract_keyword`` in this file passes a
            single space-joined string rather than a list.

    Returns:
        Whatever ``KeyBERT.extract_keywords`` returns for the given input,
        called with unigram key-phrases, no stop-word filtering and
        ``top_n=20``.
    """
    kw_model = _load_keybert(bert)
    return kw_model.extract_keywords(
        documents,
        keyphrase_ngram_range=(1, 1),
        stop_words=None,
        top_n=20,
    )

In [None]:
def extract_keyword(base_path:str, row:List[str]):
    """Extract keywords for the article described by one CSV *row*.

    The article text is read from
    ``base_path + created_date + '/' + <channel>_<title>.txt`` (after the
    file-name clean-up below). Texts longer than 50 lines are summarised
    first, then reduced to the tokens kept by ``noun_extractor`` and passed to
    ``keyword_extractor`` with the ``'skt/kobert-base-v1'`` checkpoint.

    Args:
        base_path: Directory prefix of the content files; concatenated as-is,
            so it must end with a path separator.
        row: CSV row. Reads ``row[1]`` as title, ``row[2]`` as channel,
            ``row[3]`` as created date and ``row[4]`` as whitespace-separated
            categories.
            NOTE(review): the column comment on the CSV loop in this file
            lists created_date before channel - confirm the real order.

    Returns:
        The result of ``keyword_extractor`` for the article, or an empty list
        when the article is skipped because all of its categories are among
        "Design", "Product", "Culture", "Conference".

    Raises:
        OSError: If the content file cannot be opened.
    """
    title = row[1]
    channel = row[2]
    created_date = row[3]
    categories = row[4].split()

    # Posts tagged only with these categories are excluded from analysis.
    # Return an empty list (not None) so callers can iterate the result
    # unconditionally; a bare `continue` here is a SyntaxError because this
    # function body is not inside a loop.
    excluded = {"Design", "Product", "Culture", "Conference"}
    if categories and set(categories) <= excluded:
        return []

    # Raw string avoids the invalid "\/" escape; the pattern is unchanged.
    # NOTE(review): the comma sits outside the character class, so a special
    # character is only stripped when it is immediately followed by ','.
    # Confirm this matches how the content files were named.
    file_name = re.sub(r'[\/:*?"<>|],', "", channel + '_' + title)
    content_path = base_path + created_date + '/' + file_name.replace('&', '_') + '.txt'

    # Context manager closes the handle even if reading fails.
    with open(content_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Only long articles (more than 50 lines) are summarised first.
    if len(text.split('\n')) > 50:
        summary = summarize(text)
    else:
        summary = text

    nouns = noun_extractor(summary)
    doc = ' '.join(nouns)
    return keyword_extractor('skt/kobert-base-v1', doc)

### DB에 저장

In [None]:
from crawling import config


base_path = '../../../data/content/'
csv_path = '../../data/crawling_data.csv'

# The SQL text never changes, so define it once instead of on every row.
select_content_id_sql = """select id from content where source = %s"""
select_tag_id_sql = """select id from tag where name = %s"""
insert_content_tag = """insert into content_tag (content_id, tag_id) values (%s, %s)"""

# The connection is left open on purpose: later cells may reuse conn / curs.
conn = config.connect()
curs = conn.cursor()

documents = []
titles = []

# The context manager closes the CSV file even if a row fails; newline=""
# is what the csv module expects for files it reads.
with open(csv_path, "r", encoding="utf-8", newline="") as f:
    csvReader = csv.reader(f)

    # Column note carried over from the original loop:
    #   source, title, created_date, channel, categories
    # NOTE(review): extract_keyword reads channel from row[2] and
    # created_date from row[3] - confirm the real column order.
    for row in csvReader:
        if not row:
            continue

        # Resolve the content row by its source; skip the CSV row when it is
        # absent instead of failing on `None[0]`.
        curs.execute(select_content_id_sql, (row[0],))
        content_row = curs.fetchone()
        if content_row is None:
            print(f"skip: no content row for source {row[0]!r}")
            continue
        content_id = content_row[0]

        # The tag is looked up by the raw categories column.
        curs.execute(select_tag_id_sql, (row[4],))
        tag_row = curs.fetchone()
        if tag_row is None:
            print(f"skip: no tag row named {row[4]!r}")
            continue
        tag_id = tag_row[0]

        # `or []` tolerates a skipped article regardless of whether
        # extract_keyword signals it with an empty list or None.
        tags = extract_keyword(base_path, row) or []

        # NOTE(review): the extracted keyword itself is never used here - the
        # same (content_id, tag_id) pair is inserted once per keyword. This
        # looks unintended (probably each keyword should be resolved to its
        # own tag id), but the intended schema is not visible, so the
        # behaviour is left unchanged.
        for tag in tags:
            curs.execute(insert_content_tag, (content_id, tag_id))
        print(tag_id)
        conn.commit()