#### import

In [5]:
import os
%matplotlib inline
import sys
from collections import Counter
from typing import List
import csv
import re
# TextRank
from gensim.summarization.summarizer import summarize
# kiwi: Tokenizer
from kiwipiepy import Kiwi
# keyBert
from keybert import KeyBERT
from transformers import BertModel
from tqdm.notebook import tqdm

In [6]:
# Make the parent of the current working directory (and its `crawling`
# sub-directory) importable, so that `from crawling import config` further
# down resolves.
#
# The parent directory is computed with os.path.dirname instead of splitting
# on a literal backslash: splitting on "\\" only works on Windows and yields
# an empty string on POSIX systems. Separators are normalised to '/' so the
# resulting string is the same as before on Windows.
module_path = os.path.dirname(os.getcwd()).replace(os.sep, '/')
sys.path.append(module_path)
sys.path.append(module_path + '/crawling')

### data 전처리

#### Tokenize

In [7]:
kiwi = Kiwi()


# Noun extraction helper
def noun_extractor(text):
    """Return the surface forms of the tokens kept from Kiwi's analysis of *text*.

    Only the first analysis candidate (``kiwi.analyze(text)[0][0]``) is used.
    A token is kept when either

    * its tag starts with ``'N'`` and its form is not exactly one character
      long, or
    * its tag starts with ``'SL'`` (kept regardless of length).

    NOTE(review): the 'N*' tags are presumably Kiwi's noun tags and 'SL' its
    foreign-alphabet tag -- confirm against the Kiwi tag set. The grouping
    below mirrors Python's ``and``/``or`` precedence, i.e. the length check
    applies to 'N*' tokens only; confirm that this is the intended filter.
    """
    first_candidate_tokens = kiwi.analyze(text)[0][0]
    return [
        form
        for form, tag, _, _ in first_candidate_tokens
        if (len(form) != 1 and tag.startswith('N')) or tag.startswith('SL')
    ]

#### 불용어(stopwords) 제거

##### stopwords: list

In [8]:
# Words to be treated as stopwords for keyword extraction.
# NOTE(review): in the visible part of this file `stopwords` is only built
# (here and in the `extend` call below) and never passed anywhere --
# `keyword_extractor` calls KeyBERT with `stop_words=None`. Confirm whether it
# is meant to be wired in.
stopwords = [
    # Channel (company blog) names: each display name is followed by what
    # appears to be the matching blog/URL identifier -- TODO confirm.
    "11번가",
    "11st-tech",
    "29CM",
    "29cm",
    "AWS",
    "amazon",
    "class101",
    "GS리테일",
    "gsretail",
    "NHN",
    "nhncloud",
    "ToastUI",
    "네이버D2",
    "d2",
    "naver",
    "네이버클라우드",
    "naver-cloud-platform",
    "네이버플레이스",
    "naver-place-dev",
    "넷마블",
    "netmarble",
    "다나와",
    "danawalab",
    "당근마켓",
    "daangn",
    "데보션",
    "devocean",
    "sk",
    "데브시스터즈",
    "devsisters",
    "드라마앤컴퍼니",
    "dramancompany",
    "라인",
    "linecorp",
    "레모네이드",
    "lemonade-engineering",
    "롯데on",
    "lotteon",
    "루닛",
    "lunit",
    "마이리얼트립",
    "myrealtrip-product",
    "메가존클라우드",
    "ctc-mzc",
    "무신사",
    "musinsa",
    "버즈빌",
    "buzzvil",
    "브랜디",
    "brandi",
    "사람인",
    "saramin",
    "숨고",
    "soomgo",
    "스마일게이트AI",
    "smilegate",
    "스케터랩",
    "scatterlab",
    "스타일쉐어",
    "styleshare",
    "쏘카",
    "socarcorp",
    "아이디어스",
    "idus",
    "야놀자",
    "yanolja",
    "야놀자클라우드",
    "yanoljacloud-tech",
    "엔라이즈",
    "nrise",
    "여기어때",
    "gccompany",
    "오일나우",
    "왓챠",
    "watcha",
    "요기요",
    "yogiyo",
    "우아한형제들",
    "woowahan",
    "원티드",
    "wantedjobs",
    "지마켓",
    "gmarket",
    "직방",
    "zigbang",
    "카카오",
    "kakao",
    "카카오엔터프라이즈",
    "kakaoenterprise",
    "카카오페이",
    "kakaopay",
    "컬리",
    "kurly",
    "코인원",
    "coinone",
    "쿠팡",
    "coupang-engineering",
    "크몽",
    "kmong",
    "클라우드메이트",
    "cloudmt",
    "테이블링",
    "tabling",
    "토스",
    "toss",
    "포스타입",
    "postype",
    "하이퍼커넥트",
    "hyperconnect",
    "헤이딜러",
    "prnd",
    "화해",
    "hwahae",
]

# Append generic, non-channel terms (mixed Korean and English) to the
# stopword list. Some entries (e.g. '버즈빌') already occur in the channel
# section above, so the resulting list contains duplicates.
stopwords.extend([
    '사용', '개발자', '사용자', '기반', '프로젝트', '이용', '코드', '기술', '서비스', '활용', '적용', '개발', '소개', '실행', '안녕하세요', 'line', '진행', '관리', '과정', 'developer', '비즈니스', '프로그래밍', '회사', '소프트웨어', '유저',   '프로그램', 'user',  'service',  '포스팅', 'tech',  '다운로드', '저장소', 'google', '동료',  'project', '방법',  'japanese', 'code', '효율', '도구', '공유', '코딩', '관련', '제품', '이미지', '화면', '얘기', '접근', '광고', '리멤버', '대화', '조직', '링크', '컴퍼니', '오픈', '엔지니어', '문서', 'post', '태그', 'hwang', '예산', '가시', '바탕', 'part', '자료', '버즈빌', '스타트업',
])

### keyword 추출

In [9]:
def keyword_extractor(bert:str, documents:List[str]): # -> (List[Tuple[str, float]] | List[List[Tuple[str, float]]]):
    """Extract the top single-word keywords from *documents* with KeyBERT.

    Args:
        bert: Model name or path handed to ``BertModel.from_pretrained``.
        documents: The text(s) to extract keywords from. Callers in this file
            pass either a list of documents or a single space-joined string.

    Returns:
        Whatever ``KeyBERT.extract_keywords`` returns for the given input,
        requested with unigram keyphrases (``keyphrase_ngram_range=(1, 1)``),
        no stopword filtering and at most 20 keywords per document.

    NOTE(review): the model is re-loaded on every call, and a raw
    ``BertModel`` instance is handed to ``KeyBERT`` -- confirm against the
    KeyBERT documentation that this object type is accepted as an embedding
    model and actually used for the embeddings.
    """
    # Local import: torch is only needed for the device check here and is
    # necessarily installed because BertModel is a PyTorch module.
    import torch

    model = BertModel.from_pretrained(bert)
    # Only move the model to the GPU when one is usable; an unconditional
    # `.cuda()` raises on machines without CUDA.
    if torch.cuda.is_available():
        model = model.cuda()
    kw_model = KeyBERT(model)
    keywords = kw_model.extract_keywords(documents, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=20)
    return keywords

In [10]:
def extract_keyword(base_path:str, row:List[str]):
    """Extract keywords for the single article described by one CSV *row*.

    The article text is read from
    ``<base_path><created_date>/<channel>_<title>.txt`` (after file-name
    sanitising), summarised when it has more than 50 lines, reduced to the
    tokens kept by ``noun_extractor`` and passed to ``keyword_extractor``.

    Note: a later cell in this file defines another ``extract_keyword`` with
    a different signature, which replaces this one once that cell has run.

    Args:
        base_path: Directory prefix of the content files; it is concatenated
            directly, so it must end with a path separator.
        row: CSV row; ``row[1]`` is read as the title, ``row[2]`` as the
            channel and ``row[3]`` as the creation date.

    Returns:
        The result of ``keyword_extractor('skt/kobert-base-v1', doc)``.

    Raises:
        OSError: If the content file cannot be opened (including
            ``FileNotFoundError``).
    """
    title = row[1]
    channel = row[2]
    created_date = row[3]
    # The filter that skipped posts tagged only with "Design", "Product",
    # "Culture" or "Conference" is disabled, so the category column (row[4])
    # is no longer read here.

    # Raw string keeps the exact same pattern while avoiding the invalid
    # "\/" escape of a normal string literal.
    # NOTE(review): the pattern is a character class followed by a literal
    # comma, i.e. it only strips one of \ / : * ? " < > | when it is directly
    # followed by ','. It is left unchanged because the names of the files on
    # disk presumably come from the same expression -- confirm with the
    # crawler before changing it.
    file_name = re.sub(r'[\/:*?"<>|],', "", channel + '_' + title)
    content_path = base_path + created_date + '/' + re.sub('&', '_', file_name) + '.txt'
    # Context manager so the file handle is always closed.
    with open(content_path, 'r', encoding='utf-8') as file:
        text = file.read()
    # Long articles (more than 50 lines) are condensed with TextRank first.
    if len(text.split('\n')) > 50:
        summary = summarize(text)
    else:
        summary = text
    nouns = noun_extractor(summary)
    doc = ' '.join(nouns)
    return keyword_extractor('skt/kobert-base-v1', doc)

In [11]:
# Input locations and the CSV reader consumed by `extract_keyword(base_path)`
# below.
# print(os.getcwd())
# Previously used dataset location (kept for reference):
# base_path = "/content/drive/MyDrive/쳇, 6pt: 비스킷(biskuit)/dataset/content/"
import csv
import re

# Directory prefix of the per-article text files; used by plain string
# concatenation, hence the trailing slash.
base_path = '../../../data/content/'
# CSV file listing the crawled articles (one row per article).
csv_path = '../../data/crawling_data_keyword.csv'

# The handle is intentionally left open here: `csvReader` reads from it
# lazily while `extract_keyword` iterates over the rows. It is never closed
# in the visible part of this file.
f = open(csv_path, "r", encoding="utf-8")

# One-shot iterator over the CSV rows; once exhausted, re-running
# `extract_keyword` yields no rows unless this cell is executed again.
csvReader = csv.reader(f)

def extract_keyword(base_path:str):
    """Extract keywords for every article listed in the module-level ``csvReader``.

    For each non-empty CSV row the article text is read from
    ``<base_path><created_date>/<channel>_<title>.txt`` (after file-name
    sanitising), summarised when it has more than 50 lines and reduced to the
    tokens kept by ``noun_extractor``. All documents are then passed to
    ``keyword_extractor`` in a single call. Rows whose content file cannot be
    opened are skipped and counted; the counts are printed at the end.

    Args:
        base_path: Directory prefix of the content files; it is concatenated
            directly, so it must end with a path separator.

    Returns:
        A tuple ``(keywords, URIs)``: the result of
        ``keyword_extractor('skt/kobert-base-v1', documents)`` and the list
        of ``row[0]`` values of the processed rows, in the same order as the
        documents.
    """
    documents = []
    URIs = []
    succeeded = 0
    file_not_found_count = 0
    os_error_count = 0
    for row in tqdm(csvReader):
        if not row:
            continue
        source = row[0]
        title = row[1]
        created_date = row[3]
        channel = row[2]
        # The filter that skipped posts tagged only with "Design", "Product",
        # "Culture" or "Conference" is disabled.

        # Raw string keeps the exact same pattern while avoiding the invalid
        # "\/" escape of a normal string literal.
        # NOTE(review): the pattern is a character class followed by a
        # literal comma, so it only strips one of \ / : * ? " < > | when it
        # is directly followed by ','. Left unchanged because the file names
        # on disk presumably come from the same expression -- confirm.
        file_name = re.sub(r'[\/:*?"<>|],', "", channel + '_' + title)
        content_path = base_path + created_date + '/' + file_name + '.txt'
        # Only the open() call is guarded, so that errors while reading still
        # propagate exactly as before.
        try:
            file = open(content_path, 'r', encoding='utf-8')
        except FileNotFoundError:
            file_not_found_count += 1
            continue
        except OSError:
            os_error_count += 1
            continue
        # Close the handle as soon as the text has been read.
        with file:
            text = file.read()
        # Long articles (more than 50 lines) are condensed with TextRank first.
        if len(text.split('\n')) > 50:
            summary = summarize(text)
        else:
            summary = text
        nouns = noun_extractor(summary)
        documents.append(' '.join(nouns))
        URIs.append(source)
        succeeded += 1
    # The printed labels are kept exactly as they were.
    print("susseed:", succeeded)
    print("fileNotFoundError:", file_not_found_count)
    print("osError:", os_error_count)
    return keyword_extractor('skt/kobert-base-v1', documents), URIs

## DB에 저장

### tag와 content_tag 저장

In [12]:
# Run the extraction over all CSV rows: `extracted_keywords` holds the
# keyword_extractor result for the processed documents and `sources` the
# matching `row[0]` values, in the same order.
extracted_keywords, sources = extract_keyword(base_path)

0it [00:00, ?it/s]

['This', 'post', 'is', 'also', 'available', 'in', 'the', 'following', 'languages', 'Japanese', 'EnglishLINE', 'Developer', 'Workshop', 'Hackathon', 'LINE', 'LINE', 'Engineering', '안녕하세요', '라인', '주식회사', '주니어', '개발자', 'James', '라인', '개발자', '워크샵', '해커톤', 'LINE', 'Developer', 'Workshop', 'Hackathon', '경험', '생각', '여러분', '공유', '작성', '라인', '주식회사', '개국', '이용자', '모바일', '메신저', '라인', 'LINE', '개발', '서비스', '회사', 'LINE', '개발', '세계', '이용자', '모바일', '경험', '향유', '밤낮', '협력', 'LINE', '개발자', '템포', '의미', '팀워크', '개발', '개발자', '개발자', '축제', '개발자', '워크샵', 'Developer', 'Workshop', '워크샵', '일정', '진행', '개발자', '교외', '휴양지', '시간', '워크샵', '해커톤', 'Hackathon', '포함', '프로그램', '일상', '모임', '개발', '자유', '대화', '토론', '서로', '이해', '계기', '번째', '아이디어', '한데', '개발자', '워크샵', '워크샵', '지난해', '성과', '논의', '올해', '달성', '방향', '토의', 'LINE', '글로벌', '시장', '급속', '성장', '년도', 'LINE', '매출', 'JPY', '기록', '사용자', 'Monthly', 'Active', 'User', 'MAU', '증가', '올해', '말일', '기준', '달성', '기념비', '성과', 'LINE', '근무', '다양', '부서', '개발자', '협업', '결과', '이야기', '부서', '조직', 

### content_tag 저장

In [14]:
# Persist the extracted keywords: every keyword becomes a row in `tag`
# (duplicates are skipped by `insert ignore`) and is linked to its article
# through `content_tag`.
from crawling import config

conn = config.connect()
curs = conn.cursor()

# The SQL statements do not change between iterations, so they are defined
# once up front.
insert_tag_sql = """insert ignore into tag (name) value (%s)"""

select_content_id_sql = """select id from content where source = %s"""

select_tag_id_sql = """select id from tag where name = %s"""

insert_content_tag = """insert into content_tag (content_id, tag_id) values (%s, %s)"""

try:
    for tags, source in zip(extracted_keywords, sources):
        # The content id depends only on the document, so it is looked up
        # once per document rather than once per tag.
        # Parameters are passed as one-element tuples: `(source)` is just a
        # parenthesised string, not a sequence of parameters.
        curs.execute(select_content_id_sql, (source,))
        data = curs.fetchone()
        content_id = None if data is None else data[0]

        for tag in tags:
            # tag[0] is the keyword text of each entry returned by the
            # extractor. The tag row is inserted even when the content row
            # is missing, as before.
            curs.execute(insert_tag_sql, (tag[0],))
            if content_id is None:
                continue
            curs.execute(select_tag_id_sql, (tag[0],))
            data = curs.fetchone()
            if data is None:
                continue
            tag_id = data[0]
            curs.execute(insert_content_tag, (content_id, tag_id))
            print(tag_id)
        # Commit once per document.
        conn.commit()
finally:
    # Release the connection even if a statement fails part-way through.
    conn.close()

e:\study\SSAFY\git\S08P22A706\backend\content API\\src\config.ini
9917
9598
9919
4061
4012
4951
6444
4292
7466
9926
4964
4014
4018
9930
4595
9527
4309
4064
7469
9936
9930
4345
4048
4309
4027
4025
4018
4188
9290
12646
6412
4197
12649
5144
12213
4193
12653
4013
6252
12656
4365
4366
4323
4368
4251
4224
4371
4372
4318
4374
4375
4376
4377
4378
4026
4380
4381
4181
4383
4384
7932
4577
7768
4718
5703
14278
14279
3980
14281
4014
14283
4048
5621
4978
7220
4358
9527
14290
14291
4178
5685
5722
4306
14296
5114
4012
4292
14300
5620
14302
4350
4018
6188
6508
6961
14308
4009
4978
4938
4728
14313
9214
14315
7445
14317
4017
14319
4567
14321
11563
7313
13187
14325
4013
8214
14328
4136
7273
10629
7219
13554
11571
3990
3980
3992
14338
14339
4061
14341
14342
4326
8235
14345
14346
4012
12
14349
4596
11464
7942
4707
7203
14355
13554
10742
4725
8697
4731
3980
4218
14363
4596
14365
14339
3974
5615
4965
7315
14371
14372
12122
4592
12732
4776
4566
14378
14135
3980
14381
12587
4595
4735
5036
6095
4048
14138
7547
4