# WordCloud Creator

In [None]:
# import libraries
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import re
import konlpy
from collections import Counter
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
import numpy as np
from PIL import Image

---

## User setting variables

### User-Agent
자신의 User-Agent를 확인한 후, 지정해주어야 로봇으로의 인식을 방지할 수 있음  
자신의 User-Agent는 [what is my user agent](https://www.whatismybrowser.com/detect/what-is-my-user-agent/) 에서 확인할 수 있음

In [None]:
# set User-Agent
# HTTP headers passed to `requests.get` by the crawl cell. Replace the value
# with your own browser's User-Agent string if it differs.

# for Windows
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/106.0.0.0 Safari/537.36'
    ),
}

# for MacOS
# headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}

### Keyword & numbers
* 워드클라우드를 만들고자 하는 키워드 단어를 입력    
* 크롤링 하고자 하는 뉴스 제목의 수를 입력

In [None]:
# set keyword & numbers
# `keyword` is the search term inserted into the news-search URL;
# `numbers` is the number of news titles to crawl (ten per result page).
numbers = 1_000
keyword = "포도"

### Font
* 사용하고자 하는 한국어 폰트의 경로를 지정

In [None]:
# set Korean font path
# Raw string literal: the backslashes are Windows path separators, not
# escape sequences. A plain "..." literal here relies on invalid escapes
# (\W, \F, \m), which Python warns about and plans to reject.

# for Windows
FONT_PATH = r"C:\Windows\Fonts\malgun.ttf"

# for MacOS
# FONT_PATH = '/System/Library/Fonts/AppleSDGothicNeo.ttc'

### Mask
* 사용하고자 하는 마스크 이미지의 경로를 지정
* 이미지를 다운받아 'Mask' 폴더에 넣은 후
* 'Mask/{mask_image_name}.{extension}' 형식으로 이미지 불러오기

In [None]:
# set mask image's path

# Load the mask image into a NumPy array. The `with` block closes the image
# file once the pixel data has been copied, instead of leaving the file
# handle open for the rest of the session.
with Image.open('Mask/grape.jpg') as mask_image:
    mask = np.array(mask_image)

---

In [None]:
# set needed variables
# Search-result start offsets to request: 1, 11, 21, ... with one offset
# per full block of ten titles in `numbers`.
page_count = int(numbers / 10)
number_list = list(range(1, page_count * 10 + 1, 10))
# Output file for the crawled titles; the crawl cell writes to it and closes it.
f = open(f"Data/news_titles_{keyword}.txt", 'w')

In [None]:
# crawl news titles and write to text file
# `with f:` closes the output file opened earlier even when the crawl is
# interrupted part-way through (e.g. by KeyboardInterrupt).
with f:
    for page in tqdm(number_list):
        url = f'https://search.naver.com/search.naver?where=news&sm=tab_pge&query={keyword}&start={page}'
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            res = requests.get(url, headers=headers, timeout=10)
            res.raise_for_status()
        except requests.RequestException as err:
            # Best-effort crawl: report the failed page and move on to the next,
            # but let programming errors and interrupts propagate.
            tqdm.write(f'skipping start={page}: {err}')
            continue

        soup = BeautifulSoup(res.text, 'lxml')
        titles = soup.select("#main_pack > section > div > div.group_news > ul > li")

        for title in titles:
            anchor = title.find('a', attrs={"class": "news_tit"})
            news_title = anchor.get('title') if anchor is not None else None
            if news_title is None:
                # Result item without a titled headline link: skip just this
                # item rather than abandoning the rest of the page.
                continue
            try:
                f.write(f'{news_title}\n')
            except UnicodeEncodeError:
                # The file was opened with the platform default encoding, which
                # may not cover every character; drop only this headline.
                continue

In [None]:
# read news titles
# Load every crawled title into `text`, one list entry per line with the
# trailing newline kept.
with open(f"Data/news_titles_{keyword}.txt", 'r') as f:
    text = list(f)

In [None]:
# pre-process data
# Skip blank lines, trim surrounding whitespace, merge all titles into one
# space-separated string, then replace every character that is not a word
# character, digit, or whitespace with a space.
non_word_pattern = re.compile(r'[^\d\s\w]')
text_strip = [line.strip() for line in text if line != '\n']
text_join = ' '.join(text_strip)
text_filtered = non_word_pattern.sub(' ', text_join)

In [None]:
# use komoran
# Create one Komoran tagger and run it over the cleaned text twice:
# first `pos` (part-of-speech tagging), then `nouns` (noun extraction).
komoran = konlpy.tag.Komoran()
komoran_pos, komoran_nouns = (
    komoran.pos(text_filtered),
    komoran.nouns(text_filtered),
)

In [None]:
# delete stopword
stop_words = ['이오', '에다', '하오', '마오']
# Distinct nouns as they were before filtering.
unique_Noun_words = set(komoran_nouns)
# Drop stop words and single-character nouns in one pass. The previous
# `while word in list: list.remove(word)` loop rescanned the whole list for
# every removal. Slice assignment keeps the same list object, so the result
# is still an in-place update of `komoran_nouns`.
komoran_nouns[:] = [
    word for word in komoran_nouns
    if word not in stop_words and len(word) != 1
]

In [None]:
# analysis frequency
# Tally how often each remaining noun occurs, then take the 20 most common.
c = Counter()
c.update(komoran_nouns)
frequent = c.most_common(20)

In [None]:
# create word cloud

# The mask array is passed both as the mask and, through
# ImageColorGenerator, as the source of the word colours.
wordcloud_options = {
    'max_font_size': 60,
    'width': 300,
    'height': 300,
    'relative_scaling': 'auto',
    'font_path': FONT_PATH,
    'background_color': 'white',
    'color_func': ImageColorGenerator(mask),
    'mask': mask,
}
wordcloud = WordCloud(**wordcloud_options)
# Lay out the cloud from the noun frequency counter.
wordcloud.generate_from_frequencies(c)
# Last expression of the cell, so the notebook displays the rendered image.
wordcloud.to_image()


In [None]:
# save word cloud image
# The output file name embeds the search keyword.
output_path = f'Image/WordCloud_{keyword}.png'
wordcloud.to_file(output_path)