In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

%config InlineBackend.figure_format='retina'

!apt -qq -y install fonts-nanum

import matplotlib.font_manager as fm
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=10)
plt.rc('font', family='NanumBarunGothic')
mpl.font_manager._rebuild()

In [None]:
# Download the mecab-colab.sh setup script and pipe it straight into bash
# (presumably installs the MeCab backend used by konlpy's Mecab tagger below — confirm).
# NOTE(review): this runs remote, unpinned code; verify the script is trusted before executing.
!curl -s https://raw.githubusercontent.com/teddylee777/machine-learning/master/99-Misc/01-Colab/mecab-colab.sh | bash

In [None]:
import urllib.request

# Fetch the ratings file; every line is kept as raw bytes for the next cell to decode.
RATINGS_URL = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt'

# The context manager closes the HTTP connection once the lines are read.
with urllib.request.urlopen(RATINGS_URL) as response:
    raw = response.readlines()
print(raw[:5])

In [None]:
# Skip the first (header) line and decode each remaining bytes line to text.
raw = [line.decode() for line in raw[1:]]

# The second tab-separated field of every row is collected into `reviews`.
reviews = [row.split('\t')[1] for row in raw]

print(reviews[:5])

In [None]:
from konlpy.tag import Mecab
tagger = Mecab()

# Flatten the nouns of every review into a single list, in review order.
reviews_nouns = [token for text in reviews for token in tagger.nouns(text)]

reviews_nouns[:10]

In [None]:
# Space-separated nouns to exclude from the review word counts, turned into a list.
stop_words = (
    '전 난 일 걸 뭐 줄 만 건 분 개 끝 잼 이거 번 중 듯 때 게 내 말 나 수 거 점 것 후 이 애 씨 속 뿐 밋 그 급 ㄷ 데'
).split(' ')
print(stop_words)

In [None]:
# Re-extract the nouns of every review, this time dropping anything listed in stop_words.
reviews_nouns = [
    token
    for text in reviews
    for token in tagger.nouns(text)
    if token not in stop_words
]

reviews_nouns[:10]

In [None]:
from collections import Counter

# Tally noun frequencies, then keep the 100 most frequent as a {noun: count} dict
# (insertion order follows most_common, i.e. descending count).
reviews_nouns_counter = Counter(reviews_nouns)
top_reviews_nouns = {word: freq for word, freq in reviews_nouns_counter.most_common(100)}
top_reviews_nouns

In [None]:
import numpy as np

plt.rcParams['font.size'] = 12

# One bar position per top noun.
y_pos = np.arange(len(top_reviews_nouns))

# Horizontal bar chart: bar length is the noun's count, tick label is the noun itself.
plt.figure(figsize=(12, 20))
plt.barh(y_pos, list(top_reviews_nouns.values()))
plt.title('Word Count')
# dict has .keys(), not .key(); the original call raised AttributeError.
plt.yticks(y_pos, list(top_reviews_nouns.keys()))
plt.show()

In [None]:
!pip install squarify

In [None]:
import squarify

plt.rcParams.update({'figure.figsize': (14, 14), 'font.size': 20})

# Scale each count into [0, 1] between the smallest and largest top-noun count,
# then map it onto the Reds colormap so larger counts get a deeper red.
counts = list(top_reviews_nouns.values())
norm = mpl.colors.Normalize(vmin=min(counts), vmax=max(counts))
colors = [mpl.cm.Reds(norm(count)) for count in counts]

# Treemap: rectangle area follows the count, label is the noun.
squarify.plot(
    sizes=counts,
    label=list(top_reviews_nouns.keys()),
    color=colors,
    alpha=.7,
);

In [None]:
!pip install wordcloud

In [None]:
from wordcloud import WordCloud

# Render with the Nanum font installed by the setup cell (`fontpath`).
# The previous relative path './font/NanumBarunGothic.ttf' is never created by this notebook.
wc = WordCloud(background_color='white', font_path=fontpath)
# Word size in the cloud follows the counts in top_reviews_nouns.
wc.generate_from_frequencies(top_reviews_nouns)

In [None]:
# Draw the generated word cloud on a single 12x12 axes with the axis frame hidden.
figure, ax = plt.subplots(figsize=(12, 12))
ax.set_axis_off()
ax.imshow(wc)
plt.show()

In [None]:
import os
import sys
import urllib.error
import urllib.parse
import urllib.request
import pandas as pd
import json
import re
from getpass import getpass

# API credentials must not be stored in the notebook: read them from the
# environment, or prompt for them (getpass does not echo the value).
client_id = os.environ.get('NAVER_CLIENT_ID') or getpass('Naver client id: ')
client_secret = os.environ.get('NAVER_CLIENT_SECRET') or getpass('Naver client secret: ')

query = urllib.parse.quote(input("검색어를 입력하세요: "))
display = 100   # results requested per call
start = 1       # first result position
end = 1000      # exclusive upper bound for the start position

# Compiled once: strips HTML tags such as <b>...</b> from titles and descriptions.
remove_tag = re.compile('<.*?>')

# Rows are collected in a list and converted to a DataFrame once at the end,
# instead of growing the DataFrame one row at a time.
rows = []

for start_index in range(start, end, display):

    url = ('https://openapi.naver.com/v1/search/news?query=' + query
           + '&display=' + str(display) + '&start=' + str(start_index))
    # To collect general web documents instead of news, change 'news' to 'webkr' in the URL.
    # NOTE(review): the original author left a reminder about a date field
    # ("pubDate = datetime", pointing at a video lecture timestamp 1:06:58) — not collected here.

    request = urllib.request.Request(url)
    request.add_header('X-Naver-Client-id', client_id)
    request.add_header('X-Naver-Client-Secret', client_secret)

    try:
        # The context manager closes the connection after the body is read.
        with urllib.request.urlopen(request) as response:
            rescode = response.getcode()
            response_body = response.read()
    except urllib.error.HTTPError as err:
        # urlopen raises for 4xx/5xx responses; report the code and move on to the next page.
        print("Error Code: " + str(err.code))
        continue

    if rescode != 200:
        # rescode is an int, so it has to be converted before concatenation.
        print("Error Code: " + str(rescode))
        continue

    response_dict = json.loads(response_body.decode('utf-8'))
    for item in response_dict['items']:
        rows.append([
            remove_tag.sub('', item['title']),
            item['link'],
            remove_tag.sub('', item['description']),
        ])

web_df = pd.DataFrame(rows, columns=['Title', 'Link', 'Description'])

web_df

In [None]:
# Write the search results to CSV without the index column; 'utf-8-sig' adds a BOM.
# To save under a different name, change the './df1.csv' path.
web_df.to_csv('./df1.csv', encoding='utf-8-sig', index=False)

In [None]:
# Copy every value of the Description column into a plain Python list.
web = list(web_df.Description)

print(web[:100])

In [None]:
# Space-separated nouns to exclude from the web-search word counts, turned into a list.
stop_words = (
    '분야 수 것 등 기반 일 년 말 문 월 이 명 깨 개 중 범 부 딥 빅 책 분 봉 차 나 내 전 억 대 형 선 사 폼 원 번'
).split(' ')
print(stop_words)

In [None]:
!apt-get update
!apt-get install g++ openjdk-8-jdk 
!pip3 install konlpy JPype1-py3
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
!pip install konlpy
from konlpy.tag import Mecab

tagger = Mecab()
web_nouns = []
for w in web:
    for noun in tagger.nouns(w):
        if noun not in stop_words:
            web_nouns.append(noun)

web_nouns[:10]

In [None]:
from collections import Counter

# Tally noun frequencies and keep the 100 most frequent as a {noun: count} dict.
web_nouns_counter = Counter(web_nouns)
top_web_nouns = {word: freq for word, freq in web_nouns_counter.most_common(100)}
top_web_nouns

In [None]:
# Write the search results to ./df.csv without the index column; 'utf-8-sig' adds a BOM.
web_df.to_csv('./df.csv', encoding='utf-8-sig', index=False)

In [None]:
import matplotlib.pyplot as plt
import numpy

plt.rcParams['font.size'] = 12

# One bar position per top noun.
y_pos = numpy.arange(len(top_web_nouns))

# Horizontal bar chart: bar length is the noun's count, tick label is the noun itself.
fig, ax = plt.subplots(figsize=(12, 24))
ax.barh(y_pos, top_web_nouns.values())
ax.set_title('Word Count')
ax.set_yticks(y_pos)
ax.set_yticklabels(top_web_nouns.keys())
plt.show()

In [None]:
import numpy as np



In [None]:
# Empty 10-row template frame (all values NaN) with the columns to be filled in.
# The 'Original Link' label previously carried a stray leading space, which made
# lookups by the intended column name fail.
data = pd.DataFrame(
    columns=['Title', 'Original Link', 'Link', 'Description', 'Publication Date'],
    index=range(10),
)
data

In [None]:
# Build a DataFrame whose single 'Title' column is taken from `city`, replacing `data`.
# NOTE(review): `city` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel — confirm where it is supposed to come from.
data_dict = {'Title':city}
data = pd.DataFrame(data_dict)
data

In [None]:
# `news_df` is not defined anywhere in this notebook; the news search results
# (with a 'Title' column) are stored in `web_df`, so the titles are read from there.
news = list(web_df.Title)

print(news[:5])

In [None]:
tagger = Mecab()

# Nouns from every news title, in order, skipping anything listed in stop_words.
news_nouns = [
    token
    for title in news
    for token in tagger.nouns(title)
    if token not in stop_words
]

# Preview sits at cell level: inside the loop it was re-evaluated per title and never displayed.
news_nouns[:5]

In [None]:
# Space-separated nouns to exclude from the word counts (the earlier list plus the
# search-term words '인공지능', '인공', '지능'), turned into a list.
stop_words = (
    '인공지능 인공 지능 분야 수 것 등 기반 일 년 말 문 월 이 명 깨 개 중 범 부 딥 빅 책 분 봉 차 나 내 전 억 대 형 선 사 폼 원 번'
).split(' ')
print(stop_words)

In [None]:
import matplotlib.pyplot as plt
# Set a Hangul title on the current axes (presumably a check that the Korean font renders — confirm).
plt.gca().set_title('안녕');

In [None]:
!apt-get update -qq
!apt-get install fonte-nanum* -qq

In [None]:
import requests

# Page whose HTML source is downloaded.
target_url = 'http://www.dowellcomputer.com/main.jsp'

# Send a GET request; `request` holds the response object returned by requests.get.
request = requests.get(target_url)

# Response body as text with leading/trailing whitespace removed.
html = request.text.strip()

print(html)

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the page and parse its HTML into a BeautifulSoup document.
request = requests.get('http://www.dowellcomputer.com/main.jsp')
html = request.text
soup = BeautifulSoup(html, 'html.parser')

# <a> elements that are direct children of a <td>.
links = soup.select('td > a')

# Print the text of every link whose href contains 'notice'.
for link in links:
  # Anchors without an href attribute are skipped.
  if not link.has_attr('href'):
    continue
  if 'notice' in link.get('href'):
    print(link.text)

In [None]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver

from selenium import webdriver
from urllib.request import urlopen
from bs4 import BeautifulSoup as bs
from urllib.parse import quote_plus
from selenium.webdriver.common.keys import Keys
import time
from IPython.display import Image
import urllib

from google.colab import drive
drive.mount('/content/gdrive')
#셀레니움으로 크롬 열기
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)

In [None]:
# The HTTP library is named `requests` (the original `import request` is a different module).
import requests
from bs4 import BeautifulSoup

# TODO: set the address of the page to scrape; an empty URL makes requests.get raise MissingSchema.
url = ''

if url:
    res = requests.get(url)
    # Parse the downloaded HTML; the original bound `soup` to the class itself, not a parsed document.
    soup = BeautifulSoup(res.text, 'html.parser')
else:
    print('Set `url` before running this cell.')

## **GUI 프로그래밍**

```
# 코드로 형식 지정됨
```



In [None]:
# tkinter: create the top-level window and hand control to the GUI event loop.

from tkinter import *

# The window class is `Tk` (capital T); lowercase `tk` is not defined by the star import.
root = Tk()
# Blocks until the window is closed.
# NOTE(review): needs a display; likely fails on a headless runtime such as Colab — confirm.
root.mainloop()