<a href="https://colab.research.google.com/github/Dawon00/Data_Mining/blob/main/Getting_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Reading Files

In [None]:
import sys
from collections import Counter


def get_domain(email_address):
  """Return the lowercased text that follows the final '@' of an address.

  When the address contains no '@', the whole lowercased string is returned.
  """
  lowered = email_address.lower()
  # rpartition splits on the last '@'; with no '@' the tail is the full string.
  _, _, domain = lowered.rpartition("@")
  return domain

# Tally addresses per domain, ignoring any line that has no '@' in it.
with open('email_addresses.txt', 'r') as address_file:
  domain_counts = Counter()
  for raw_line in address_file:
    if "@" not in raw_line:
      continue
    # Strip the trailing newline before extracting the domain.
    domain_counts[get_domain(raw_line.strip())] += 1

print(domain_counts)

Counter({'naver.com': 3, 'hufs.ac.kr': 2})


In [None]:
!cat /content/email_addresses.txt

asdf@hufs.ac.kr
asdfasd@hufs.ac.kr
dfasdfalksd@naver.com
dlafkja;lkdsjf@naver.com
dkkdfdkkdkdk@naver.com

# Delimited Files
CSV files: these files are very often either comma-separated or tab-separated.


In [None]:
!cat tab_delimited_stock_prices.txt

6/20/2014	AAPL	90.91
6/20/2014	MSFT	41.68
6/20/2014	FB	64.5
6/19/2014	AAPL	91.86
6/19/2014	MSFT	41.51
6/19/2014	FB	64.34

In [None]:
import csv
# Read a tab-separated file of (date, symbol, closing price) rows and print them.
# newline='' hands newline handling to the csv module, as the csv documentation
# asks for any file object passed to csv.reader.
with open('tab_delimited_stock_prices.txt', 'r', newline='') as f:
  reader = csv.reader(f, delimiter='\t')
  for row in reader:
    # csv.reader yields an empty list for a blank line (e.g. a trailing one);
    # skip it instead of failing with IndexError on row[0].
    if not row:
      continue
    date = row[0]
    symbol = row[1]
    closing_price = float(row[2])  # prices are stored as text in the file
    print(date, symbol, closing_price)


6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34


In [None]:
!cat colon_delimited_stock_prices.txt


date:symbol:closing_price
6/20/2014:AAPL:90.91
6/20/2014:MSFT:41.68
6/20/2014:FB:64.5
6/19/2014:AAPL:91.86
6/19/2014:MSFT:41.51
6/19/2014:FB:64.34

In [None]:
# Read a colon-separated file whose first line is a header row; DictReader uses
# that header to key each row by column name.
# newline='' hands newline handling to the csv module, as the csv documentation
# asks for any file object passed to a reader.
with open('colon_delimited_stock_prices.txt', 'r', newline='') as f:
  reader = csv.DictReader(f, delimiter=':')
  for row in reader:
    date = row["date"]
    symbol = row["symbol"]
    closing_price = float(row["closing_price"])  # stored as text in the file
    print(date, symbol, closing_price)


6/20/2014 AAPL 90.91
6/20/2014 MSFT 41.68
6/20/2014 FB 64.5
6/19/2014 AAPL 91.86
6/19/2014 MSFT 41.51
6/19/2014 FB 64.34


# Scraping the Web

In [None]:
from bs4 import BeautifulSoup
# Small in-memory HTML document used by the parsing examples that follow:
# a <title> plus three <p> tags (two carry an id, one carries a class).
html = """
<html>
 <head>
  <title>A web page</title>
 </head>
 <body>
  <p id="author">Joel Grus</p>
  <p id="subject">Data Science</p>
  <p class="price">30</p>
 </body>
</html>"""
# Build the parse tree from the string using the 'html5lib' parser backend.
soup = BeautifulSoup(html, 'html5lib')

In [None]:
soup.title

<title>A web page</title>

In [None]:
soup.title.text

'A web page'

In [None]:
soup.p

<p id="author">Joel Grus</p>

In [None]:
soup.body.p

<p id="author">Joel Grus</p>

In [None]:
soup.body('p') # find every <p> tag under <body> (calling a tag works like find_all)

[<p id="author">Joel Grus</p>,
 <p id="subject">Data Science</p>,
 <p class="price">30</p>]

In [None]:
soup.body('p')[1].text

'Data Science'

In [None]:
soup.body('p')[-1]

<p class="price">30</p>

In [None]:
# Print the text of each <p> under <body>, labelled with its zero-based position.
for index, paragraph in enumerate(soup.body('p')):
  message = 'paragraph {}: {}'.format(index, paragraph.text)
  print(message)


paragraph 0: Joel Grus
paragraph 1: Data Science
paragraph 2: 30


In [None]:
soup.p['id']

'author'

In [None]:
soup('p', {'id':'author'})

[<p id="author">Joel Grus</p>]

In [None]:
soup('p', 'price')

[<p class="price">30</p>]

In [None]:
soup.text

'\n  A web page\n \n \n  Joel Grus\n  Data Science\n  30\n \n'

In [None]:
first_paragraph = soup.find('p') # or just soup.p
print(first_paragraph)
print(type(first_paragraph))

<p id="author">Joel Grus</p>
<class 'bs4.element.Tag'>


In [None]:
first_paragraph_text = soup.p.text
first_paragraph_text

'Joel Grus'

In [None]:
first_paragraph_words = soup.p.text.split()
first_paragraph_words

['Joel', 'Grus']

In [None]:
first_paragraph_id = soup.p['id'] # raises KeyError if no 'id'
first_paragraph_id
#type(soup.p)

'author'

In [None]:
first_paragraph_id2 = soup.p.get('id') # returns None if no 'id'
first_paragraph_id2

'author'

In [None]:
all_paragraphs = soup.find_all('p') # or just soup('p')
all_paragraphs

[<p id="author">Joel Grus</p>,
 <p id="subject">Data Science</p>,
 <p class="price">30</p>]

In [None]:
soup('p')

[<p id="author">Joel Grus</p>,
 <p id="subject">Data Science</p>,
 <p class="price">30</p>]

In [None]:
soup('p', {'id':'subject'})

[<p id="subject">Data Science</p>]

In [None]:
paragraphs_with_ids = [p for p in soup('p') if p.get('id')]
paragraphs_with_ids

[<p id="author">Joel Grus</p>, <p id="subject">Data Science</p>]

In [None]:
important_paragraphs = soup('p', {'class' : 'important'})

In [None]:
import requests

# Fetch the live page. The timeout keeps the cell from hanging indefinitely on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of silently parsing an error page.
# NOTE: this rebinds `html` and `soup`, replacing the sample document above.
response = requests.get("http://www.naver.com", timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

In [None]:
# Collect every <span> found beneath any <div> of the fetched page.
# NOTE(review): div_tag('span') appears to search all descendants, so a span
# under nested <div>s may be listed more than once — confirm whether
# de-duplication is wanted.
spans_inside_divs = [
  span_tag
  for div_tag in soup('div')        # walk each <div> on the page
  for span_tag in div_tag('span')   # and take each <span> inside it
]

spans_inside_divs

[<span>뉴스스탠드 바로가기</span>,
 <span>주제별캐스트 바로가기</span>,
 <span>타임스퀘어 바로가기</span>,
 <span>쇼핑캐스트 바로가기</span>,
 <span>로그인 바로가기</span>,
 <span class="blind">NAVER whale</span>,
 <span class="_1syGnXOL _3di88A4c" data-clk="dropbanner1b" style="padding-right: 20px; color: black; padding-left: 20px"><span>두 개의 탭을 한 화면에서, </span><strong style="color: #f26c92">스마트한 듀얼탭</strong><span> 쇼핑하기</span></span>,
 <span>두 개의 탭을 한 화면에서, </span>,
 <span> 쇼핑하기</span>,
 <span style="background-color: #f26c92">다운로드</span>,
 <span class="blind">네이버</span>,
 <span class="blind">쥬니어네이버</span>,
 <span class="blind">해피빈</span>,
 <span class="blind">검색</span>,
 <span class="ico_search_submit"></span>,
 <span class="blind">한글 입력기</span>,
 <span class="ico_keyboard"></span>,
 <span class="blind">자동완성 레이어</span>,
 <span class="ico_arr"></span>,
 <span class="fix"><span class="common_ico_kwd"><i class="imsc ico_search"></i></span><span>@txt@</span></span>,
 <span class="common_ico_kwd"><i class="imsc ico_search"></i></spa

# Using APIs
## JSON as Tree Data Model
* MongoDB is a JSON-style tree database
* JSON maps to a Python dictionary of dictionaries of dictionaries ...

In [None]:
import json
# Example JSON document held as text; it is the input to json.loads below.
serialized = """{ "title" : "Data Science Book",
 "author" : "Joel Grus",
 "publicationYear" : 2014,
 "topics" : [ "data", "science", "data science"] }"""
 
# Parse the JSON text into a Python dict (the "topics" array becomes a list).
deserialized = json.loads(serialized)

In [None]:
deserialized['title']

'Data Science Book'

In [None]:
# Show the whole record only when "data science" is one of its topics.
topic_list = deserialized["topics"]
if "data science" in topic_list:
  print(deserialized)

{'title': 'Data Science Book', 'author': 'Joel Grus', 'publicationYear': 2014, 'topics': ['data', 'science', 'data science']}


## XML as Tree Data Model


In [None]:
xml_text = """
<Book>
 <Title>Data Science Book</Title>
 <Author>Joel Grus</Author>
 <PublicationYear>2014</PublicationYear>
 <Topics>
 <Topic>data</Topic>
 <Topic>science</Topic>
 <Topic>data science</Topic>
 </Topics>
</Book>
"""

In [None]:
# Parse the XML text with the 'lxml' parser backend.
# NOTE: this rebinds `soup`, replacing the web page parsed earlier.
# With this parser the tag names come back lowercased, which is why <Book> is
# reached as soup.book (and <PublicationYear> as publicationyear) below.
soup = BeautifulSoup(xml_text, 'lxml')
soup.book

<book>
<title>Data Science Book</title>
<author>Joel Grus</author>
<publicationyear>2014</publicationyear>
<topics>
<topic>data</topic>
<topic>science</topic>
<topic>data science</topic>
</topics>
</book>

In [None]:
soup.book.title

<title>Data Science Book</title>

In [None]:
soup.book.title.text

'Data Science Book'

In [None]:
soup.topics('topic')

[<topic>data</topic>, <topic>science</topic>, <topic>data science</topic>]

In [None]:
soup.book.topics('topic')[1].text

'science'

In [None]:
soup.book('topic')[-1]

<topic>data science</topic>

In [None]:
# Print the text of each <topic> under <book>, labelled with its zero-based position.
for index, topic_tag in enumerate(soup.book('topic')):
  message = 'topic {}: {}'.format(index, topic_tag.text)
  print(message)

topic 0: data
topic 1: science
topic 2: data science


Find the title text of every book whose author is 'Joel Grus' and whose publication year is 2000 or later.

In [None]:
# Print the title of every <book> written by Joel Grus and published in 2000
# or later. The author is checked first, so the year is only parsed for
# matching books.
for book in soup('book'):
  if book.author.text != 'Joel Grus':
    continue
  if int(book.publicationyear.text) < 2000:
    continue
  print(book.title.text)

Data Science Book


## Using an Unauthenticated API


In [None]:
import requests, json
endpoint = "https://api.github.com/users/joelgrus/repos"

# Fetch the repository list. The timeout stops the cell from hanging on a
# stalled connection. raise_for_status() fails here on an HTTP error rather
# than binding an error payload to `repos`, which would make the later cells
# that read repo["created_at"] fail in a confusing way.
response = requests.get(endpoint, timeout=10)
response.raise_for_status()
repos = response.json()  # decode the JSON body into Python objects
repos

[{'allow_forking': True,
  'archive_url': 'https://api.github.com/repos/joelgrus/advent2017/{archive_format}{/ref}',
  'archived': False,
  'assignees_url': 'https://api.github.com/repos/joelgrus/advent2017/assignees{/user}',
  'blobs_url': 'https://api.github.com/repos/joelgrus/advent2017/git/blobs{/sha}',
  'branches_url': 'https://api.github.com/repos/joelgrus/advent2017/branches{/branch}',
  'clone_url': 'https://github.com/joelgrus/advent2017.git',
  'collaborators_url': 'https://api.github.com/repos/joelgrus/advent2017/collaborators{/collaborator}',
  'comments_url': 'https://api.github.com/repos/joelgrus/advent2017/comments{/number}',
  'commits_url': 'https://api.github.com/repos/joelgrus/advent2017/commits{/sha}',
  'compare_url': 'https://api.github.com/repos/joelgrus/advent2017/compare/{base}...{head}',
  'contents_url': 'https://api.github.com/repos/joelgrus/advent2017/contents/{+path}',
  'contributors_url': 'https://api.github.com/repos/joelgrus/advent2017/contributors',


생성된 날짜 찾아보기(Counter)

In [None]:
from dateutil.parser import parse

# Parse each repository's creation timestamp, then count how many repositories
# were created in each month and on each weekday.
dates = list(map(lambda repo: parse(repo["created_at"]), repos))
month_counts = Counter(map(lambda created: created.month, dates))
weekday_counts = Counter(map(lambda created: created.weekday(), dates))

print(month_counts)
print(weekday_counts)

Counter({11: 7, 9: 5, 12: 4, 1: 3, 7: 3, 2: 2, 5: 2, 8: 2, 6: 1, 4: 1})
Counter({2: 8, 4: 6, 5: 5, 1: 5, 6: 4, 3: 2})


가장 최근 5개의 repos

In [None]:
# Take the five most recently created repositories.
# NOTE(review): sorts on the raw "created_at" strings — presumably ISO-8601
# timestamps, for which string order matches chronological order; confirm.
last_5_repositories = sorted(
  repos,
  key=lambda repo: repo["created_at"],
  reverse=True,
)[:5]

# Primary language of each of those repositories, newest first.
last_5_languages = list(map(lambda repo: repo["language"], last_5_repositories))

last_5_languages

['Python', 'Python', 'Python', 'Python', None]

## Finding APIs
* https://www.data.go.kr/ 공공데이터 포털

Example: Using the Twitter APIs