In [2]:
# !pip install hdfs
# ref : https://hdfscli.readthedocs.io/en/latest/quickstart.html#python-bindings




In [3]:
import requests
import json
import datetime as dt
from hdfs import InsecureClient
# Shared HDFS client used by every cell below: it talks to http://localhost:9870
# and performs all operations as the user 'big'.
client = InsecureClient('http://localhost:9870', user='big')

## HDFS로부터 읽기

In [4]:
# Pull the whole score file from HDFS as raw bytes, then decode it
# (default codec, UTF-8) so it can be printed as text.
with client.read('/rdd/score.txt') as hdfs_file:
    score = hdfs_file.read()

score_str = score.decode()
print(score_str)

하명도 스파크 50
홍길동 스파크 80
임꺽정 스파크 60
임요환 텐서플로우 100
홍진호 텐서플로우 22
홍진호 텐서플로우 22
이윤열 텐서플로우 90
최연성 장고 100


## hdfs에 쓰기

In [5]:
# Copy a local CP949-encoded CSV into HDFS line by line; each line is re-encoded
# as CP949 before it is handed to the HDFS writer.
local_csv = '/home/big/study/data/corona_data/sido_area.csv'
hdfs_csv = '/corona_data/loc/sido_area.csv'

with open(local_csv, encoding='CP949') as src:
    with client.write(hdfs_csv) as dst:
        for text_line in src:
            dst.write(text_line.encode('CP949'))

FileNotFoundError: [Errno 2] No such file or directory: '/home/big/study/data/corona_data/sido_area.csv'

In [15]:
# Upload the local directory '/home/big/study/data/corona_data/' to the HDFS path
# '/corona_data/loc'. The value returned by upload() is what the cell displays.
client.upload('/corona_data/loc', '/home/big/study/data/corona_data/')

'/corona_data/loc'

## HDFS 파일 수정하기 (내용 추가)

In [16]:
# Append one UTF-8 encoded record to the existing HDFS file (append=True).
# NOTE(review): the payload carries no line break, so it presumably lands right
# after the file's current last byte — confirm the file already ends with a newline.
client.write('/rdd/score.txt', '최연성 장고 100'.encode('UTF-8'), append=True)

## hdfs 권한 수정

In [17]:
# Change the permission bits of '/corona_data/loc'.
# HdfsCLI documents `permission` as an octal *string* (e.g. '755'), so pass '777'
# instead of the integer 777.
# NOTE(review): 777 lets every user read, write and traverse the directory;
# tighten this outside of a sandbox.
client.set_permission('/corona_data/loc', '777')

## hdfs 삭제

In [11]:
# Remove the vaccine snapshots collected for the listed days.
# The deletes run in order; the result of the last one is the cell's displayed value.
vaccine_days = ('2022-09-15', '2022-09-17', '2022-09-19', '2022-09-20')
delete_results = [
    client.delete('/corona_data/vaccine/corona_vaccine_' + day + '.json')
    for day in vaccine_days
]
delete_results[-1]

# client.delete('/corona_data/loc/sido_population.xlsx')

True

## REST_API로 데이터를 호출해 HDFS에 저장

In [6]:
# !pip install requests
# 함수 생성
def execute_rest_api(method, url, headers, params, timeout=None):
    """Send a GET or POST request and return the response body as text.

    Args:
        method: HTTP verb, 'get' or 'post' (matched case-insensitively).
        url: Target URL.
        headers: Dict of request headers.
        params: Dict of query-string parameters.
        timeout: Optional timeout in seconds forwarded to `requests`.
            The default (None) waits indefinitely, which is the previous behaviour.

    Returns:
        The response body (`res.text`).

    Raises:
        ValueError: If `method` is neither 'get' nor 'post'.
        Exception: If no usable response came back or the status code is not 200.
            The message carries the status code.
    """
    verb = str(method).lower()
    if verb == 'get':
        res = requests.get(url, params=params, headers=headers, timeout=timeout)
    elif verb == 'post':
        res = requests.post(url, params=params, headers=headers, timeout=timeout)
    else:
        # Previously an unknown verb fell through and crashed on the unbound `res`.
        raise ValueError('unsupported HTTP method: ' + repr(method))

    # getattr keeps a missing response (None) from raising AttributeError while
    # the error message is being built.
    status_code = getattr(res, 'status_code', None)
    if status_code != 200:
        raise Exception('응답코드 : ' + str(status_code))

    return res.text




### 기준일자 함수

In [7]:
# 일자 생성 함수
def cal_std_day(befor_day):
    """Return the date `befor_day` days before the current moment as 'YYYY-MM-DD'.

    Args:
        befor_day: Number of days to go back from now (0 means today).

    Returns:
        A string with the year, the zero-padded month and the zero-padded day,
        joined by '-'.
    """
    target = dt.datetime.now() - dt.timedelta(days=befor_day)
    # The :02d format spec pads month and day to two digits.
    return '{}-{:02d}-{:02d}'.format(target.year, target.month, target.day)


### logger

In [8]:
import logging
import os

# Errors from the corona API calls go to one log file per day under ./log/rest_api/.
log_dir = './log/rest_api/'
# FileHandler does not create missing directories, so make sure the folder exists.
os.makedirs(log_dir, exist_ok=True)
log_path = log_dir + cal_std_day(0) + '.log'

co_logger = logging.getLogger('corona_api')

# getLogger returns the same logger object every time this cell runs, so attach
# the file handler only once; otherwise each re-run duplicates every log line.
already_attached = any(
    isinstance(h, logging.FileHandler) and h.baseFilename == os.path.abspath(log_path)
    for h in co_logger.handlers
)
if not already_attached:
    # The logged messages contain Korean text, so pin the file encoding instead
    # of relying on the platform default.
    handler = logging.FileHandler(log_path, encoding='utf-8')
    co_logger.addHandler(handler)


In [24]:
# Smoke test: emit a single ERROR-level record through co_logger.
co_logger.error('테스트 에러 로그 입니다.')

ERROR:corona_api:테스트 에러 로그 입니다.


### api 호출

In [None]:
import os

# Endpoint queried by the fetch loop below.
url = 'http://apis.data.go.kr/1352000/ODMS_COVID_04/callCovid04Api'
# NOTE(security): credentials should not live in the notebook. The key is now
# taken from the DATA_GO_KR_SERVICE_KEY environment variable when it is set; the
# literal is kept only as a backward-compatible fallback.
# TODO: rotate this key and drop the fallback.
serviceKey = os.environ.get(
    'DATA_GO_KR_SERVICE_KEY',
    'ieyyBBtAz1MCYnzzCoowSgBsd0mIQZMvRafW5mthdtaASR4xv3SSa+2BEv8oxjHNzgfe6LypMzGNG+zCLf39aA==',
)
# HDFS directory that receives one JSON file per queried day.
file_dir = '/corona_data/patient/'

def create_param(before_day):
    """Build the query parameters for one API request.

    Args:
        before_day: How many days back from today the requested 'std_day' lies;
            passed straight to cal_std_day.

    Returns:
        A dict with the service key, fixed paging values (page 1, 500 rows),
        the JSON response type and the computed 'std_day'.
    """
    query = dict(
        serviceKey=serviceKey,
        pageNo='1',
        numOfRows='500',
        apiType='JSON',
    )
    query['std_day'] = cal_std_day(before_day)
    return query

# Request the data for each of the previous 49 days (1..49 days ago) and store
# every response as its own JSON file in HDFS. A failure for one day is logged
# and does not stop the remaining days.
for i in range(1, 50):
    params = create_param(i)
    log_dict = {
        'is_success': 'Fail',
        'type': 'corona_patient',
        'std_day': params['std_day'],
        # Log the request parameters without the API key so the secret never
        # ends up in the log file.
        'params': {k: v for k, v in params.items() if k != 'serviceKey'},
    }

    try:
        res = execute_rest_api('get', url, {}, params)
        file_name = 'corona_patient_' + params['std_day'] + '.json'
        client.write(file_dir + file_name, res, encoding='UTF-8')
    except Exception as e:
        # log_dict is only written on failure; it records what was requested
        # and why it failed, as one JSON line.
        log_dict['err_msg'] = str(e)
        log_json = json.dumps(log_dict, ensure_ascii=False)
        co_logger.error(log_json)


## 웹크롤링 hdfs 저장

In [16]:
# !pip install BeautifulSoup4

from bs4 import BeautifulSoup

# Scrape the regional vaccination table and store it as one JSON file in HDFS.

file_dir = '/corona_data/vaccine/'
# Compute the reference day once so the file name and the metadata cannot
# disagree if the cell happens to run across midnight.
std_day = cal_std_day(1)
file_name = 'corona_vaccine_' + std_day + '.json'

url = 'https://ncv.kdca.go.kr/mainStatus.es?mid=a11702000000'
reponse_txt = execute_rest_api('get', url, {}, {})
soup = BeautifulSoup(reponse_txt, 'html.parser')
trs = soup.select('#content > div.data_table.tbl_scrl_t > table > tbody > tr')

cols = ['loc', 'v1', 'v2', 'v3', 'v4']
data = []

# The first <tr> is skipped (presumably a summary/header row — confirm against the page).
for tr in trs[1:]:
    th = tr.select('th')
    tds = tr.select('td')

    # The first value of a record is the row's <th> text with spaces and CRLF removed.
    rows = [th[0].text.replace(' ', '').replace('\r\n', '')]

    # Only the odd-indexed <td> cells are kept; thousands separators are stripped.
    for td in tds[1::2]:
        rows.append(td.text.replace(' ', '').replace('\r\n', '').replace(',', ''))

    # Build the record after the cell loop. Building it inside the loop meant a
    # row with fewer than two <td> cells silently re-appended the previous row's
    # record (or raised NameError on the first row).
    data.append(dict(zip(cols, rows)))

res = {
    'meta': {
        'desc': '지역별 코로나 예방접종 인구 현황',
        'cols': {
            'loc': '지역',
            'v1': '1차 접종자 수',
            'v2': '2차 접종자 수',
            'v3': '3차 접종자 수',
            'v4': '4차 접종자 수',
        },
        'std_day': std_day,
    },
    'data': data,
}

# NOTE(review): no overwrite flag is passed, so a second run for the same day is
# expected to fail on the existing file — confirm whether that is intended.
client.write(file_dir + file_name, json.dumps(res, ensure_ascii=False), encoding='utf-8')
