## 01. Import

In [30]:
import os
import requests #Used to service API connection
from lxml import html #Used to parse XML
from bs4 import BeautifulSoup #Used to read XML table on webpage
import pandas as pd
from pandas.io.json import json_normalize

## 02. 함수 정의

#### 　　　　　02-1. URL 생성 함수

In [31]:
def makeURL(myUrl, op, myKey, sigunguCd, bjdongCd):
    # myUrl = "http://192.168.1.120/index.php?"
    url = myUrl + '/'+ op + "?ServiceKey=" + myKey + "&sigunguCd=" + sigunguCd + '&bjdongCd=' + bjdongCd
    url = url.rstrip('&')
    return url

#### 　　　　　02-2. 폴더 생성 함수

In [32]:
def createFolder(directory):
    try:
        if not os.path.exists(directory):
            os.makedirs(directory)
    except OSError:
        print ('Error: Creating directory. ' +  directory)

#### 　　　　　02-3. xml 데이터 추출 함수 ( 우리가 뽑아낼 데이터는 xml이기때문에 xml빼고는 삭제함)

In [33]:
def xmlProcess(url):
    response = requests.get(url)
    # Check if page is up
    if response.status_code == 200:
        # Convert webpage to %Data
        Data = BeautifulSoup(response.text, 'lxml-xml')
        result = []
        rows = 0
        columnName = []
        # search Item all item tag
        iterData = Data.find_all('item')
        for item in iterData:
            item_list = []
            # Fill the value in one row
            for tag in item.find_all():
                try:
                    tagname = tag.name
                    if rows == 0:
                        columnName.append(tagname)
                    item_list.append(item.find(tagname).text)
                except Exception as e:
                    print("This row will be ignored. ", item_list)
            if len(item_list) == 0:
                return pd.DataFrame()
            rows = rows + 1
            result.append(item_list)

    finalResult = pd.DataFrame(result)
    finalResult.columns = columnName

    return finalResult

## 03. 데이터 추출 및 정제

#### 　　　　　03-1. Google docs의 데이터를 받아오기

In [34]:
dataList = pd.read_csv("https://docs.google.com/spreadsheets/d/1VngqG-m7G8k1587c21MZoheR1Fz3amp1mJtiBvA1Jb0/export?format=csv&gid=0")

# dataList라는 변수안에 받아온 데이터를 저장 ( Google docs의 '사이트'칼럼의 길이를 출력한다 )
print("### The total number of target data is " + str(len(dataList)))

### The total number of target data is 417


In [35]:
dataList = dataList[   dataList['사이트'].notnull() ]

# dataList라는 변수안에 받아온 '사이트'칼럼의 데이터가 null값이라면 저장하지않고 null값이 아니라면 저장한다 
# ( 최종적으로 '사이트'칼럼에 데이터가 있는 부분만 저장이 되어 출력 )
print(dataList[["사이트"]])
print("### The total number of filtered data is " + str(len(dataList)))

# dataList라는 변수에 null값들을 빈 문자열을 넣어 채워주고 인덱스를 초기화 해준다.
dataList = dataList.fillna("")
dataList = dataList.reset_index(drop=True)

                                                   사이트
305  https://www.data.go.kr/dataset/fileDownload.do...
306  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
308  http://apis.data.go.kr/1611000/nsdi/BuildingUs...
313         https://data.myhome.go.kr/rentalHouseList?
318  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
319  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
340      http://apis.data.go.kr/1611000/ArchPmsService
342        http://fsc.go.kr/downManager?bbsid=BBS0069&
345  http://apis.data.go.kr/1611000/ArchPmsService/...
349  http://data.insight.go.kr/openapi/service/Pric...
352  http://openapi.molit.go.kr/OpenAPI_ToolInstall...
353  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
354  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
355  https://www.data.go.kr/dataset/fileDownload.do...
359  http://openapi.kab.co.kr/OpenAPI_ToolInstallPa...
360  http://api.hf.go.kr:8090/service/rest/HfMbsInt...
361  http://api.hf.go.kr:8090/service/rest/HfMbbInt...
363  https

#### 　　　　　03-2. Google docs에서 지정된 이름으로 폴더를 생성

In [42]:
outPath = "../data/outbound/"

# folderList라는 변수 안에 Google docs에서 받아온 '폴더명'칼럼의 데이터를 List형태로 저장함
folderList = dataList["폴더명"].tolist()

# folderList의 길이만큼 for문을 돌면서 폴더를 생성
for i in folderList:
    createFolder(outPath+i)

#### 　　　　　03-3. 데이터 정제에 필요한 정보들을 변수화 하여 입력

In [37]:
inputUrl = 'http://apis.data.go.kr/1611000/ArchPmsService'
inputKey = "ufe1cXhfaa3SBtlsFh402kse1ctLxx0%2BxZU0NDXcW6KST%2BIRc2HMgfJMC2%2BJ9vwZqCjCZvIyvusJfRZdESfOIA%3D%3D"
inputFolder = "340_건축인허가 정보"
inputFile = "getApBasisOulnInfo&getApDongOulnInfo&getApFlrOulnInfo&getApHoOulnInfo&getApImprprInfo&getApHdcrMgmRgstInfo&getApDemolExtngMgmRgstInfo&getApTmpBldInfo&getApWclfInfo&getApPklotInfo&getApAtchPklotInfo&getApExposPubuseAreaInfo&getApHoExposPubuseAreaInfo&getApJijiguInfo&getApRoadRgstInfo&getApPlatPlcInfo&getApHsTpInfo"
inputFile = inputFile.split('&')
inputDataType = "xml"

# inputFile변수는 필요한 모든 Operater들을 &기호로 분류하여 Google docs에 입력하였기 때문에 정제를 위해 split 함수로 쪼개준다.
# 최종적으로 inputFile 이라는 변수는 List형태로 저장됨
print(inputFile)

['getApBasisOulnInfo', 'getApDongOulnInfo', 'getApFlrOulnInfo', 'getApHoOulnInfo', 'getApImprprInfo', 'getApHdcrMgmRgstInfo', 'getApDemolExtngMgmRgstInfo', 'getApTmpBldInfo', 'getApWclfInfo', 'getApPklotInfo', 'getApAtchPklotInfo', 'getApExposPubuseAreaInfo', 'getApHoExposPubuseAreaInfo', 'getApJijiguInfo', 'getApRoadRgstInfo', 'getApPlatPlcInfo', 'getApHsTpInfo']


#### 　　　　　03-4. 데이터 정제에 필요한 파일 불러오기

In [38]:
# 참고문헌에서 참조하라고 했던 문서(우리조의 경우 사이트에서 내려받고 csv파일형태로 새로 저장하여주었음)을 데이터프레임화 하기
codeDF = pd.read_csv('../data/inbound/allCode2.csv', encoding='ms949')

# 데이터프레임화 후 '법정동코드'칼럼의 데이터를 추출하여 codeList라는 변수에 List형태로 저장한다.
codeList = codeDF['법정동코드'].astype(str).tolist()

In [50]:
testList = []
for i in inputFile:
    DF = pd.DataFrame()
    for j in codeList:
        sigunguCd = j[:5]
        bjdongCd = j[5:]
        url = makeURL(inputUrl, i, inputKey, sigunguCd, bjdongCd)
        try:
            tempDF = xmlProcess(url)
            DF = pd.concat([DF, tempDF], sort = False, axis = 0)
        except Exception as e:
            print(url)
            
    resultName = outPath + inputFolder + '/' + i + '.csv'
    DF.to_csv(resultName, encoding = 'ms949', index = False)

In [51]:
testList

NameError: name 'testList' is not defined

In [25]:
response = requests.get('http://apis.data.go.kr/1611000/ArchPmsService/getApPlatPlcInfo?ServiceKey=ufe1cXhfaa3SBtlsFh402kse1ctLxx0%2BxZU0NDXcW6KST%2BIRc2HMgfJMC2%2BJ9vwZqCjCZvIyvusJfRZdESfOIA%3D%3D&sigunguCd=11110&bjdongCd=10600')
response.status_code
Data = BeautifulSoup(response.text, 'lxml-xml')
result = []
rows = 0
columnName = []
# search Item all item tag
iterData = Data.find_all('item')
for item in iterData:
    item_list = []
    # Fill the value in one row
    for tag in item.find_all():
        try:
            tagname = tag.name
            if rows == 0:
                columnName.append(tagname)
            item_list.append(item.find(tagname).text)
        except Exception as e:
            print("This row will be ignored. ", item_list)
    if len(item_list) == 0:
         pd.DataFrame()
    rows = rows + 1
    result.append(item_list)
finalResult = pd.DataFrame(result)
finalResult.columns = columnName
finalResult

Unnamed: 0,bjdongCd,block,bun,crtnDay,hjdongCd,ji,jimokCd,jimokCdNm,lot,mainDongGbCd,mainDongGbCdNm,mgmPlatPlcPk,mgmPmsrgstPk,platGbCd,platPlc,relJibunNm,reprYn,rnum,sigunguCd,splotNm
0,10600,,1,20090326,530,1,8.0,대,,0,전체,11110-100006700,11110-100006488,0,서울특별시 종로구 통의동 1-1번지,,1,1,11110,
1,10600,,1,20090326,530,6,,,,0,전체,11110-100006701,11110-100006488,0,서울특별시 종로구 통의동 1-6번지,,0,2,11110,
2,10600,,1,20140322,530,7,8.0,대,,0,전체,11110-100026025,11110-100019465,0,서울특별시 종로구 통의동 1-7번지,,1,3,11110,
3,10600,,2,20100325,530,1,8.0,대,,0,전체,11110-100012653,11110-100011566,0,서울특별시 종로구 통의동 2-1번지,,1,4,11110,
4,10600,,2,20130327,530,1,8.0,대,,0,전체,11110-100022675,11110-100017287,0,서울특별시 종로구 통의동 2-1번지,,1,5,11110,
5,10600,,2,20170413,530,1,8.0,대,,0,전체,11110-100038544,11110-100011855,0,서울특별시 종로구 통의동 2-1번지,,1,6,11110,
6,10600,,6,20100325,530,0,,,,0,전체,11110-5743,11110-5369,0,서울특별시 종로구 통의동 6번지,,1,7,11110,
7,10600,,6,20130413,530,0,8.0,대,,0,전체,11110-2672,11110-2298,0,서울특별시 종로구 통의동 6번지,,1,8,11110,
8,10600,,7,20091126,530,4,8.0,대,,0,전체,11110-100011816,11110-100011144,0,서울특별시 종로구 통의동 7-4번지,,1,9,11110,
9,10600,,7,20100325,530,5,,,,0,전체,11110-5985,11110-5611,0,서울특별시 종로구 통의동 7-5번지,,1,10,11110,


In [27]:
response = requests.get('http://apis.data.go.kr/1611000/ArchPmsService/getApPlatPlcInfo?ServiceKey=ufe1cXhfaa3SBtlsFh402kse1ctLxx0%2BxZU0NDXcW6KST%2BIRc2HMgfJMC2%2BJ9vwZqCjCZvIyvusJfRZdESfOIA%3D%3D&sigunguCd=11110&bjdongCd=10700')
response.status_code
Data = BeautifulSoup(response.text, 'lxml-xml')
result = []
rows = 0
columnName = []
# search Item all item tag
iterData = Data.find_all('item')
for item in iterData:
    item_list = []
    # Fill the value in one row
    for tag in item.find_all():
        try:
            tagname = tag.name
            if rows == 0:
                columnName.append(tagname)
            item_list.append(item.find(tagname).text)
        except Exception as e:
            print("This row will be ignored. ", item_list)
    if len(item_list) == 0:
         pd.DataFrame()
    rows = rows + 1
    result.append(item_list)
finalResult2 = pd.DataFrame(result)
finalResult2.columns = columnName
finalResult2

Unnamed: 0,bjdongCd,block,bun,crtnDay,hjdongCd,ji,jimokCd,jimokCdNm,lot,mainDongGbCd,mainDongGbCdNm,mgmPlatPlcPk,mgmPmsrgstPk,platGbCd,platPlc,relJibunNm,reprYn,rnum,sigunguCd,splotNm
0,10700,,1,20090326,530,0.0,8.0,대,,0,전체,11110-2816,11110-2442,0,서울특별시 종로구 적선동 1번지,,1,1,11110.0,
1,10700,,1,20100325,530,0.0,,,,0,전체,11110-6737,11110-6363,0,서울특별시 종로구 적선동 1번지,,1,2,11110.0,
2,10700,,2,20090326,0,,,,1.0,동별,11110-2975,11110-10849,0,서울특별시 종로구 적선동 2번지,2,0.0,3,11110,,
3,10700,,3,20090326,0,,,,1.0,동별,11110-2975,11110-10850,0,서울특별시 종로구 적선동 3번지,3,0.0,4,11110,,
4,10700,,4,20090326,0,,,,1.0,동별,11110-2975,11110-10851,0,서울특별시 종로구 적선동 4번지,4,0.0,5,11110,,
5,10700,,5,20090326,1,,,,1.0,동별,11110-2975,11110-10852,0,서울특별시 종로구 적선동 5-1번지,5-1,0.0,6,11110,,
6,10700,,5,20090326,2,,,,1.0,동별,11110-2975,11110-10853,0,서울특별시 종로구 적선동 5-2번지,5-2,0.0,7,11110,,
7,10700,,6,20090326,0,,,,1.0,동별,11110-2975,11110-10854,0,서울특별시 종로구 적선동 6번지,6,0.0,8,11110,,
8,10700,,7,20090326,0,,,,1.0,동별,11110-2975,11110-10855,0,서울특별시 종로구 적선동 7번지,7,0.0,9,11110,,
9,10700,,8,20090326,0,,,,1.0,동별,11110-2975,11110-10856,0,서울특별시 종로구 적선동 8번지,8,0.0,10,11110,,


In [29]:
tset = pd.concat([finalResult, finalResult2], axis = 0, sort = False)
tset

Unnamed: 0,bjdongCd,block,bun,crtnDay,hjdongCd,ji,jimokCd,jimokCdNm,lot,mainDongGbCd,mainDongGbCdNm,mgmPlatPlcPk,mgmPmsrgstPk,platGbCd,platPlc,relJibunNm,reprYn,rnum,sigunguCd,splotNm
0,10600,,1,20090326,530,1.0,8.0,대,,0,전체,11110-100006700,11110-100006488,0,서울특별시 종로구 통의동 1-1번지,,1,1,11110.0,
1,10600,,1,20090326,530,6.0,,,,0,전체,11110-100006701,11110-100006488,0,서울특별시 종로구 통의동 1-6번지,,0,2,11110.0,
2,10600,,1,20140322,530,7.0,8.0,대,,0,전체,11110-100026025,11110-100019465,0,서울특별시 종로구 통의동 1-7번지,,1,3,11110.0,
3,10600,,2,20100325,530,1.0,8.0,대,,0,전체,11110-100012653,11110-100011566,0,서울특별시 종로구 통의동 2-1번지,,1,4,11110.0,
4,10600,,2,20130327,530,1.0,8.0,대,,0,전체,11110-100022675,11110-100017287,0,서울특별시 종로구 통의동 2-1번지,,1,5,11110.0,
5,10600,,2,20170413,530,1.0,8.0,대,,0,전체,11110-100038544,11110-100011855,0,서울특별시 종로구 통의동 2-1번지,,1,6,11110.0,
6,10600,,6,20100325,530,0.0,,,,0,전체,11110-5743,11110-5369,0,서울특별시 종로구 통의동 6번지,,1,7,11110.0,
7,10600,,6,20130413,530,0.0,8.0,대,,0,전체,11110-2672,11110-2298,0,서울특별시 종로구 통의동 6번지,,1,8,11110.0,
8,10600,,7,20091126,530,4.0,8.0,대,,0,전체,11110-100011816,11110-100011144,0,서울특별시 종로구 통의동 7-4번지,,1,9,11110.0,
9,10600,,7,20100325,530,5.0,,,,0,전체,11110-5985,11110-5611,0,서울특별시 종로구 통의동 7-5번지,,1,10,11110.0,


In [None]:

# Check if page is up
if response.status_code == 200:
# Convert webpage to %Data





return finalResult