In [1]:
import requests
from bs4 import BeautifulSoup
import time
import re
import pandas as pd

#### 1. 조회 변수 설정

##### 1.1. 조회 변수 설정

In [2]:
# Today's date (YYYY/MM/DD) <- this is the only value that needs to be changed per run.
currentDate = '2022/02/12'

##### 1.2. 고정 상수 설정

In [3]:
# Query period start <- (!) data is only valid from 2016/09/03 onward, so this value stays fixed
inputFromDt = '2016/09/03'

# Query period end <- kept identical to today's date (currentDate)
inputToDt = currentDate

# First page
currentPageNo = 1

# Page size (fixed); sent as the recordCountPerPage query parameter
recordCountPerPage = 100

#### 2. 페이지 정보 가져오기

In [4]:
# List-page URL for the search window configured above. The query string is
# split over adjacent literals purely for readability; they concatenate into
# one single string.
url = (
    "https://www.g2b.go.kr:8070/um/injustice/injusticeBizerList.do"
    "?bizRegNo=&callbackFunction=&chfNm="
    f"&currentDate={currentDate}"
    f"&currentPageNo={currentPageNo}"
    f"&inputFromDt={inputFromDt}"
    f"&inputToDt={inputToDt}"
    "&isOver=2&othr=&pcontKindCd=&periodChk=choise&popYn="
    f"&recordCountPerPage={recordCountPerPage}"
    "&reqCl=&sanctionInstCd=&sanctionInstNm=&searchUseYn=Y&whereAreYouFrom=ALL"
)

In [None]:
# Fetch the first list page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() reports an HTTP error here
# instead of letting an error page reach the parser.
r = requests.get(url, timeout=30)
r.raise_for_status()
# Preview only the beginning of the body; the full HTML is large.
r.text[:500]

In [None]:
# Parse the fetched list page. Only the <title> is shown as a quick sanity
# check; echoing the whole parsed document would flood the cell output.
soup = BeautifulSoup(r.text, 'html.parser')
soup.title

In [8]:
def get_full_pagination(soup):
    """
    Collect the page numbers linked from the pagination bar.

    Looks up the first ``<div id="pagination">`` in ``soup``, reads the
    ``href`` of every ``<a>`` inside it and extracts the digits that follow
    ``currentPageNo=``.

    Parameters
    ----------
    soup : object exposing BeautifulSoup's ``find_all`` API
        Parsed list page.

    Returns
    -------
    list of int
        Distinct page numbers in ascending order. Empty when the page has no
        pagination block or none of its links carries a page number.
    """

    # Capture digits only, so query parameters that follow currentPageNo are
    # not swallowed into the page number (which would break int()).
    p = re.compile(r"currentPageNo=(\d+)")

    pagination_blocks = soup.find_all('div', {'id': 'pagination'})
    if not pagination_blocks:
        return []

    page_numbers = set()
    for anchor in pagination_blocks[0].find_all('a'):
        # Anchors without an href, or whose href has no page number, are skipped.
        match = p.search(anchor.get('href') or '')
        if match:
            page_numbers.add(int(match.group(1)))

    return sorted(page_numbers)

In [9]:
# The first page has to be added by hand because it is not among the links
# returned here. Building the complete 1..last range from the highest linked
# page number does that without ever duplicating page 1, and also fills any
# gap should the bar link only a subset of the pages.
linked_pages = get_full_pagination(soup)
full_pagination = list(range(1, max(linked_pages, default=1) + 1))
full_pagination


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

#### 3. 가져온 페이지 정보로 페이지 내 항목을 조회

In [10]:
def url_get_page_result_list(url, timeout=30):
    """
    Fetch one list page and return its rows as dictionaries.

    Every ``<tbody>`` of the page is treated as one record: the stripped text
    of its ``<td>`` cells is paired positionally with a fixed list of column
    names, and a detail-page URL built from the row's ``a.btn_mdl`` link is
    stored under ``'상태조회'``.

    Parameters
    ----------
    url : str
        URL of the list page to download.
    timeout : int or float, optional
        Seconds passed to ``requests.get`` as its timeout (default 30).

    Returns
    -------
    list of dict
        One dict per row that carries a detail link.

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or an HTTP error status.
    IndexError
        If a detail link's href does not contain the expected
        ``toOnDetail('<no>','<seq>'`` arguments.
    """

    # Both values are taken from the href of the detail link:
    # first quoted argument -> injusticeNo, next quoted argument -> injusticeModseq.
    p_injusticeNo = re.compile(r"toOnDetail\('(.*?)'")
    p_injusticeModseq = re.compile(r",'(.*?)'")

    header_line_item = ['_dummy1', 'No.', '상호명', '사업자등록번호', '원인부정당정보', '제재기관', '제재시작일자', '대표자명','_dummy2']

    result = []

    # Without a timeout a stalled connection blocks the crawl indefinitely.
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    for line_item in soup.find_all('tbody'):
        detail_link = line_item.find('a', {'class': 'btn_mdl'})
        if detail_link is None:
            # No detail link in this <tbody>, so there is nothing to build a
            # detail URL from; subscripting None here used to raise TypeError.
            continue

        cell_texts = [cell.text.strip() for cell in line_item.find_all('td')]
        record = dict(zip(header_line_item, cell_texts))

        href = detail_link['href']
        injusticeNo = re.findall(p_injusticeNo, href)[0]
        injusticeModseq = re.findall(p_injusticeModseq, href)[0]

        url_detail = f'https://www.g2b.go.kr:8070/um/injustice/injusticeBizerDtl.do?flag=view&injusticeNo={injusticeNo}&injusticeModseq={injusticeModseq}&whereAreYouFrom=ALL'
        record['상태조회'] = url_detail

        result.append(record)

    # Pause between requests to avoid hammering the server.
    time.sleep(1)
    return result

In [11]:
# Download every list page and keep one DataFrame per page.
result = []
for currentPageNo in full_pagination:
    # Same query as the first-page URL; only currentPageNo varies.
    url = (
        "https://www.g2b.go.kr:8070/um/injustice/injusticeBizerList.do"
        "?bizRegNo=&callbackFunction=&chfNm="
        f"&currentDate={currentDate}"
        f"&currentPageNo={currentPageNo}"
        f"&inputFromDt={inputFromDt}"
        f"&inputToDt={inputToDt}"
        "&isOver=2&othr=&pcontKindCd=&periodChk=choise&popYn="
        f"&recordCountPerPage={recordCountPerPage}"
        "&reqCl=&sanctionInstCd=&sanctionInstNm=&searchUseYn=Y&whereAreYouFrom=ALL"
    )

    page_rows = url_get_page_result_list(url)
    result.append(pd.DataFrame(page_rows))

In [12]:
# Stack the per-page frames and remove the '_dummy1' / '_dummy2' placeholder columns.
df_concat = pd.concat(result).drop(columns=['_dummy1', '_dummy2'])

In [15]:
# Persist the combined list (row index omitted), then preview the first rows.
df_concat.to_excel(excel_writer='./sanction_list_result.xlsx', index=False)
df_concat.head(n=5)

Unnamed: 0,No.,상호명,사업자등록번호,원인부정당정보,제재기관,제재시작일자,대표자명,상태조회
0,1,주식회사 이조건설,2648102635,2648102635 (주식회사 이조건설),조달청,2022/02/12,이창훈,https://www.g2b.go.kr:8070/um/injustice/injust...
1,2,코스텍유한회사,3118125477,3118125477 (코스텍유한회사),조달청,2022/02/12,임종길,https://www.g2b.go.kr:8070/um/injustice/injust...
2,3,코스텍유한회사서천지점,5028542634,3118125477 (코스텍유한회사),조달청,2022/02/12,,https://www.g2b.go.kr:8070/um/injustice/injust...
3,4,(주)현진테크,1968600232,1968600232 ((주)현진테크),조달청,2022/02/12,오연숙,https://www.g2b.go.kr:8070/um/injustice/injust...
4,5,상아문화사,4410301986,4410301986 (상아문화사),조달청,2022/02/12,김성근,https://www.g2b.go.kr:8070/um/injustice/injust...


#### 4. 페이지 내 항목의 상세항목 조회

In [16]:
def _detail_field(soup, label):
    """Return the text of the ``<td>`` following the ``<th>`` whose string equals ``label``, or None when absent."""
    header = soup.find('th', string=label)
    if header is None:
        return None
    cell = header.find_next_sibling('td')
    return None if cell is None else cell.text


def url_get_page_detail_result_tuple(url_detail, timeout=30):
    """
    Fetch one detail page and extract its three data sections.

    Parameters
    ----------
    url_detail : str
        URL of the detail page.
    timeout : int or float, optional
        Seconds passed to ``requests.get`` as its timeout (default 30).

    Returns
    -------
    tuple
        ``([fields], corp_restricted, history)`` where

        * ``fields`` is a dict of the labelled header values (a value is
          ``None`` when its ``<th>`` label is not on the page),
        * ``corp_restricted`` is the list returned by
          ``soup_get_page_detail_result_corp_restricted_list`` or ``None``
          when the page has no matching table,
        * ``history`` is the list returned by
          ``soup_get_page_detail_result_history_list``.

    Raises
    ------
    requests.RequestException
        On connection failure, timeout, or an HTTP error status.
    """

    # Without a timeout a stalled connection blocks the crawl indefinitely.
    r = requests.get(url_detail, timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # result key -> <th> label to look up, in output column order.
    field_labels = {
        '문서번호': '문서번호',
        '게재일시': '게재일시',
        '법인등록번호': '법인등록번호',
        '제재근거': '제재근거',
        '조항호': '조항호',
        '조항호코드명': '조항호코드명',
        # NOTE(review): this key reads the '만료년월일' header, so it always
        # duplicates the '만료년월일' value below. It looks like a copy-paste
        # slip; the intended header label is not visible here, so the lookup
        # is left unchanged. Confirm against the page and correct.
        '조항호코드명_전체': '만료년월일',
        '시행규칙76조별표2': '시행규칙76조별표2',
        '시행규칙76조별표2명': '시행규칙76조별표2명',
        '제재년월일': '제재년월일',
        '만료년월일': '만료년월일',
        '제재기간': '제재기간',
        '제재기간일수': '제재기간일수',
        '계약종류': '계약종류',
    }
    result = {key: _detail_field(soup, label) for key, label in field_labels.items()}

    # Only this field is whitespace-stripped, as before.
    if result['제재기간'] is not None:
        result['제재기간'] = result['제재기간'].strip()

    # Check whether other businesses were sanctioned because they share the
    # corporate registration number of the original sanctioned business.
    is_corp_restricted = soup.find('table', {'summary':re.compile('법인제재 표시 테이블.*')})
    if is_corp_restricted is None:
        result_is_corp_restricted = None
    else:
        result_is_corp_restricted = soup_get_page_detail_result_corp_restricted_list(is_corp_restricted)

    # Sanction history rows.
    is_history = soup.find_all('tbody', {'onmouseover':"this.className='on'"})
    result_is_history = soup_get_page_detail_result_history_list(is_history)

    # Pause between requests to avoid hammering the server.
    time.sleep(1)
    return ([result], result_is_corp_restricted, result_is_history)
    

In [17]:
def soup_get_page_detail_result_corp_restricted_list(soup):
    """
    Extract the rows of the [ corporate sanction ] table of a detail page.

    This table is optional on a detail page; the caller passes the table
    element it found (not the whole document).

    Parameters
    ----------
    soup : object exposing BeautifulSoup's ``find_all`` API
        The table element whose ``<tr>`` rows are read.

    Returns
    -------
    list of dict
        One dict per row that has at least one ``<td>`` cell, mapping the
        fixed column names positionally to the stripped cell texts. Rows
        without ``<td>`` cells (for example a header row made of ``<th>``)
        are skipped; they used to yield empty dicts that turned into all-NaN
        rows downstream.
    """

    # NOTE(review): '상호명.' carries a trailing dot, unlike '상호명' used for
    # the list page. Kept as is because it is the exported column name.
    header_line_item = ['사업자등록번호', '상호명.', '본사여부']

    result = []

    for line_item in soup.find_all('tr'):
        cell_texts = [cell.text.strip() for cell in line_item.find_all('td')]
        if not cell_texts:
            continue
        result.append(dict(zip(header_line_item, cell_texts)))

    return result

In [18]:
def soup_get_page_detail_result_history_list(soup):
    """
    Extract the rows of the [ sanction history ] section of a detail page.

    Parameters
    ----------
    soup : iterable of elements exposing ``find_all``
        Despite the name this is not a whole document: the caller passes the
        result of ``soup.find_all('tbody', {'onmouseover': "this.className='on'"})``,
        one element per history row.

    Returns
    -------
    list of dict
        One dict per element, pairing the fixed column names positionally
        with the stripped text of the element's ``<td>`` cells.
    """
    column_names = ['_dummy1', '부정당번호/차수', '통보일시', '제재구분', '통보/정정일자', '정지일자', '재개일자', '해제일자', '삭제일자', '_dummy2', '사유']

    return [
        dict(zip(column_names, (cell.text.strip() for cell in history_row.find_all('td'))))
        for history_row in soup
    ]

In [19]:
"""
WinError 10060 발생으로 인한 df_concat_partial 쪼개기
-> 300개씩 쪼갰을 때 문제 없음
"""
# Window of detail URLs handled by this run. Earlier runs used the windows
# [0:300], [300:600] and [600:900]; a stop of None means "to the end".
partial_start, partial_stop = 900, None
df_concat_partial = df_concat['상태조회'].iloc[partial_start:partial_stop]

detail_result_tmp = []
detail_result_is_corp_restricted_tmp = []
detail_result_is_history_tmp = []

for url_detail in df_concat_partial:
    detail_result, detail_result_is_corp_restricted, detail_result_is_history = url_get_page_detail_result_tuple(url_detail)

    # Each frame is tagged with the detail URL it was read from.
    df_detail_result = pd.DataFrame(detail_result)
    df_detail_result['상태조회'] = url_detail
    detail_result_tmp.append(df_detail_result)

    # The corporate-sanction table is optional; None means the page had none.
    if detail_result_is_corp_restricted is not None:
        df_detail_result_is_corp_restricted = pd.DataFrame(detail_result_is_corp_restricted)
        df_detail_result_is_corp_restricted['상태조회'] = url_detail
        detail_result_is_corp_restricted_tmp.append(df_detail_result_is_corp_restricted)

    df_detail_result_is_history = pd.DataFrame(detail_result_is_history)
    df_detail_result_is_history['상태조회'] = url_detail
    # Already a DataFrame, so it is appended directly instead of being re-wrapped.
    detail_result_is_history_tmp.append(df_detail_result_is_history)

In [20]:
# Stack the per-URL header frames and preview the last row.
df_concat_detail_result = pd.concat(objs=detail_result_tmp)
df_concat_detail_result.tail(n=1)

Unnamed: 0,문서번호,게재일시,법인등록번호,제재근거,조항호,조항호코드명,조항호코드명_전체,시행규칙76조별표2,시행규칙76조별표2명,제재년월일,만료년월일,제재기간,제재기간일수,계약종류,상태조회
0,1270000,2017/01/13 16:59,1342110122583,국가를당사자로하는계약에관한법률 제27조제1항,법27조01항08호 령76조01항02호 가목 - 110000015,"계약의 불체결.불이행,주요계약조건 위반",2017/03/11,16가,가. 계약을 체결 또는 이행(하자보수의무의 이행을 포함한다)하지 아니한 자,2016/12/12,2017/03/11,3 월,0 일,물품구매,https://www.g2b.go.kr:8070/um/injustice/injust...


In [21]:
# Combine the per-URL corporate-sanction frames. When no detail page had such
# a table, fall back to an empty frame with the expected columns: None would
# raise AttributeError on the .tail() preview below and on the later
# .to_excel() export.
if detail_result_is_corp_restricted_tmp:
    df_concat_detail_result_is_corp_restricted = pd.concat(detail_result_is_corp_restricted_tmp)
else:
    df_concat_detail_result_is_corp_restricted = pd.DataFrame(
        columns=['사업자등록번호', '상호명.', '본사여부', '상태조회']
    )
df_concat_detail_result_is_corp_restricted.tail(1)

Unnamed: 0,사업자등록번호,상태조회,상호명.,본사여부
1,2128529330,https://www.g2b.go.kr:8070/um/injustice/injust...,(주)더팜지점,지사


In [22]:
# Stack the per-URL history frames and remove the '_dummy1' / '_dummy2'
# placeholder columns.
df_concat_detail_result_is_history = (
    pd.concat(detail_result_is_history_tmp)
    .drop(columns=['_dummy1', '_dummy2'])
)

In [23]:
# Write the three result tables of this partial run (suffix _4), without the row index.
for frame, path in (
    (df_concat_detail_result, './sanction_list_detail_result_4.xlsx'),
    (df_concat_detail_result_is_corp_restricted, './sanction_list_detail_result_corp_restricted_4.xlsx'),
    (df_concat_detail_result_is_history, './sanction_list_detail_result_history_4.xlsx'),
):
    frame.to_excel(path, index=False)

In [24]:
##### 5. Merge the detail-query frames

# Reload the four partial exports (_1 .. _4) of each table:
# a* = detail headers, b* = corporate-sanction rows, c* = sanction history.
a1, a2, a3, a4 = [
    pd.read_excel(f'./sanction_list_detail_result_{part}.xlsx')
    for part in range(1, 5)
]
b1, b2, b3, b4 = [
    pd.read_excel(f'./sanction_list_detail_result_corp_restricted_{part}.xlsx')
    for part in range(1, 5)
]
c1, c2, c3, c4 = [
    pd.read_excel(f'./sanction_list_detail_result_history_{part}.xlsx')
    for part in range(1, 5)
]


FileNotFoundError: [Errno 2] No such file or directory: './sanction_list_detail_result_1.xlsx'

In [None]:
# Detail headers: stack the partial exports, drop exact duplicate rows, export.
detail_all = pd.concat([a1, a2, a3, a4])
detail_all.drop_duplicates().to_excel('./sanction_list_detail_result.xlsx', index=False)

# Corporate-sanction rows: keep only rows that have a business registration
# number before de-duplicating.
corp_restricted_all = pd.concat([b1, b2, b3, b4])
has_biz_reg_no = corp_restricted_all['사업자등록번호'].notnull()
corp_restricted_all[has_biz_reg_no].drop_duplicates().to_excel('./sanction_list_detail_result_corp_restricted.xlsx', index=False)

# Sanction history: stack, de-duplicate, export.
history_all = pd.concat([c1, c2, c3, c4])
history_all.drop_duplicates().to_excel('./sanction_list_detail_result_history.xlsx', index=False)