In [1]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm 
import pandas as pd
import requests
import re
import time
import KCycle_crawller as KC

In [2]:
driver = webdriver.Chrome()

# 해당 연도에서 추출할 경기들 리스트

In [3]:
year_list = ['2017', '2018', '2019', '2020', '2021', '2022', '2023', '2024']
연도 = year_list[0]

url = f'https://www.kcycle.or.kr/contents/information/fixedChuljuPage.do?stndYear={연도}&tms_dayOrd=01-1'
driver.get(url)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

In [4]:
base_raw = soup.find('select', attrs={'name': 'tms_dayOrd'})
data_list = [td.get_text(strip=True) for td in base_raw.find_all('option')]

race_day_list = []
for item in data_list:
    match = re.search(r'\((\d+)회 (\d+)일\)', item)
    if match:
        day = int(match.group(2))
        if 1 <= day <= 3:
            race_day_list.append((match.group(1), match.group(2)))

race_day_list.reverse()
race_day_list

[('01', '1'),
 ('01', '2'),
 ('01', '3'),
 ('02', '1'),
 ('02', '2'),
 ('02', '3'),
 ('03', '1'),
 ('03', '2'),
 ('03', '3'),
 ('04', '1'),
 ('04', '2'),
 ('04', '3'),
 ('05', '1'),
 ('05', '2'),
 ('05', '3'),
 ('06', '1'),
 ('06', '2'),
 ('06', '3'),
 ('07', '1'),
 ('07', '2'),
 ('07', '3'),
 ('08', '1'),
 ('08', '2'),
 ('08', '3'),
 ('09', '1'),
 ('09', '2'),
 ('09', '3'),
 ('10', '1'),
 ('10', '2'),
 ('10', '3'),
 ('11', '1'),
 ('11', '2'),
 ('11', '3'),
 ('12', '1'),
 ('12', '2'),
 ('12', '3'),
 ('13', '1'),
 ('13', '2'),
 ('13', '3'),
 ('14', '1'),
 ('14', '2'),
 ('14', '3'),
 ('15', '1'),
 ('15', '2'),
 ('15', '3'),
 ('16', '1'),
 ('16', '2'),
 ('16', '3'),
 ('17', '1'),
 ('17', '2'),
 ('17', '3'),
 ('18', '1'),
 ('18', '2'),
 ('18', '3'),
 ('19', '1'),
 ('19', '2'),
 ('19', '3'),
 ('20', '1'),
 ('20', '2'),
 ('20', '3'),
 ('21', '1'),
 ('21', '2'),
 ('21', '3'),
 ('22', '1'),
 ('22', '2'),
 ('22', '3'),
 ('23', '1'),
 ('23', '2'),
 ('23', '3'),
 ('24', '1'),
 ('24', '2'),
 ('24'

### URL 접속

In [5]:
회차, 일차 = race_day_list[0]
print(f'회차: {회차}, 일차: {일차}')

url = f'https://www.kcycle.or.kr/contents/information/fixedChuljuPage.do?stndYear={연도}&tms_dayOrd={회차}-{일차}'
driver.get(url)

html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

회차: 01, 일차: 1


In [6]:
strong_text = soup.find('strong').text

# 날짜 패턴 검색 (연도 포함)
date_match = re.search(r'(\d{4})년 (\d{2})월 (\d{2})일', strong_text)
if date_match:
    # 'YYMMDD' 형식으로 포맷 (연도의 마지막 두 자리만 사용)
    year_last_two = date_match.group(1)[-2:]  # 연도의 마지막 두 자리
    month = date_match.group(2)  # 월
    day = date_match.group(3)    # 일
    Date = year_last_two + month + day
else:
    Date = None

print(Date)

170106


In [7]:
base_raw = soup.find('div', class_='turn badge-group mt15')
race_string = [td.get_text(strip=True) for td in base_raw.find_all('span')]
race_string

numbers = [re.search(r'\d+', item).group() for item in race_string if '광' in item]
numbers

['01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '10',
 '11',
 '12',
 '13',
 '14']

### 해당 경기가 포함된 태그 찾기 (ex 광명01 포함된 태그 찾기)

출주표 페이지의 html코드가 각 경기마다 같은 구조로 나열되어있기 때문에 해당하는 경기를 찾는게 편함

In [8]:
target_경기 = numbers[13]
div_found = KC.find_race(soup, race_num=target_경기)

### 해당 경기의 출주표 데이터 추출
- 기본 출주표

In [10]:
method = 3
threshold = 3

year_list = ['2017', '2018', '2019', '2020', '2022', '2023', '2024'] # 2021년 데이터는 오류 발생
# year_list = ['2021']

final_data_merged = pd.DataFrame()

for z in range(len(year_list)):
    연도 = year_list[z]
    
    url = f'https://www.kcycle.or.kr/contents/information/fixedChuljuPage.do?stndYear={연도}&tms_dayOrd=01-1'
    driver.get(url)
    
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    
    base_raw = soup.find('select', attrs={'name': 'tms_dayOrd'})
    data_list = [td.get_text(strip=True) for td in base_raw.find_all('option')]
    
    race_day_list = []
    for item in data_list:
        match = re.search(r'\((\d+)회 (\d+)일\)', item)
        if match:
            day = int(match.group(2))
            if 1 <= day <= 3:
                race_day_list.append((match.group(1), match.group(2)))
    
    race_day_list.reverse()
    
    year_data_merged = pd.DataFrame()
    
    pbar = tqdm(range(len(race_day_list)), desc='Initializing')
    for j in range(len(race_day_list)):
        회차, 일차 = race_day_list[j]
        pbar.set_description(f'Processing Year: {연도}, 회차: {회차}, 일차: {일차}')
        pbar.update(1)
        
        # print(f'연도: {연도}, 회차: {회차}, 일차: {일차}')
        
        url = f'https://www.kcycle.or.kr/contents/information/fixedChuljuPage.do?stndYear={연도}&tms_dayOrd={회차}-{일차}'
        driver.get(url)
        
        html = driver.page_source
        soup_race = BeautifulSoup(html, 'html.parser')
    
        strong_text = soup_race.find('strong').text
        
        date_match = re.search(r'(\d{4})년 (\d{2})월 (\d{2})일', strong_text)
        if date_match:
            # 'YYMMDD' 형식
            year_last_two = date_match.group(1)[-2:]
            month = date_match.group(2)  
            day = date_match.group(3)  
            Date = year_last_two + month + day
        else:
            Date = None
    
        base_raw = soup_race.find('div', class_='turn badge-group mt15')
        race_string = [td.get_text(strip=True) for td in base_raw.find_all('span')]
        
        numbers = [re.search(r'\d+', item).group() for item in race_string if '광' in item]
    
        
        day_data_merged = pd.DataFrame()
        processed_races = []
        
        for i in range(len(numbers)):
            try:
                # time.sleep(0.01)
                target_경기 = numbers[i]
                div_found = KC.find_race(soup_race, race_num=target_경기)
                
                date_info = pd.DataFrame({
                    "BASE_YEAR": [연도] * 7,
                    "TME_VALUE": [회차] * 7,
                    "DAY_ORD_VALUE": [일차] * 7,
                    "RACE_NO": [target_경기] * 7,
                    "Date": [Date] * 7
                })
                
                data_base = KC.baseinfo(div_found, method=method, threshold=3)
                if len(data_base) != 7:
                    print(f"Skipping {회차}회차 and {일차}일차 and 광명{target_경기}경주: {len(data_base)}명이 진행한 경기는 제외합니다.")
                    continue
                
                data_training = KC.training_info(div_found)
                
                data_recent_tmp = KC.recent_result(div_found)
                data_recent = KC.recent_result_process(data_recent_tmp)
            
                data_merged = pd.concat([
                    date_info,
                    data_base,
                    data_training[['훈련일수', '훈련내용']],
                    data_recent.iloc[:, 2:],
                ], axis=1)
            
                day_data_merged = pd.concat([day_data_merged, data_merged], ignore_index=True)
                day_data_merged.reset_index(drop=True, inplace=True)
                processed_races.append(target_경기)  # 성공적으로 처리된 경기 저장
            except Exception as e:
                print(f"Error processing race in {회차}회차 and {일차}일차 and 광명{target_경기}경주: {e}")
                continue
        
        url = f'https://www.kcycle.or.kr/contents/information/raceResultPage.do?stndYear={연도}&tms_dayOrd={회차}-{일차}'
        response = requests.get(url)
        html_content = response.text
        soup_result = BeautifulSoup(html_content, 'html.parser')
        # print(processed_races)
        result_df = KC.fetch_race_results(soup_race, soup_result, 연도, 회차, 일차)
        filtered_results = result_df[result_df['RACE_NO'].isin(processed_races)]
        filtered_results.reset_index(drop=True, inplace=True)
        
        day_data_merged = pd.concat([day_data_merged, filtered_results['rank']], axis=1)
        
        year_data_merged = pd.concat([year_data_merged, day_data_merged], ignore_index=True)
    
    pbar.close()
    final_data_merged = pd.concat([final_data_merged, year_data_merged], ignore_index=True)


Initializing:   0%|          | 0/149 [00:00<?, ?it/s]

Error processing race in 08회차 and 1일차 and 광명04경주: Columns must be same length as key
Error processing race in 13회차 and 1일차 and 광명07경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Skipping 17회차 and 1일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 17회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 18회차 and 1일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 18회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 19회차 and 1일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 19회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 20회차 and 1일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 20회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 21회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 22회차 and 2일차 and 광명15경주: 8명이 진행한 경기는 제외합니다.
Skipping 26회차 and 3일차 and 광명16경주: 8명이 진행한 경기는 제외합니다.
Skipping 34회차 and 1일차 and 광명10경주: 8명이 진행한 경기는 제외합니다.
Skipping 34회차 and 1일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 34회차 and 1일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 34회차 and 2일차 and 광명10경주: 8명이 진행한 

Initializing:   0%|          | 0/151 [00:00<?, ?it/s]

Skipping 09회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 09회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Error processing race in 10회차 and 1일차 and 광명06경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Skipping 10회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 10회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 11회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 11회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 12회차 and 1일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 12회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 12회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 13회차 and 1일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 13회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 13회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 14회차 and 1일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 14회차 and 2일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 14회차 and 3일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 15회차 and 1일차 and 광명12경주: 8명이 진행한 경기는 제외합니다.
Skipping 15회차 and 2일차

Initializing:   0%|          | 0/153 [00:00<?, ?it/s]

Skipping 14회차 and 1일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 14회차 and 2일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 14회차 and 3일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 15회차 and 1일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 15회차 and 2일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 15회차 and 3일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 16회차 and 1일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 16회차 and 2일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Skipping 16회차 and 3일차 and 광명11경주: 8명이 진행한 경기는 제외합니다.
Error processing race in 22회차 and 1일차 and 광명14경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 24회차 and 1일차 and 광명10경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 44회차 and 3일차 and 광명10경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'


Initializing:   0%|          | 0/34 [00:00<?, ?it/s]

Error processing race in 02회차 and 1일차 and 광명10경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 04회차 and 1일차 and 광명07경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 43회차 and 1일차 and 광명04경주: Columns must be same length as key


Initializing:   0%|          | 0/154 [00:00<?, ?it/s]

Error processing race in 02회차 and 3일차 and 광명13경주: Columns must be same length as key
Error processing race in 08회차 and 1일차 and 광명09경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 08회차 and 1일차 and 광명11경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 08회차 and 1일차 and 광명12경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 08회차 and 2일차 and 광명14경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 10회차 and 1일차 and 광명08경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 12회차 and 1일차 and 광명11경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 23회차 and 1일차 and 광명13경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType

Initializing:   0%|          | 0/152 [00:00<?, ?it/s]

Error processing race in 05회차 and 1일차 and 광명09경주: Columns must be same length as key
Error processing race in 07회차 and 1일차 and 광명15경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 09회차 and 2일차 and 광명09경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 14회차 and 3일차 and 광명11경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 15회차 and 1일차 and 광명02경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 15회차 and 1일차 and 광명07경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 16회차 and 1일차 and 광명08경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
Error processing race in 17회차 and 2일차 and 광명01경주: int() argument must be a string, a bytes-like object or a number, not 'NoneType

Initializing:   0%|          | 0/53 [00:00<?, ?it/s]

KeyError: 'RACE_NO'

In [19]:
race_day_list

[('01', '1'),
 ('01', '2'),
 ('01', '3'),
 ('02', '1'),
 ('02', '2'),
 ('02', '3'),
 ('03', '1'),
 ('03', '2'),
 ('03', '3'),
 ('04', '1'),
 ('04', '2'),
 ('04', '3'),
 ('05', '1'),
 ('05', '2'),
 ('05', '3'),
 ('06', '1'),
 ('06', '2'),
 ('06', '3'),
 ('07', '1'),
 ('07', '2'),
 ('07', '3'),
 ('08', '1'),
 ('08', '2'),
 ('08', '3'),
 ('09', '1'),
 ('09', '2'),
 ('09', '3'),
 ('10', '1'),
 ('10', '2'),
 ('10', '3'),
 ('11', '1'),
 ('11', '2'),
 ('11', '3'),
 ('12', '1'),
 ('12', '2'),
 ('12', '3'),
 ('13', '1'),
 ('13', '2'),
 ('13', '3'),
 ('14', '1'),
 ('14', '2'),
 ('14', '3'),
 ('15', '1'),
 ('15', '2'),
 ('15', '3'),
 ('16', '1'),
 ('16', '2'),
 ('16', '3'),
 ('17', '1'),
 ('17', '2'),
 ('17', '3'),
 ('18', '1'),
 ('18', '2')]

In [16]:
condition = final_data_merged['RACE_NO'].isna()

final_data_merged[condition]

Unnamed: 0,BASE_YEAR,TME_VALUE,DAY_ORD_VALUE,RACE_NO,Date,번호,선수명,색상,기수,나이,...,1회전2일차등수,1회전2일차전법,1회전3일차등수,1회전3일차전법,금회1일차등수,금회1일차전법,금회2일차등수,금회2일차전법,rank,금회3일차
10990,2017,38,1,01,171006,1,김용진,백,8,40,...,5,,3,마마크,,,,,,
10991,2017,38,1,01,171006,2,장동민,흑,12,36,...,3,마마크,1,추추입,,,,,2,
10992,2017,38,1,01,171006,3,김태원,적,12,38,...,2,마마크,6,,,,,,,
10993,2017,38,1,01,171006,4,용석길,청,3,46,...,7,,4,,,,,,,
10994,2017,38,1,01,171006,5,권용재,황,9,45,...,4,,6,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82504,2023,50,3,15,231224,3,윤현구,적,22,32,...,5,,7,,5,,7,,,
82505,2023,50,3,15,231224,4,류재민,청,15,38,...,4,,7,,6,,6,,,
82506,2023,50,3,15,231224,5,박용범,황,18,35,...,1,추추입,3,추추입,1,추추입,1,추추입,1,
82507,2023,50,3,15,231224,6,박성현,녹,16,38,...,3,마마크,3,마마크,7,,4,,,


In [13]:
final_data_merged

Unnamed: 0,BASE_YEAR,TME_VALUE,DAY_ORD_VALUE,RACE_NO,Date,번호,선수명,색상,기수,나이,...,1회전2일차등수,1회전2일차전법,1회전3일차등수,1회전3일차전법,금회1일차등수,금회1일차전법,금회2일차등수,금회2일차전법,rank,금회3일차
0,2017,01,1,01,170106,1,정영훈,백,6,44,...,4,,3,마마크,,,,,,
1,2017,01,1,01,170106,2,설영석,흑,19,30,...,6,,7,,,,,,3,
2,2017,01,1,01,170106,3,한지혁,적,6,38,...,3,마마크,5,,,,,,2,
3,2017,01,1,01,170106,4,김지은,청,6,44,...,2,마마크,2,마마크,,,,,,
4,2017,01,1,01,170106,5,감병삼,황,9,41,...,7,,3,추추입,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82812,2023,51,3,15,231231,3,류재열,적,19,36,...,1,추추입,2,선선발,1,추추입,2,마마크,,
82813,2023,51,3,15,231231,4,성낙송,청,21,33,...,2,마마크,3,마마크,1,젖젖히기,1,추추입,,
82814,2023,51,3,15,231231,5,황인혁,황,21,35,...,2,추추입,7,,2,추추입,1,젖젖히기,,
82815,2023,51,3,15,231231,6,임채빈,녹,25,32,...,1,선선발,1,선선발,1,선선발,1,추추입,1,


In [17]:
final_data_merged.drop('금회3일차', axis=1).to_csv('KCycle_data.csv', index=False, encoding='utf-8-sig')

# result_df를 수집할때, 제외된 경기도 제외하도록 해야됨

In [39]:
연도, 회차, 일차 = 2017, 8, 1
method = 3
threshold = 3

print(f'연도: {연도}, 회차: {회차}, 일차: {일차}')

url = f'https://www.kcycle.or.kr/contents/information/fixedChuljuPage.do?stndYear={연도}&tms_dayOrd={회차}-{일차}'
driver.get(url)

html = driver.page_source
soup_race = BeautifulSoup(html, 'html.parser')

strong_text = soup_race.find('strong').text

# 날짜 패턴 검색 (연도 포함)
date_match = re.search(r'(\d{4})년 (\d{2})월 (\d{2})일', strong_text)
if date_match:
    # 'YYMMDD' 형식
    year_last_two = date_match.group(1)[-2:]
    month = date_match.group(2)  
    day = date_match.group(3)  
    Date = year_last_two + month + day
else:
    Date = None

base_raw = soup_race.find('div', class_='turn badge-group mt15')
race_string = [td.get_text(strip=True) for td in base_raw.find_all('span')]

numbers = [re.search(r'\d+', item).group() for item in race_string if '광' in item]

day_data_merged = pd.DataFrame()
processed_races = []

for i in range(len(numbers)):
    try:
        target_경기 = numbers[i]
        div_found = KC.find_race(soup_race, race_num=target_경기)
        
        date_info = pd.DataFrame({
            "BASE_YEAR": [연도] * 7,
            "TME_VALUE": [회차] * 7,
            "DAY_ORD_VALUE": [일차] * 7,
            "RACE_NO": [target_경기] * 7,
            "Date": [Date] * 7
        })
        
        data_base = KC.baseinfo(div_found, method=method, threshold=3)
        # print(len(data_base))
        
        data_training = KC.training_info(div_found)
        
        data_recent_tmp = KC.recent_result(div_found)
        data_recent = KC.recent_result_process(data_recent_tmp)
    
        data_merged = pd.concat([
            date_info,
            data_base,
            data_training[['훈련일수', '훈련내용']],
            data_recent.iloc[:, 2:],
        ], axis=1)
    
        day_data_merged = pd.concat([day_data_merged, data_merged], ignore_index=True)
        day_data_merged.reset_index(drop=True, inplace=True)
        processed_races.append(target_경기)
    except Exception as e:
        print(f"Error processing race {target_경기} in 회차 {회차} and 일차 {일차}: {e}")
        continue

url = f'https://www.kcycle.or.kr/contents/information/raceResultPage.do?stndYear={연도}&tms_dayOrd={회차}-{일차}'
response = requests.get(url)
html_content = response.text
soup_result = BeautifulSoup(html_content, 'html.parser')
# print(processed_races)
result_df = KC.fetch_race_results(soup_race, soup_result, 연도, 회차, 일차)
filtered_results = result_df[result_df['RACE_NO'].isin(processed_races)]
filtered_results.reset_index(drop=True, inplace=True)

print(filtered_results.shape, day_data_merged.shape)
day_data_merged = pd.concat([day_data_merged, filtered_results['rank']], axis=1)
print(day_data_merged.shape)

연도: 2017, 회차: 8, 일차: 1
Error processing race 04 in 회차 8 and 일차 1: Columns must be same length as key
(91, 6) (91, 57)
(91, 58)


In [11]:
year, meeting, day = 2017, 1, 1


circle_map = {
    '①': 1, '②': 2, '③': 3, '④': 4,
    '⑤': 5, '⑥': 6, '⑦': 7, '-': None
}

url = f'https://www.kcycle.or.kr/contents/information/raceResultPage.do?stndYear={year}&tms_dayOrd={meeting}-{day}'
response = requests.get(url)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

table = soup.find('div', class_='table pcType')
tbody = table.find('tbody')
# print('1.51.51.5', tbody)
gwang_races = []
for tr in tbody.find_all('tr'):
    if tr.find('td', class_='gwang'):
        gwang_races.append(tr)
print('222:', gwang_races)
all_result = pd.DataFrame()

for race in gwang_races:
    print('333:', race)
    race_data = [td.get_text(strip=True) for td in race.find_all('td')]
    numbers = [circle_map[data[0]] for data in race_data[1:4] if data[0] in circle_map]

    # df = pd.DataFrame({
    #     "BASE_YEAR": [year] * 7,
    #     "TME_VALUE": [meeting] * 7,
    #     "DAY_ORD_VALUE": [day] * 7,
    #     "RACE_NO": [race] * 7,
    #     'line': range(1, 8),  # Assuming a maximum of 8 lines per race
    #     'rank': [None] * 7  # Create 8 rows initialized with None
    # })
    print('444:', race_data)
    df = pd.DataFrame({
        'line': range(1, 8),  # Assuming a maximum of 8 lines per race
        'rank': [None] * 7,  # Create 7 rows initialized with None
        'BASE_YEAR': [year] * 7,
        'TME_VALUE': [meeting] * 7,
        'DAY_ORD_VALUE': [day] * 7,
        'RACE_NO': [race_data[0].split()[-1]] * 7  # Assuming the race number is included in the first td
    })
    
    for idx, num in enumerate(numbers):
        if num is not None:
            df.at[num-1, 'rank'] = idx + 1

    all_result = pd.concat([all_result, df], ignore_index=True)

1.51.51.5 <tbody>
<tr>
<td class="chang">
<a href="/contents/information/raceResultDetailPage.do?stndYear=2017&amp;raceDt=0106&amp;tms=1&amp;tmsNoNum=1&amp;dayOrd=1&amp;meetCd=002&amp;raceNo=01&amp;allYN=N&amp;chkRaceNo=1">창원01</a>
<br/>
</td>
<td>①주석진</td>
<td>⑦양진우</td>
<td>⑤고재준</td>
<td>
									
										
										
											(1)1.3
										
									
								</td>
<td>
									
										
										
											(1)1.0<br/>(7)16.8<br/>
</td>
<td>
									
										
										
											(1-7)20.8
										
									
								</td>
<td>
									
										
										
											(1-7)19.6
										
									
								</td>
<td>
									
										
										
											(1-5-7)28.2
										
									
								</td>
<td>
							    	
										
										
											-
										
									
							    </td>
<td>-</td>
<td>
<a href="/contents/information/raceResultDetailPage.do?stndYear=2017&amp;raceDt=0106&amp;tms_dayOrd=1-1&amp;tms=1&amp;tmsNoNum=1&amp;dayOrd=1&amp;meetCd=002&a

In [12]:
all_result

Unnamed: 0,line,rank,BASE_YEAR,TME_VALUE,DAY_ORD_VALUE,RACE_NO
0,1,,2017,1,1,광명01
1,2,3,2017,1,1,광명01
2,3,2,2017,1,1,광명01
3,4,,2017,1,1,광명01
4,5,,2017,1,1,광명01
...,...,...,...,...,...,...
93,3,,2017,1,1,광명14
94,4,3,2017,1,1,광명14
95,5,,2017,1,1,광명14
96,6,2,2017,1,1,광명14
