In [12]:
import requests
from tqdm import tqdm
import pandas as pd
import json

## solved.ac 크롤링
### id 목록을 이용해 문제 목록 크롤링
- 문제번호는 대강 1000번에서 26000번까지
- 한번에 100개씩 가능
### 흐름
- 1000부터 26000까지 100씩 나눠 쿼리에 넘겨줄 id_list string 생성
- 문제리스트 크롤링
- json 형태로 변환 및 병합
- 저장

In [247]:
# Build the query strings for the crawl: one comma-separated string per batch
# of 100 consecutive problem IDs, covering IDs 1000 through 25999.
id_list = [
    ','.join(str(problem_id) for problem_id in range(start, start + 100))
    for start in range(1000, 26000, 100)
]

In [279]:
# Crawl the problem list: one lookup request per batch of problem IDs.
# Each element of problem_list is the raw response body (JSON text) of one batch.
problem_list = []
url = "https://solved.ac/api/v3/problem/lookup"
headers = {"Content-Type": "application/json"}

# A single session reuses the underlying connection across all batches.
with requests.Session() as session:
    for ids in id_list:
        querystring = {"problemIds": ids}
        # The timeout keeps the cell from hanging forever on a stalled connection.
        response = session.get(url, headers=headers, params=querystring, timeout=30)
        # Fail loudly on HTTP errors so that an error body is never stored and
        # later parsed and merged as if it were problem data.
        response.raise_for_status()
        problem_list.append(response.text)

In [287]:
# Parse every raw response body and flatten the batches into one list.
problem_json = []

for raw_batch in tqdm(problem_list):
    problem_json.extend(json.loads(raw_batch))

100%|███████████████████████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 355.38it/s]


In [292]:
# Save the merged problem list as JSON.
# The encoding is stated explicitly so the write does not depend on the
# platform default and matches the encoding used when the file is read back.
with open("boj_problems.json", "w", encoding="utf-8") as f:
    json.dump(problem_json, f)

In [293]:
# Read the saved problem list back from disk.
with open('boj_problems.json', mode='r', encoding='utf-8') as json_file:
    json_data = json.loads(json_file.read())

## df 변환
- json data를 df 형태로 변환

In [295]:
# Turn the list of problem records into a DataFrame (one row per record).
df = pd.json_normalize(json_data)

In [296]:
# Inspect the column names of the problem table.
df.columns

Index(['problemId', 'titleKo', 'titles', 'isSolvable', 'isPartial',
       'acceptedUserCount', 'level', 'votedUserCount', 'sprout',
       'givesNoRating', 'isLevelLocked', 'averageTries', 'official', 'tags'],
      dtype='object')

In [316]:
# Preview the first rows of the problem table.
df.head()

Unnamed: 0,problemId,titleKo,titles,isSolvable,isPartial,acceptedUserCount,level,votedUserCount,sprout,givesNoRating,isLevelLocked,averageTries,official,tags
0,1000,A+B,"[{'language': 'ko', 'languageDisplayName': 'ko...",True,False,174601,1,17,False,False,True,2.3654,True,"[{'key': 'arithmetic', 'isMeta': False, 'bojTa..."
1,1001,A-B,"[{'language': 'ko', 'languageDisplayName': 'ko...",True,False,146249,1,8,False,False,True,1.3962,True,"[{'key': 'arithmetic', 'isMeta': False, 'bojTa..."
2,1002,터렛,"[{'language': 'ko', 'languageDisplayName': 'ko...",True,False,25647,7,112,False,False,False,4.6,True,"[{'key': 'geometry', 'isMeta': False, 'bojTagI..."
3,1003,피보나치 함수,"[{'language': 'ko', 'languageDisplayName': 'ko...",True,False,34563,8,103,False,False,False,3.1952,True,"[{'key': 'dp', 'isMeta': False, 'bojTagId': 25..."
4,1004,어린 왕자,"[{'language': 'ko', 'languageDisplayName': 'ko...",True,False,8419,8,57,False,False,False,2.3461,True,"[{'key': 'geometry', 'isMeta': False, 'bojTagI..."


In [344]:
# Export the problem table to CSV, encoded as UTF-8 with a byte-order mark.
# NOTE(review): the 'titles' and 'tags' columns hold lists of dicts, which are
# presumably written as their string representation — confirm that is acceptable.
df.to_csv('boj_problems.csv', encoding='utf-8-sig')

In [346]:
# pd.read_csv('boj_problems.csv', index_col=0)

In [300]:
# Look at the nested tag records attached to the first problem (row label 0).
df.loc[0, 'tags']

[{'key': 'arithmetic',
  'isMeta': False,
  'bojTagId': 121,
  'problemCount': 585,
  'displayNames': [{'language': 'en',
    'name': 'arithmetic',
    'short': 'arithmetic'},
   {'language': 'ko', 'name': '사칙연산', 'short': '사칙연산'}]},
 {'key': 'implementation',
  'isMeta': False,
  'bojTagId': 102,
  'problemCount': 3456,
  'displayNames': [{'language': 'en',
    'name': 'implementation',
    'short': 'impl'},
   {'language': 'ko', 'name': '구현', 'short': '구현'}]},
 {'key': 'math',
  'isMeta': False,
  'bojTagId': 124,
  'problemCount': 3849,
  'displayNames': [{'language': 'en', 'name': 'mathematics', 'short': 'math'},
   {'language': 'ko', 'name': '수학', 'short': '수학'}]}]

- tag의 depth가 너무 깊기 때문에 tag 데이터를 별도로 분리하기로 함
- df행 돌면서 tag 데이터만 추출 후 병합
- 따로 df형태로 변환

In [324]:
# Flatten the nested tag lists into one record per (problem, tag) pair.
tag_list = []

# Zipping the two needed columns avoids building a Series for every row, and
# total= lets tqdm draw a real progress bar instead of a bare counter.
for problem_id, tags in tqdm(zip(df['problemId'], df['tags']), total=len(df)):
    for t in tags:
        # Build a new dict rather than adding the key to `t` itself: `t` is the
        # very object stored in df['tags'], so writing to it would silently
        # alter the problem table as well.
        tag_list.append({**t, 'problemId': problem_id})

22979it [00:06, 3528.12it/s]


In [325]:
# Inspect the first flattened tag record.
tag_list[0]

{'key': 'arithmetic',
 'isMeta': False,
 'bojTagId': 121,
 'problemCount': 585,
 'displayNames': [{'language': 'en',
   'name': 'arithmetic',
   'short': 'arithmetic'},
  {'language': 'ko', 'name': '사칙연산', 'short': '사칙연산'}],
 'problemId': 1000}

In [327]:
# Give every tag record a flat 'tag_name' / 'tag_short' pair taken from its
# 'displayNames' list: the Korean ('ko') entry when present, otherwise the
# first available entry, otherwise None.
#
# The selection is done with an explicit search instead of a for/else: an
# `else` attached to a `for` loop runs whenever the loop ends without `break`,
# so it would overwrite the 'ko' choice with the last entry on every tag.
for tag in tag_list:
    display_names = tag.get('displayNames') or []
    fallback = display_names[0] if display_names else None
    chosen = next(
        (entry for entry in display_names if entry['language'] == 'ko'),
        fallback,
    )
    tag['tag_name'] = chosen['name'] if chosen is not None else None
    tag['tag_short'] = chosen['short'] if chosen is not None else None

In [332]:
# Turn the flattened tag records into a DataFrame (one row per record).
df_tags = pd.json_normalize(tag_list)

In [336]:
# Remove the nested 'displayNames' column from the tag table.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without raising KeyError once the column is gone.
df_tags = df_tags.drop(columns='displayNames', errors='ignore')

In [364]:
# Preview the first rows of the tag table.
df_tags.head()

Unnamed: 0,key,isMeta,bojTagId,problemCount,problemId,tag_name,tag_short
0,arithmetic,False,121,585,1000,사칙연산,사칙연산
1,implementation,False,102,3456,1000,구현,구현
2,math,False,124,3849,1000,수학,수학
3,arithmetic,False,121,585,1001,사칙연산,사칙연산
4,implementation,False,102,3456,1001,구현,구현


In [347]:
# Export the per-problem tag table to CSV, encoded as UTF-8 with a byte-order mark.
df_tags.to_csv('boj_tags.csv', encoding='utf-8-sig')

In [361]:
# Reload the tag table from the CSV, using its first column as the index.
df_tags = pd.read_csv('boj_tags.csv', index_col=0)

In [372]:
# Tag catalogue: the tag table without the per-problem ID column.
# drop() returns a new frame, so df_tags itself is left untouched.
boj_taglist = df_tags.drop(columns='problemId')

In [375]:
# Keep one row per distinct tag record and renumber the index from 0.
boj_taglist = boj_taglist.drop_duplicates(ignore_index=True)

In [377]:
# Display the de-duplicated tag catalogue.
boj_taglist

Unnamed: 0,key,isMeta,bojTagId,problemCount,tag_name,tag_short
0,arithmetic,False,121,585,사칙연산,사칙연산
1,implementation,False,102,3456,구현,구현
2,math,False,124,3849,수학,수학
3,geometry,False,100,843,기하학,기하학
4,dp,False,25,2415,다이나믹 프로그래밍,다이나믹 프로그래밍
...,...,...,...,...,...,...
180,utf8,False,199,2,utf-8 입력 처리,utf-8
181,rb_tree,False,94,1,레드-블랙 트리,레드-블랙 트리
182,discrete_sqrt,False,147,4,이산 제곱근,이산 제곱근
183,top_tree,False,105,2,탑 트리,탑 트리


In [379]:
# Export the tag catalogue to CSV, encoded as UTF-8 with a byte-order mark.
boj_taglist.to_csv('boj_taglist.csv', encoding='utf-8-sig')

In [371]:
# View the tag catalogue keyed by bojTagId (the column moves into the index).
deduplicated_tags = boj_taglist.drop_duplicates()
deduplicated_tags.set_index('bojTagId')

Unnamed: 0_level_0,key,isMeta,problemCount,tag_name,tag_short
bojTagId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
121,arithmetic,False,585,사칙연산,사칙연산
102,implementation,False,3456,구현,구현
124,math,False,3849,수학,수학
100,geometry,False,843,기하학,기하학
25,dp,False,2415,다이나믹 프로그래밍,다이나믹 프로그래밍
...,...,...,...,...,...
199,utf8,False,2,utf-8 입력 처리,utf-8
94,rb_tree,False,1,레드-블랙 트리,레드-블랙 트리
147,discrete_sqrt,False,4,이산 제곱근,이산 제곱근
105,top_tree,False,2,탑 트리,탑 트리
