<a href="https://colab.research.google.com/github/ByungjunKim/DDMKL/blob/main/01RissScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RISS에서 학위 논문 데이터 수집하기

In [None]:
import requests
import math
import pickle
import time
import sys
import pandas as pd
from tqdm.notebook import tqdm
from random import uniform
import lxml
import lxml.etree as et
from bs4 import BeautifulSoup
import glob
import urllib3
urllib3.disable_warnings()
from natsort import natsorted
import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

In [None]:
# Browser-like request headers passed to request_until_success for the search-result pages.
# NOTE(review): "Accept-Encoding" advertises 'br'; presumably a Brotli decoder must be
# installed for the HTTP client to decode such responses - confirm in the target environment.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "DNT": "1", # Do Not Track request header (opt out of user tracking)
    # Add any further headers you need here
}

In [None]:
def request_until_success(url, headers, timeout=7, delay=3, max_retries=5, backoff_factor=2):
    """
    Send a GET request to ``url``, retrying with exponential backoff until it
    succeeds or the maximum number of attempts is reached.

    Args:
    - url (str): The URL to request.
    - headers (dict): The headers to include in the request.
    - timeout (int): The timeout for each request in seconds.
    - delay (int): The delay before the first retry in seconds.
    - max_retries (int): The maximum number of attempts; must be at least 1.
    - backoff_factor (int): The factor by which the delay is multiplied after each failed attempt.

    Returns:
    - response: The first response that passes ``raise_for_status()``.

    Raises:
    - ValueError: If ``max_retries`` is less than 1 (no request could ever be made).
    - requests.RequestException: If every attempt fails; the error from the last
      attempt is chained as ``__cause__``.
    """
    # Without this guard the loop body would never run and the function would
    # silently return None, which only fails later at the call site.
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises HTTPError for bad responses
            return response
        except requests.Timeout as e:
            print(f"Request timed out (Attempt {attempt}): {e}")
            last_error = e  # `e` is unbound once the except block ends
        except requests.RequestException as e:
            print(f"Request failed (Attempt {attempt}): {e}")
            last_error = e

        if attempt == max_retries:
            # Chain the last failure so the real cause stays visible in the traceback.
            raise requests.RequestException(
                f"Max retries reached: Failed to get a successful response from {url}"
            ) from last_error

        # Exponential backoff: delay, delay * factor, delay * factor**2, ...
        sleep_time = delay * (backoff_factor ** (attempt - 1))
        print(f"Retrying in {sleep_time} seconds...")
        time.sleep(sleep_time)

### 검색 조건에 맞는 URL 설정
학과정보 : 국어국문 OR 국문  
학위수여연도 : 2000~2023  
학위유형 : 국내박사  
페이지당 출력 : 100개씩  
https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor%2C%EA%B5%AD%EC%96%B4%EA%B5%AD%EB%AC%B8%40op%2COR%40znMajor%2C%EA%B5%AD%EB%AC%B8&strQuery=&exQuery=&exQueryText=&order=%2FDESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&iStartCount=0&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query=

In [None]:
# Paste the search URL that encodes your search conditions below
url = 'https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor%2C%EA%B5%AD%EC%96%B4%EA%B5%AD%EB%AC%B8%40op%2COR%40znMajor%2C%EA%B5%AD%EB%AC%B8&strQuery=&exQuery=&exQueryText=&order=%2FDESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&iStartCount=0&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query='

In [None]:
# Fetch the first results page (with retries) and keep only the response body text.
req = request_until_success(url,headers).text

In [None]:
# Parse the fetched HTML using the lxml parser.
soup = BeautifulSoup(req, 'lxml')

In [None]:
def _parse_result(cont_ml60):
    """Convert one search-result <div class="cont ml60"> into a record dict.

    Any part of the result that is absent from the markup is stored as None.
    """
    title_p = cont_ml60.find('p', class_='title')
    anchor = title_p.find('a') if title_p else None
    etc_p = cont_ml60.find('p', class_='etc')
    spans = etc_p.find_all('span') if etc_p else []
    abstract_p = cont_ml60.find('p', class_='preAbstract')

    record = {
        'title': title_p.get_text(strip=True) if title_p else None,
        'link': anchor.get('href') if anchor else None,
    }
    # The <p class="etc"> spans are positional: writer, assigned, year, grad.
    for position, field in enumerate(('writer', 'assigned', 'year', 'grad')):
        record[field] = spans[position].get_text(strip=True) if position < len(spans) else None
    record['preAbstract'] = abstract_p.get_text(strip=True) if abstract_p else None
    return record


# Every search hit on the page lives in its own <div class="cont ml60">.
cont_ml60_classes = soup.find_all('div', class_='cont ml60')

data = [_parse_result(cont_ml60) for cont_ml60 in cont_ml60_classes]

In [None]:
# Display the records parsed from the first results page for a quick sanity check.
data

### 기본 정보 자동으로 수집하기

In [None]:
# Read the total number of matching theses (total_count) from the parsed results page.
# Relies on 'soup' being the BeautifulSoup object built from that page.
num_span = soup.find('span', class_='num')

# Thousands separators are stripped before conversion; fall back to 0 when the
# counter element is not present on the page.
total_count = 0 if num_span is None else int(num_span.get_text().replace(',', ''))

total_count

In [None]:
# Build the collection URL: the search URL with only its 'iStartCount'
# parameter removed, so each page request can append its own offset.
parsed_url = urlparse(url)

# keep_blank_values=True is required here: by default parse_qs discards every
# parameter whose value is empty, which would silently strip the many blank
# search fields (strQuery=, exQuery=, ...) from the rebuilt URL as well.
params = parse_qs(parsed_url.query, keep_blank_values=True)

# Drop the paging offset; it is re-added per request.
params.pop('iStartCount', None)

# parse_qs maps each key to a list of values, hence doseq=True when re-encoding.
new_query = urlencode(params, doseq=True)

# Swap the rebuilt query string into the parsed URL and serialise it again.
new_url = parsed_url._replace(query=new_query)
base_url = urlunparse(new_url)

print(base_url)

##### 실제 수집 코드

In [None]:
# The base URL without the 'iStartCount' parameter
# base_url = "https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor,국어국문@op,OR@znMajor,국문&strQuery=&exQuery=&exQueryText=&order=/DESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query="

# Number of results per search page; must match the 'pageScale' value in the search URL.
PAGE_SIZE = 100

# Number of result pages needed to cover every matching record.
iterations = math.ceil(total_count / PAGE_SIZE)


# Page-collection worker used by the thread pool below
def fetch_data(i, page_size=PAGE_SIZE):
    """
    Fetch one page of search results and parse it into a list of records.

    Args:
    - i (int): Zero-based page index; the request offset is ``i * page_size``.
    - page_size (int): Number of results per page.

    Returns:
    - list[dict]: One dict per result with the keys 'title', 'link', 'writer',
      'assigned', 'year', 'grad' and 'preAbstract'. A value is None when the
      corresponding element is missing from the page.

    Raises:
    - requests.RequestException: Propagated from request_until_success when the
      page cannot be fetched.
    """
    page_url = f"{base_url}&iStartCount={i * page_size}"
    response = request_until_success(page_url, headers)
    # NOTE(review): other cells in this notebook parse with 'lxml'; this one uses
    # 'html.parser' - confirm whether the difference is intentional.
    soup = BeautifulSoup(response.text, 'html.parser')

    records = []
    for cont_ml60 in soup.find_all('div', class_='cont ml60'):
        # Look each element up once and reuse it for both the check and the value.
        title = cont_ml60.find('p', class_='title')
        anchor = title.find('a') if title else None
        etc = cont_ml60.find('p', class_='etc')
        spans = etc.find_all('span') if etc else []
        pre_abstract = cont_ml60.find('p', class_='preAbstract')

        records.append({
            'title': title.get_text(strip=True) if title else None,
            'link': anchor.get('href') if anchor else None,
            # The spans are positional: writer, assigned, year, grad.
            'writer': spans[0].get_text(strip=True) if len(spans) > 0 else None,
            'assigned': spans[1].get_text(strip=True) if len(spans) > 1 else None,
            'year': spans[2].get_text(strip=True) if len(spans) > 2 else None,
            'grad': spans[3].get_text(strip=True) if len(spans) > 3 else None,
            'preAbstract': pre_abstract.get_text(strip=True) if pre_abstract else None,
        })
    return records

# Fetch all pages concurrently; executor.map yields results in page order.
with ThreadPoolExecutor(max_workers=10) as executor:
    data_list = list(tqdm(executor.map(fetch_data, range(iterations)), total=iterations))

# Flatten the per-page lists into a single list of records
data = [item for sublist in data_list for item in sublist]

In [None]:
# Build a DataFrame with one row per collected record and display it.
df = pd.DataFrame(data)
df

In [None]:
# Save the basic records as CSV
# NOTE(review): no index argument is given here, whereas riss_detail.csv is written
# with index=False - confirm the extra index column in this file is intended.
df.to_csv('./riss_basic.csv')

### 논문별 상세정보 수집하기

In [None]:
# Turn the site-relative detail links into absolute URLs.
# Only links that do not already start with the origin are prefixed, so running
# this cell more than once does not prepend the origin twice. Missing links
# (na=True treats them as "already done") are left untouched.
RISS_ORIGIN = 'https://www.riss.kr'
needs_origin = ~df['link'].str.startswith(RISS_ORIGIN, na=True)
df.loc[needs_origin, 'link'] = RISS_ORIGIN + df.loc[needs_origin, 'link']
df['link']

In [None]:
# Browser-like request headers used for the thesis detail pages.
# NOTE(review): "Accept-Encoding" advertises 'br' and 'zstd'; presumably matching
# decoders must be installed for the HTTP client to decode such responses - confirm.
headers_for_riss = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    # Placeholder value - presumably to be replaced with a real session cookie; verify before running.
    "Cookie": "Your Cookie Here",
    "Host": "www.riss.kr",
    # Placeholder; fetch_data supplies the detail-page URL as the Referer for each request.
    "Referer": None,
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows"
}

In [None]:
def fetch_data(link):
    """
    Fetch one thesis detail page and flatten its metadata into a dict.

    Args:
    - link (str): Absolute URL of the detail page; it is also sent as the Referer header.

    Returns:
    - dict: 'link' and 'title' (None when the <h3 class="title"> element is missing),
      plus one entry per labelled row of the 'infoDetailL' list and one entry per
      titled section of the 'content additionalInfo' block. The section titled
      "참고문헌 (Reference)" is stored as a list of paragraph strings; every other
      section is stored as a single string.

    Raises:
    - requests.RequestException: Propagated from request_until_success when the
      page cannot be fetched.
    """
    # Build the headers per call instead of writing into the shared module-level
    # dict: this function runs concurrently in a thread pool, and mutating the
    # shared dict lets one thread's request go out with another thread's Referer.
    request_headers = {**headers_for_riss, "Referer": link}
    response = request_until_success(link, request_headers)
    soup = BeautifulSoup(response.text, 'lxml')

    data = {'link': link}

    title_tag = soup.find('h3', class_='title')
    data['title'] = title_tag.get_text(strip=True) if title_tag else None

    # Labelled rows: <li><span class="strong">label</span> ... <div>value</div></li>
    info_detail_div = soup.find('div', class_='infoDetailL')
    if info_detail_div:
        for li in info_detail_div.find_all('li'):
            key_element = li.find('span', {'class': 'strong'})
            value_element = li.find('div')
            if key_element and value_element:
                data[key_element.text.strip()] = value_element.text.strip()

    # Additional sections are best-effort: a page without them is still a valid record.
    try:
        additional_info_div = soup.find('div', class_='content additionalInfo')
        if additional_info_div:
            title_text_dict = {}
            for div in additional_info_div.find_all('div', class_='text off'):
                # Each body <div> is labelled by the <p class="title"> sibling before it.
                section_title = div.find_previous_sibling('p', class_='title')
                if not section_title:
                    continue
                section_name = section_title.text.strip()
                if section_name == "참고문헌 (Reference)":
                    # References are kept as a list, one string per <p>.
                    title_text_dict[section_name] = [p.text.strip() for p in div.find_all('p')]
                else:
                    title_text_dict[section_name] = div.text.strip()
            data.update(title_text_dict)
    except AttributeError:
        print(f'참고문헌 등 추가 정보 없음 : {link}')

    return data

In [None]:
# Run the detail collection over a thread pool
links = df['link'].tolist()[:100]  # trial run on 100 links only; remove [:100] to fetch them all
with ThreadPoolExecutor(max_workers=10) as executor:
    data_list = list(tqdm(executor.map(fetch_data, links), total=len(links)))

In [None]:
# Build a DataFrame with one row per fetched detail page and display it.
detail = pd.DataFrame(data_list)
detail

In [None]:
# Save the detail records as CSV (without the DataFrame index column)
detail.to_csv('riss_detail.csv', index=False)