<a href="https://colab.research.google.com/github/ByungjunKim/DDMKL/blob/main/240411_RissScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RISS에서 학위 논문 데이터 수집하기

In [None]:
import requests
import math
import pickle
import time
import sys
import pandas as pd
from tqdm.notebook import tqdm
from random import uniform
import lxml
import lxml.etree as et
from bs4 import BeautifulSoup
import glob
import urllib3
urllib3.disable_warnings()
from natsort import natsorted
import re

In [None]:
# Browser-like request headers sent with every search request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "Accept-Language": "en-US,en;q=0.9",
    # Only advertise encodings that requests always decodes. "br" is decoded
    # only when a Brotli package is installed; advertising it without one
    # leaves response.text as undecoded compressed bytes.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "DNT": "1",  # Do Not Track request header (opt out of user tracking)
    # Add any further headers needed here
}

In [None]:
def request_until_success(url, headers, timeout=5, delay=2, max_retries=None):
    """
    Request a URL repeatedly until the server answers with a successful status.

    Every failed attempt (timeout, connection problem, or an HTTP error status
    turned into an exception by ``raise_for_status``) is reported with
    ``print`` and followed by a pause of ``delay`` seconds before retrying.

    Args:
    - url (str): The URL to request.
    - headers (dict): HTTP headers sent with every attempt.
    - timeout (int): The timeout for each request in seconds.
    - delay (int): The delay between retries in seconds.
    - max_retries (int | None): Number of retries allowed after the first
      attempt. ``None`` (default) retries forever, which never returns for a
      permanently failing URL; pass a number to give up instead.

    Returns:
    - response: The successful response from the server.

    Raises:
    - requests.RequestException: The error of the final attempt, only when
      ``max_retries`` is set and has been used up.
    """
    attempt = 1
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # Raises HTTPError for bad responses
            return response
        except requests.RequestException as e:
            # requests.Timeout is a RequestException subclass; only the
            # wording of the message differs between the two cases.
            if isinstance(e, requests.Timeout):
                print(f"Request timed out: {e}")
            else:
                print(f"Request failed: {e}")
            # Give up (re-raising the current error) once the retry budget is spent.
            if max_retries is not None and attempt > max_retries:
                raise

        time.sleep(delay)
        attempt += 1

### 검색 조건에 맞는 URL 설정
학과정보 : 국어국문 OR 국문  
학위수여연도 : 2000~2023  
학위유형 : 국내박사  
https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor%2C%EA%B5%AD%EC%96%B4%EA%B5%AD%EB%AC%B8%40op%2COR%40znMajor%2C%EA%B5%AD%EB%AC%B8&strQuery=&exQuery=&exQueryText=&order=%2FDESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&iStartCount=0&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query=

In [None]:
# Search URL for the first result page (iStartCount=0, pageScale=100),
# restricted to p_year1=2000 .. p_year2=2023 and mat_subtype=T2.
url = 'https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor%2C%EA%B5%AD%EC%96%B4%EA%B5%AD%EB%AC%B8%40op%2COR%40znMajor%2C%EA%B5%AD%EB%AC%B8&strQuery=&exQuery=&exQueryText=&order=%2FDESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&iStartCount=0&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query='

In [None]:
# Fetch the search page; despite its name, `req` holds the response body text.
req = request_until_success(url,headers).text

In [None]:
# Parse the fetched page text with the lxml parser.
soup = BeautifulSoup(req, 'lxml')

In [None]:
# Every <div class="cont ml60"> in the parsed page is turned into one record.
cont_ml60_classes = soup.find_all('div', class_='cont ml60')

data = []
for cont_ml60 in cont_ml60_classes:
    # Look each sub-element up once instead of once per condition and value.
    title_tag = cont_ml60.find('p', class_='title')
    link_tag = title_tag.find('a') if title_tag is not None else None
    etc_tag = cont_ml60.find('p', class_='etc')
    abstract_tag = cont_ml60.find('p', class_='preAbstract')

    # The first four <span>s of <p class="etc"> map positionally onto
    # writer / assigned / year / grad; missing ones become None.
    spans = etc_tag.find_all('span') if etc_tag is not None else []
    span_texts = [span.get_text(strip=True) for span in spans[:4]]
    span_texts += [None] * (4 - len(span_texts))
    writer, assigned, year, grad = span_texts

    data.append({
        'title': title_tag.get_text(strip=True) if title_tag is not None else None,
        'link': link_tag.get('href') if link_tag is not None else None,
        'writer': writer,
        'assigned': assigned,
        'year': year,
        'grad': grad,
        'preAbstract': abstract_tag.get_text(strip=True) if abstract_tag is not None else None,
    })

In [None]:
# Show the list of parsed records as the cell output.
data

### 기본 정보 자동으로 수집하기

In [None]:
# `soup` must already hold a parsed search page.
# The count is read from the first <span class="num"> on that page.
num_span = soup.find('span', class_='num')

# Drop thousands separators before converting; use 0 when the span is missing.
total_count = 0 if num_span is None else int(num_span.get_text().replace(',', ''))

total_count

In [None]:
# The base URL without the 'iStartCount' parameter (appended per page below)
base_url = "https://www.riss.kr/search/Search.do?isDetailSearch=Y&searchGubun=true&viewYn=OP&queryText=znMajor,국어국문@op,OR@znMajor,국문&strQuery=&exQuery=&exQueryText=&order=/DESC&onHanja=false&strSort=RANK&p_year1=2000&p_year2=2023&orderBy=&mat_type=&mat_subtype=T2&fulltext_kind=&t_gubun=&learning_type=&ccl_code=&inside_outside=&fric_yn=&db_type=&image_yn=&gubun=&kdc=&ttsUseYn=&l_sub_code=&fsearchMethod=search&sflag=1&isFDetailSearch=N&pageNumber=1&resultKeyword=&fsearchSort=&fsearchOrder=&limiterList=&limiterListText=&facetList=&facetListText=&fsearchDB=&icate=bib_t&colName=bib_t&pageScale=100&isTab=Y&regnm=&dorg_storage=&language=&language_code=&clickKeyword=&relationKeyword=&query="

# Results requested per page; must stay equal to 'pageScale' in base_url
PAGE_SIZE = 100


def _parse_search_item(item):
    """Turn one <div class="cont ml60"> element into a record dict.

    Each sub-element is looked up once. A field whose element is missing is
    set to None. The first four <span>s of <p class="etc"> map positionally
    onto 'writer', 'assigned', 'year' and 'grad'.
    """
    title_tag = item.find('p', class_='title')
    link_tag = title_tag.find('a') if title_tag is not None else None
    etc_tag = item.find('p', class_='etc')
    abstract_tag = item.find('p', class_='preAbstract')

    spans = etc_tag.find_all('span') if etc_tag is not None else []
    span_texts = [span.get_text(strip=True) for span in spans[:4]]
    span_texts += [None] * (4 - len(span_texts))
    writer, assigned, year, grad = span_texts

    return {
        'title': title_tag.get_text(strip=True) if title_tag is not None else None,
        'link': link_tag.get('href') if link_tag is not None else None,
        'writer': writer,
        'assigned': assigned,
        'year': year,
        'grad': grad,
        'preAbstract': abstract_tag.get_text(strip=True) if abstract_tag is not None else None,
    }


# Number of pages needed to cover total_count results
iterations = math.ceil(total_count / PAGE_SIZE)

# Records from all pages are accumulated here
data = []

for i in tqdm(range(iterations)):
    # Offset of the first result on this page
    url = f"{base_url}&iStartCount={i * PAGE_SIZE}"

    # Fetch the page through the retrying helper
    response = request_until_success(url, headers)

    # Create a BeautifulSoup object from the response text
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all 'div' elements with class 'cont ml60'
    cont_ml60_classes = soup.find_all('div', class_='cont ml60')

    # Extract the data and append it to the list
    data.extend(_parse_search_item(cont_ml60) for cont_ml60 in cont_ml60_classes)

In [None]:
# Build a DataFrame from the collected record dicts (one row per record)
# and display it as the cell output.
df = pd.DataFrame(data)
df

### 논문별 상세정보 수집하기 (작업 중)

In [None]:
from urllib.parse import urljoin

# Turn the scraped hrefs into absolute URLs (presumably they are site-relative
# paths). urljoin leaves an already-absolute URL unchanged, so re-running this
# cell does not prepend the host a second time. Non-string entries (missing
# links) are left untouched.
df['link'] = df['link'].map(
    lambda href: urljoin('https://www.riss.kr', href) if isinstance(href, str) else href
)
df['link']

In [None]:
# Show the 'link' value stored under index label 0.
df['link'][0]

In [None]:
# Fetch the page behind the link at index label 0 (`req` is the response body
# text), parse it with lxml and display the parsed document.
req = request_until_success(df['link'][0],headers).text
soup = BeautifulSoup(req, 'lxml')
soup

In [None]:
# Browser-like headers for requesting a single detail page.
# NOTE: this rebinds the notebook-wide `headers` name, so every request made
# after running this cell uses these values.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    # Only advertise encodings that requests always decodes. "br" and "zstd"
    # are decoded only when optional packages are installed; advertising them
    # otherwise leaves response.text as undecoded compressed bytes.
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    "Cookie": "Your Cookie Here",  # placeholder: replace with a real cookie value
    "Host": "www.riss.kr",
    "Referer": df['link'][0],
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "sec-ch-ua": '"Google Chrome";v="123", "Not:A-Brand";v="8", "Chromium";v="123"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "Windows"
}


# requests has no default timeout; without one an unresponsive server would
# block this cell forever. 5 s matches request_until_success's default.
response = requests.get(df['link'][0], headers=headers, timeout=5)

In [None]:
# Parse the response body with lxml and display it as the cell output.
BeautifulSoup(response.text, 'lxml')