# Crawl JAIS Papers by URL
1. Extract the paper URLs from the official JAIS website.
2. Store each paper's title, abstract, and recommended citation in a custom `Paper` dataclass.

> Libraries used: pandas, BeautifulSoup4, requests, tqdm

In [5]:
# DATA HANDLING
import pandas as pd
from dataclasses import dataclass

# CRAWLING
import time
from bs4 import BeautifulSoup
from urllib.parse import quote
import requests
from tqdm import tqdm

In [3]:
@dataclass(repr=False)
class Paper:
    """Metadata record for one crawled paper.

    Holds the text scraped from a paper page (title, abstract, citation), the
    page URL, and the year / volume / issue values supplied by the crawler.
    The custom ``repr`` omits the abstract and citation.
    """

    title: str
    abstract: str
    citation: str
    url: str
    year: int
    volume: int
    # NOTE(review): annotated as int, but the preprint cell passes the string
    # "Preprint" here; dataclasses do not enforce annotations at runtime.
    issue: int

    def __repr__(self):
        # Only the short identifying fields are shown.
        shown = ("title", "url", "year", "volume", "issue")
        body = ", ".join(f"{name}={getattr(self, name)}" for name in shown)
        return f"Paper({body})"

In [45]:
def abstract_crawler_by_url(issue=22, volume=1, num=1) -> Paper:
    """Fetch one JAIS article page and return its metadata as a ``Paper``.

    Args:
        issue: Issue number, used in the ``iss{issue}`` URL segment.
        volume: Volume number, used in the ``vol{volume}`` URL segment.
        num: Article number within the issue (last URL segment).

    Returns:
        A ``Paper`` for the requested page. A text field whose element is
        missing from the page is set to ``"Unknown"``. If the request fails
        (connection error, timeout, or HTTP error status) the error is printed
        and all three text fields are ``"Unknown"``.
    """
    BASE_QUERY = "https://aisel.aisnet.org/jais/"
    # BASE_QUERY already ends with "/"; adding another separator here would
    # produce a ".../jais//vol..." URL.
    url = f"{BASE_QUERY}vol{volume}/iss{issue}/{num}/"

    # Year is derived from the volume number: volume 1 maps to 2000.
    year = volume - 1 + 2000

    # Paper field name -> id of the HTML element holding its text.
    element_ids = {
        "title": "title",
        "abstract": "abstract",
        "citation": "recommended_citation",
    }
    fields = {name: "Unknown" for name in element_ids}

    try:
        response = requests.get(url, timeout=5)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        for name, element_id in element_ids.items():
            node = soup.find(id=element_id)
            # find() returns None when the element is absent; keep "Unknown".
            if node is not None:
                fields[name] = node.text.strip()
    except requests.RequestException as e:
        print(f"Error crawling {url}: {e}")

    return Paper(url=url, year=year, volume=volume, issue=issue, **fields)


In [None]:
# Volumes to crawl and, at the same position, the issue numbers of each volume.
vols = [23, 24, 25, 26]
issues = [[1, 2, 3, 4, 5, 6],
          [1, 2, 3, 4, 5, 6],
          [1, 2, 3, 4, 5, 6],
          [1, 2]]
BASE_QUERY = "https://aisel.aisnet.org/jais/"
papers = []

assert len(vols) == len(issues), "each volume needs its own list of issues"

for vol, vol_issues in zip(vols, issues):
    for issue in vol_issues:
        print(f"🔍 Crawling vol {vol}, issue {issue}...")
        url = f"{BASE_QUERY}vol{vol}/iss{issue}/"

        # Fetch the issue index page; a failure skips this issue rather than
        # aborting the whole crawl or parsing an error page.
        try:
            result = requests.get(url, timeout=5)
            result.raise_for_status()
        except requests.RequestException as e:
            print(f"Error crawling {url}: {e}")
            continue

        soup = BeautifulSoup(result.text, 'html.parser')
        # The number of matched entries decides how many article pages
        # (numbered 1..N) are requested for this issue.
        docs = soup.select('h2#article ~ div.doc')
        if not docs:
            # Make an empty issue visible instead of silently collecting nothing.
            print(f"⚠️ No articles found for vol {vol}, issue {issue} ({url})")
            continue

        for num in tqdm(range(1, len(docs) + 1), desc=f"vol {vol} iss {issue}", leave=True):
            papers.append(abstract_crawler_by_url(issue, vol, num))


🔍 Crawling vol 23, issue 1...


vol 23 iss 1: 100%|██████████| 10/10 [00:07<00:00,  1.29it/s]


🔍 Crawling vol 23, issue 2...


vol 23 iss 2: 100%|██████████| 5/5 [00:03<00:00,  1.31it/s]

🔍 Crawling vol 23, issue 3...



vol 23 iss 3: 100%|██████████| 6/6 [00:04<00:00,  1.30it/s]


🔍 Crawling vol 23, issue 4...


vol 23 iss 4: 100%|██████████| 6/6 [00:04<00:00,  1.32it/s]


🔍 Crawling vol 23, issue 5...


vol 23 iss 5: 100%|██████████| 8/8 [00:06<00:00,  1.28it/s]


🔍 Crawling vol 23, issue 6...


vol 23 iss 6: 100%|██████████| 9/9 [00:07<00:00,  1.28it/s]


🔍 Crawling vol 24, issue 1...


vol 24 iss 1: 100%|██████████| 9/9 [00:06<00:00,  1.33it/s]


🔍 Crawling vol 24, issue 2...


vol 24 iss 2: 100%|██████████| 8/8 [00:06<00:00,  1.30it/s]


🔍 Crawling vol 24, issue 3...


vol 24 iss 3: 100%|██████████| 9/9 [00:08<00:00,  1.04it/s]


🔍 Crawling vol 24, issue 4...


vol 24 iss 4: 100%|██████████| 8/8 [00:06<00:00,  1.33it/s]


🔍 Crawling vol 24, issue 5...


vol 24 iss 5: 100%|██████████| 9/9 [00:06<00:00,  1.34it/s]


🔍 Crawling vol 24, issue 6...


vol 24 iss 6: 100%|██████████| 8/8 [00:06<00:00,  1.31it/s]


🔍 Crawling vol 25, issue 1...


vol 25 iss 1: 0it [00:00, ?it/s]


🔍 Crawling vol 25, issue 2...


vol 25 iss 2: 100%|██████████| 9/9 [00:07<00:00,  1.27it/s]


🔍 Crawling vol 25, issue 3...


vol 25 iss 3: 100%|██████████| 9/9 [00:06<00:00,  1.31it/s]


🔍 Crawling vol 25, issue 4...


vol 25 iss 4: 100%|██████████| 9/9 [00:06<00:00,  1.32it/s]


🔍 Crawling vol 25, issue 5...


vol 25 iss 5: 100%|██████████| 9/9 [00:06<00:00,  1.29it/s]


🔍 Crawling vol 25, issue 6...


vol 25 iss 6: 100%|██████████| 9/9 [00:06<00:00,  1.31it/s]


🔍 Crawling vol 26, issue 1...


vol 26 iss 1: 100%|██████████| 9/9 [00:07<00:00,  1.16it/s]


🔍 Crawling vol 26, issue 2...


vol 26 iss 2: 100%|██████████| 9/9 [00:09<00:00,  1.01s/it]


In [56]:
# Convert the crawled papers to a DataFrame and save it as CSV
paper_records = [vars(paper) for paper in papers]
papers_df = pd.DataFrame(paper_records)
papers_df.to_csv('JAIS_papers.csv', encoding='utf-8-sig', index=False)

## 2025년 Preprint분에 대해서 추가로 수행

In [16]:
# Preprints
BASE_QUERY = "https://aisel.aisnet.org/jais_preprints/"
papers = []

# Preprint pages 167-178 are crawled (the range end is exclusive).
for num in tqdm(range(167, 179), desc="Preprints", leave=True):

    url = f"{BASE_QUERY}{num}/"
    # Defaults used when the request fails or an element is missing.
    fields = {"title": "Unknown", "abstract": "Unknown", "citation": "Unknown"}

    try:
        response = requests.get(url, timeout=5)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        response.raise_for_status()
    except requests.RequestException as e:
        # Keep going: one failed page should not abort the remaining preprints.
        print(f"Error crawling {url}: {e}")
    else:
        soup = BeautifulSoup(response.text, 'html.parser')
        # (Paper field, id of the HTML element holding its text)
        for field, element_id in (("title", "title"),
                                  ("abstract", "abstract"),
                                  ("citation", "recommended_citation")):
            node = soup.find(id=element_id)
            # find() returns None when the element is absent; keep "Unknown".
            if node is not None:
                fields[field] = node.text.strip()

    # Every preprint gets the same fixed year / volume / issue values.
    paper = Paper(
        url=BASE_QUERY + f'{num}',
        year=2025,
        volume=0,
        issue="Preprint",
        **fields,
    )
    papers.append(paper)

print(papers)

Preprints: 100%|██████████| 12/12 [00:07<00:00,  1.59it/s]

[Paper(title=Data Control Coordination in the  Formation of Ecosystems in Highly Regulated Sectors, url=https://aisel.aisnet.org/jais_preprints/167, year=2025, volume=0, issue=Preprint), Paper(title=What Is Augmented? A Meta-Narrative Review of AI-Based Augmentation, url=https://aisel.aisnet.org/jais_preprints/168, year=2025, volume=0, issue=Preprint), Paper(title=Computationally Intensive Research:  Advancing a Role for Secondary Analysis of  Qualitative Data, url=https://aisel.aisnet.org/jais_preprints/169, year=2025, volume=0, issue=Preprint), Paper(title=Achieving Reward-Based Crowdfunding Project Success:  An Examination of Value Congruence, url=https://aisel.aisnet.org/jais_preprints/170, year=2025, volume=0, issue=Preprint), Paper(title=Achieving Reward-Based Crowdfunding Project Success:  An Examination of Value Congruence, url=https://aisel.aisnet.org/jais_preprints/171, year=2025, volume=0, issue=Preprint), Paper(title=Capturing the “Social” in Social Networks: The Conceptual




In [17]:
# Save the preprints to their own CSV.
df = pd.DataFrame([paper.__dict__ for paper in papers])
df.to_csv('JAIS_preprints.csv', index=False, encoding='utf-8-sig')

# Merge the preprints into the main CSV. Rows are de-duplicated on `url`
# (keeping the freshly crawled row) so that re-running this cell does not
# append the same preprints to JAIS_papers.csv a second time.
df2 = (
    pd.concat(
        [pd.read_csv('JAIS_papers.csv', encoding='utf-8-sig'), df],
        ignore_index=True,
    )
    .drop_duplicates(subset='url', keep='last', ignore_index=True)
)
df2.to_csv('JAIS_papers.csv', index=False, encoding='utf-8-sig')