<a href="https://colab.research.google.com/github/Arpita3012/CSET340/blob/main/apache_jira_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!mkdir -p apache-jira-scraper/data/raw apache-jira-scraper/data/processed apache-jira-scraper/logs
%cd apache-jira-scraper


/content/apache-jira-scraper


In [3]:
%%writefile requirements.txt
requests
tqdm


Writing requirements.txt


In [4]:
%%writefile utils.py
import json
import datetime

def save_checkpoint(project, value, filename="checkpoint.json"):
    try:
        with open(filename, "r") as f:
            data = json.load(f)
    except FileNotFoundError:
        data = {}
    data[project] = value
    with open(filename, "w") as f:
        json.dump(data, f)

def load_checkpoint(filename="checkpoint.json"):
    try:
        with open(filename, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

def log_message(msg, logfile="logs/scraper.log"):
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(logfile, "a") as f:
        f.write(f"[{timestamp}] {msg}\n")
    print(msg)


Writing utils.py


In [6]:
!python utils.py


In [8]:
%%writefile scraper.py
import requests
import json
import time
from tqdm import tqdm
from utils import save_checkpoint, load_checkpoint, log_message

BASE_URL = "https://issues.apache.org/jira/rest/api/2/search"
PROJECTS = ["ACCUMULO", "ACE", "ARTEMIS"]  # Updated project list âœ…
PAGE_SIZE = 500
MAX_RETRIES = 3

def fetch_issues(project, start_at):
    params = {
        "jql": f"project={project}",
        "maxResults": PAGE_SIZE,
        "startAt": start_at
    }

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(BASE_URL, params=params, timeout=15)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 429:
                log_message("Rate limit hit. Sleeping 30s...")
                time.sleep(30)
            elif 500 <= response.status_code < 600:
                log_message(f"Server error {response.status_code}. Retry {attempt+1}/{MAX_RETRIES}")
                time.sleep(5)
            else:
                log_message(f"Unexpected status {response.status_code}")
                break
        except requests.exceptions.RequestException as e:
            log_message(f"Request failed: {e}")
            time.sleep(5)
    return None


def scrape_project(project):
    checkpoint = load_checkpoint()
    start_at = checkpoint.get(project, 0)
    all_issues = []

    while True:
        data = fetch_issues(project, start_at)
        if not data or "issues" not in data:
            break

        issues = data["issues"]
        if not issues:
            break

        for issue in issues:
            all_issues.append(issue)

        with open(f"data/raw/{project}.json", "a", encoding="utf-8") as f:
            for issue in issues:
                json.dump(issue, f)
                f.write("\n")

        start_at += len(issues)
        save_checkpoint(project, start_at)
        log_message(f"{project}: fetched {len(issues)} issues, total so far {start_at}")

        if len(issues) < PAGE_SIZE:
            break

        time.sleep(1)

    log_message(f"{project}: completed scraping {start_at} issues.")
    return all_issues


if __name__ == "__main__":
    for project in PROJECTS:
        log_message(f"Starting scraping for {project}")
        scrape_project(project)
        log_message(f"Finished scraping {project}\n")


Overwriting scraper.py


In [9]:
!python scraper.py


Starting scraping for ACCUMULO
ACCUMULO: completed scraping 4745 issues.
Finished scraping ACCUMULO

Starting scraping for ACE
ACE: fetched 389 issues, total so far 539
ACE: completed scraping 539 issues.
Finished scraping ACE

Starting scraping for ARTEMIS
ARTEMIS: fetched 500 issues, total so far 500
ARTEMIS: fetched 500 issues, total so far 1000
ARTEMIS: fetched 500 issues, total so far 1500
ARTEMIS: fetched 500 issues, total so far 2000
ARTEMIS: fetched 500 issues, total so far 2500
ARTEMIS: fetched 500 issues, total so far 3000
ARTEMIS: fetched 500 issues, total so far 3500
ARTEMIS: fetched 500 issues, total so far 4000
ARTEMIS: fetched 500 issues, total so far 4500
ARTEMIS: fetched 500 issues, total so far 5000
ARTEMIS: fetched 500 issues, total so far 5500
ARTEMIS: fetched 118 issues, total so far 5618
ARTEMIS: completed scraping 5618 issues.
Finished scraping ARTEMIS



In [10]:
%%writefile transform.py
import json
import os

RAW_DIR = "data/raw"
PROCESSED_FILE = "data/processed/apache_issues.jsonl"

def extract_text(issue):
    fields = issue.get("fields", {})
    return {
        "project": fields.get("project", {}).get("key"),
        "issue_key": issue.get("key"),
        "title": fields.get("summary"),
        "description": fields.get("description"),
        "status": fields.get("status", {}).get("name"),
        "assignee": fields.get("assignee", {}).get("displayName") if fields.get("assignee") else None,
        "reporter": fields.get("reporter", {}).get("displayName") if fields.get("reporter") else None,
        "priority": fields.get("priority", {}).get("name") if fields.get("priority") else None,
        "labels": fields.get("labels"),
        "created": fields.get("created"),
        "updated": fields.get("updated"),
        "comments": "\n".join(
            c.get("body", "") for c in (fields.get("comment", {}).get("comments", []))
        ),
        "task": "summarization"
    }

def transform_to_jsonl():
    with open(PROCESSED_FILE, "w", encoding="utf-8") as out_f:
        for raw_file in os.listdir(RAW_DIR):
            path = os.path.join(RAW_DIR, raw_file)
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    issue = json.loads(line)
                    transformed = extract_text(issue)
                    json.dump(transformed, out_f)
                    out_f.write("\n")
    print("âœ… Transformation complete â†’ data/processed/apache_issues.jsonl")

if __name__ == "__main__":
    transform_to_jsonl()


Writing transform.py


In [11]:
!python transform.py

âœ… Transformation complete â†’ data/processed/apache_issues.jsonl


In [12]:
!head -n 5 data/processed/apache_issues.jsonl


{"project": "ACE", "issue_key": "ACE-635", "title": "TV Channel Duke vs Virginia Tech Live College basketball 2018 Watch Online", "description": "TV Channel Duke vs Virginia Tech Live College basketball 2018 Watch\r\n\r\nCLICK HERE >>>> [http://livetv24.org/college-basketball|http://livetv24.org/college-basketball]\r\n\r\nMarvin Bagley's status uncertain for Duke's game against Virginia Tech. Duke associate head coach Jeff Capel III said Monday that he did not have an update yet on freshman forward Marvin Bagley III's status for Wednesday's game against Virginia Tech. Prolific\r\n\r\nDuke poses challenges for Virginia Tech. Duke vs. Virginia Tech odds: Picks from unbiased computer model on 15-7 run, The Virginia Tech Hokies visit the Duke Blue Devils at Cameron Indoor Stadium on Wednesday night at 7 p.m. ET. The Blue Devils are favored by 11 points, unchanged from the opening line. The Over-Under, or total number of points\r\n\r\nVegas thinks will be scored, is 162, down half-a-point f

In [13]:
!wc -l data/processed/apache_issues.jsonl


10902 data/processed/apache_issues.jsonl


In [16]:
import pandas as pd
import json

# Input and output file paths
jsonl_path = "data/processed/apache_issues.jsonl"
csv_path = "data/processed/apache_issues.csv"

# Read JSONL and convert
rows = []
with open(jsonl_path, "r", encoding="utf-8") as f:
    for line in f:
        rows.append(json.loads(line))

# Convert to DataFrame and save as CSV
df = pd.DataFrame(rows)
df.to_csv(csv_path, index=False)

print("âœ… Converted to CSV â†’", csv_path)
print("ðŸ“Š Total records:", len(df))


âœ… Converted to CSV â†’ data/processed/apache_issues.csv
ðŸ“Š Total records: 10902
