In [37]:
# Check if the Jira API is accessible

In [2]:
import requests

# Common Jira API
jira_api_url = "https://issues.apache.org/jira/rest/api/2/project"

try:
    response = requests.get(jira_api_url)
    response.raise_for_status() # Raise an HTTPError for bad responses
    print("Jira API is accessible. Status code:", response.status_code)

except requests.exceptions.RequestException as e:
    print(f"Error accessing Jira API: {e}")
    print("It seems a public API might not be directly accessible or requires authentication.")

Jira API is accessible. Status code: 200


In [36]:
# Fetching and displaying information about available projects

In [4]:
import requests

jira_api_url = "https://issues.apache.org/jira/rest/api/2/project"

try:
    response = requests.get(jira_api_url)
    response.raise_for_status()
    projects = response.json()

    print(f"Found {len(projects)} projects.")
    if projects:
        print("\nKeys for the first project:")
        print(projects[0].keys())

except requests.exceptions.RequestException as e:
    print(f"Error accessing Jira API: {e}")

Found 672 projects.

Keys for the first project:
dict_keys(['expand', 'self', 'id', 'key', 'name', 'avatarUrls', 'projectTypeKey', 'archived'])


In [39]:
# Search and select Spark, Hadoop, and Kafka related projects

In [6]:
import requests

jira_api_url = "https://issues.apache.org/jira/rest/api/2/project"

try:
    response = requests.get(jira_api_url)
    response.raise_for_status()
    projects = response.json()

    spark_projects = [p for p in projects if 'spark' in p['key'].lower() or 'spark' in p['name'].lower()]
    hadoop_projects = [p for p in projects if 'hadoop' in p['key'].lower() or 'hadoop' in p['name'].lower()]
    kafka_projects = [p for p in projects if 'kafka' in p['key'].lower() or 'kafka' in p['name'].lower()]

    print(f"Found {len(spark_projects)} Spark related projects.")
    print(f"Found {len(hadoop_projects)} Hadoop related projects.")
    print(f"Found {len(kafka_projects)} Kafka related projects.")

    selected_projects = []
    if spark_projects:
        selected_projects.append(spark_projects[0])
        print(f"\nSelected Spark project: {spark_projects[0]['key']}")
    if hadoop_projects:
        selected_projects.append(hadoop_projects[0])
        print(f"Selected Hadoop project: {hadoop_projects[0]['key']}")
    if kafka_projects:
        selected_projects.append(kafka_projects[0])
        print(f"Selected Kafka project: {kafka_projects[0]['key']}")

    if selected_projects:
        print("\nSelected projects for scraping:")
        for project in selected_projects:
            print(f"- {project['key']}: {project['name']}")
    else:
        print("\nCould not find projects matching the criteria.")


except requests.exceptions.RequestException as e:
    print(f"Error accessing Jira API: {e}")

Found 1 Spark related projects.
Found 5 Hadoop related projects.
Found 1 Kafka related projects.

Selected Spark project: SPARK
Selected Hadoop project: HADOOP
Selected Kafka project: KAFKA

Selected projects for scraping:
- SPARK: Spark
- HADOOP: Hadoop Common
- KAFKA: Kafka


In [10]:
#CHECKING TO SEE IF WE CAN FETCH A SINGLE ISSUE

In [8]:
import requests
import json

issue_key = "KAFKA-1" # Placeholder key

jira_api_url = f"https://issues.apache.org/jira/rest/api/2/issue/{issue_key}"

try:
    response = requests.get(jira_api_url)
    response.raise_for_status()
    issue_data = response.json()

    print(f"Successfully fetched data for issue: {issue_key}")
    print(f"Summary: {issue_data['fields']['summary']}")
    print(f"Status: {issue_data['fields']['status']['name']}")
    print(f"Created: {issue_data['fields']['created']}")
    print(f"Updated: {issue_data['fields']['updated']}")
    print(f"Comments count: {issue_data['fields']['comment']['total']}")

except requests.exceptions.RequestException as e:
    print(f"Error fetching data for issue {issue_key}: {e}")
    print("Please ensure the issue key is valid and exists in the project.")

Successfully fetched data for issue: KAFKA-2
Summary: a restful producer API
Status: Resolved
Created: 2011-07-19T21:32:09.586+0000
Updated: 2016-08-26T22:35:11.849+0000
Comments count: 2


In [None]:
# Scraping issues with pagination, error handling, retries, and resume functionality

In [19]:
import requests
import json
import time
import os

jira_search_url = "https://issues.apache.org/jira/rest/api/2/search"

jql_query = "project=KAFKA ORDER BY key ASC"
max_results = 500

output_filename = "kafka_issues_raw.json"
progress_filename = "kafka_scrape_progress.json"

all_issues = []
start_at = 0

#Resume functionality
if os.path.exists(progress_filename):
    try:
        with open(progress_filename, 'r') as f:
            progress_data = json.load(f)
            start_at = progress_data.get('last_start_at', 0)
            print(f"Resuming scraping from startAt={start_at}")

        if os.path.exists(output_filename):
             try:
                 with open(output_filename, 'r', encoding='utf-8') as f:
                     all_issues = json.load(f)
                 print(f"Loaded {len(all_issues)} issues from previous run.")
             except json.JSONDecodeError:
                 print(f"Warning: Could not decode existing {output_filename}. Starting fresh.")
                 all_issues = []
                 start_at = 0 # Reset if file is corrupt

    except json.JSONDecodeError:
        print(f"Warning: Could not decode {progress_filename}. Starting fresh.")
        start_at = 0

# Retry parameters
max_retries = 5
retry_delay = 5

print(f"Fetching issues for JQL query: {jql_query}")

while True:
    params = {
        'jql': jql_query,
        'startAt': start_at,
        'maxResults': max_results
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(jira_search_url, params=params)
            response.raise_for_status()
            search_results = response.json()

            issues = search_results.get('issues', [])
            total_issues = search_results.get('total', 0)

            all_issues.extend(issues)

            print(f"Fetched {len(issues)} issues from startAt={start_at}. Total issues found so far: {len(all_issues)}")

            #Save progress and data periodically
            if len(issues) > 0 or start_at == 0:
                try:
                    # Save scraped data
                    with open(output_filename, 'w', encoding='utf-8') as f:
                        json.dump(all_issues, f, indent=4)

                    # Save progress
                    progress_data = {'last_start_at': start_at + len(issues)}
                    with open(progress_filename, 'w') as f:
                        json.dump(progress_data, f)

                except IOError as e:
                    print(f"Warning: Could not save progress or data: {e}")

            if (start_at + max_results) >= total_issues:
                print("\nFinished fetching all issues.")
                break
            else:
                start_at += max_results
                time.sleep(1)
                break

        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt + 1} of {max_retries} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            else:
                print(f"Max retries reached for batch starting at {start_at}. Skipping this batch.")
                total_issues = 0
                break
        except json.JSONDecodeError:
            print("Error decoding JSON response.")
            print(f"Skipping batch starting at {start_at} due to JSON error.")
            break

    if (start_at + max_results) >= total_issues and total_issues > 0:
        break

print(f"\nFinished fetching. Total issues collected: {len(all_issues)}")

Resuming scraping from startAt=18500
Loaded 18500 issues from previous run.
Fetching issues for JQL query: project=KAFKA ORDER BY key ASC
Fetched 84 issues from startAt=18500. Total issues found so far: 18584

Finished fetching all issues.

Finished fetching. Total issues collected: 18584


In [15]:
import json

output_filename = "kafka_issues_raw.json"
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(all_issues, f, indent=4)

    print(f"Successfully saved scraped data to {output_filename}")

except IOError as e:
    print(f"Error saving data to file: {e}")

Successfully saved scraped data to kafka_issues_raw.json


In [30]:
import json

raw_data_path = "/content/kafka_issues_raw.json"
try:
    with open(raw_data_path, 'r', encoding='utf-8') as f:
        kafka_issues_raw = json.load(f)

    print(f"Successfully loaded raw data from {raw_data_path}")
    print(f"Number of issues loaded: {len(kafka_issues_raw)}")

    spark_issues_raw = kafka_issues_raw

except FileNotFoundError:
    print(f"Error: The file {raw_data_path} was not found.")
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {raw_data_path}. The file might be corrupted.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Successfully loaded raw data from /content/kafka_issues_raw.json
Number of issues loaded: 18584


In [22]:
import json

if 'spark_issues_raw' in locals() and len(spark_issues_raw) > 0:
    sample_issue = spark_issues_raw[0]

    print("Keys of a sample issue:")
    print(sample_issue.keys())

    if 'fields' in sample_issue:
        print("\nKeys within the 'fields' section:")
        print(sample_issue['fields'].keys())

        print("\nStructure of key fields:")
        if 'summary' in sample_issue['fields']:
            print(f"- Summary: {type(sample_issue['fields']['summary'])}")
        if 'description' in sample_issue['fields']:
            print(f"- Description: {type(sample_issue['fields']['description'])}")
        if 'comment' in sample_issue['fields'] and 'comments' in sample_issue['fields']['comment']:
             print(f"- Comments: {type(sample_issue['fields']['comment']['comments'])} containing {len(sample_issue['fields']['comment']['comments'])} comments (if any)")
             if len(sample_issue['fields']['comment']['comments']) > 0:
                 print(f"  - Structure of a sample comment: {type(sample_issue['fields']['comment']['comments'][0])}")
                 print(f"  - Keys of a sample comment: {sample_issue['fields']['comment']['comments'][0].keys()}")


else:
    print("Raw data not loaded or empty. Please run the previous cell to load the data.")

Keys of a sample issue:
dict_keys(['expand', 'id', 'self', 'key', 'fields'])

Keys within the 'fields' section:
dict_keys(['fixVersions', 'resolution', 'customfield_12312322', 'customfield_12312323', 'customfield_12310420', 'customfield_12312320', 'customfield_12312321', 'customfield_12312328', 'customfield_12312329', 'customfield_12312326', 'customfield_12312327', 'customfield_12310300', 'customfield_12312324', 'customfield_12312720', 'customfield_12312325', 'lastViewed', 'priority', 'labels', 'customfield_12312333', 'customfield_12312334', 'customfield_12313422', 'customfield_12312331', 'customfield_12310310', 'customfield_12312332', 'aggregatetimeoriginalestimate', 'timeestimate', 'customfield_12312330', 'versions', 'customfield_12311120', 'customfield_12313826', 'issuelinks', 'customfield_12312339', 'customfield_12313825', 'assignee', 'customfield_12312337', 'customfield_12313823', 'customfield_12312338', 'customfield_12311920', 'customfield_12313822', 'customfield_12312335', 'cust

In [40]:
# Extract, clean, and derive tasks from issue data

In [41]:
import re

transformed_issues = []

if 'spark_issues_raw' in locals() and len(spark_issues_raw) > 0:
    print(f"Starting extraction, cleaning, and task derivation for {len(spark_issues_raw)} issues.")

    for issue in spark_issues_raw:
        issue_data = {}
        issue_data['key'] = issue.get('key')

        fields = issue.get('fields', {})

        issue_data['summary'] = fields.get('summary')
        issue_data['description'] = fields.get('description')

        #status
        status = fields.get('status', {})
        issue_data['status'] = status.get('name') if status else None

        #reporter
        reporter = fields.get('reporter', {})
        issue_data['reporter'] = reporter.get('displayName') if reporter else None

        #dates
        issue_data['created'] = fields.get('created')
        issue_data['updated'] = fields.get('updated')

        if issue_data['description']:
            issue_data['description'] = re.sub(r'<.*?>', '', issue_data['description'])
            issue_data['description'] = issue_data['description'].strip()

        if issue_data['summary']:
             issue_data['summary'] = re.sub(r'<.*?>', '', issue_data['summary'])
             issue_data['summary'] = issue_data['summary'].strip()

        #Process comments
        comments_data = fields.get('comment', {})
        comment_list = comments_data.get('comments', [])
        issue_data['comments'] = []
        for comment in comment_list:
            comment_body = comment.get('body')
            if comment_body:
                cleaned_comment = re.sub(r'<.*?>', '', comment_body)
                cleaned_comment = cleaned_comment.strip()
                issue_data['comments'].append(cleaned_comment)

        #Derive Tasks
        issue_data['summarization_task'] = {
            "instruction": "Summarize the following Jira issue:",
            "input": f"Summary: {issue_data.get('summary', '')}\nDescription: {issue_data.get('description', '')}\nComments: {' '.join(issue_data.get('comments', []))}",
            "output": "Generated summary goes here."
        }

        #Classification Task
        issue_data['classification_task'] = {
            "instruction": "Classify the type of the following Jira issue:",
            "input": f"Summary: {issue_data.get('summary', '')}\nDescription: {issue_data.get('description', '')}",
            "output": issue_data.get('status', 'Unknown')
        }

        #Question Answering Task
        issue_data['qna_task'] = {
            "instruction": "Answer the following question based on the Jira issue:",
            "input": f"Issue: Summary: {issue_data.get('summary', '')}\nDescription: {issue_data.get('description', '')}\nComments: {' '.join(issue_data.get('comments', []))}\nQuestion: What is the main problem described in this issue?", # Replace with actual generated question
            "output": issue_data.get('summary', 'No summary provided.')
        }


        transformed_issues.append(issue_data)

    print(f"Finished extraction, cleaning, and task derivation. Created {len(transformed_issues)} transformed issue entries.")

else:
    print("Raw data not loaded or empty. Please run the loading cell first.")

Starting extraction, cleaning, and task derivation for 18584 issues.
Finished extraction, cleaning, and task derivation. Created 18584 transformed issue entries.


In [25]:
import json

if 'transformed_issues' in locals() and transformed_issues:
    print("Sample of transformed issue data:")
    print(json.dumps(transformed_issues[0], indent=4))
else:
    print("Transformed data not available. Please run the previous transformation steps.")

Sample of transformed issue data:
{
    "key": "SPARK-290",
    "summary": "Use SPARK_MASTER_IP if it is set in start-slaves.sh.",
    "description": "Also check to prompt user if it is not set and the script cannot figure out the master's ip.",
    "status": "Resolved",
    "reporter": "Reynold Xin",
    "created": "0012-10-19T00:09:00.000+0000",
    "updated": "2012-10-19T22:50:34.000+0000",
    "comments": []
}


In [31]:
import json

if 'transformed_issues' in locals() and transformed_issues:
    output_jsonl_filename = "kafka_issues_transformed.jsonl"

    try:
        with open(output_jsonl_filename, 'w', encoding='utf-8') as f:
            for issue in transformed_issues:
                #each issue as a JSON object on a new line
                f.write(json.dumps(issue, ensure_ascii=False) + '\n')

        print(f"Successfully saved transformed data to {output_jsonl_filename}")

    except IOError as e:
        print(f"Error saving transformed data to file: {e}")
else:
    print("Transformed data not available. Please run the transformation steps first.")

Successfully saved transformed data to kafka_issues_transformed.jsonl


In [32]:
import os

spark_file = "spark_issues_transformed.jsonl"
hadoop_file = "hadoop_issues_transformed.jsonl"
kafka_file = "kafka_issues_transformed.jsonl"

files = [spark_file, hadoop_file, kafka_file]

print("File sizes of transformed data:")

for file in files:
    if os.path.exists(file):
        size_in_bytes = os.path.getsize(file)
        size_in_kb = size_in_bytes / 1024
        size_in_mb = size_in_kb / 1024
        print(f"- {file}: {size_in_bytes} bytes ({size_in_kb:.2f} KB, {size_in_mb:.2f} MB)")
    else:
        print(f"- {file}: Not found")

File sizes of transformed data:
- spark_issues_transformed.jsonl: 284047402 bytes (277390.04 KB, 270.89 MB)
- hadoop_issues_transformed.jsonl: 284047402 bytes (277390.04 KB, 270.89 MB)
- kafka_issues_transformed.jsonl: 284047402 bytes (277390.04 KB, 270.89 MB)


In [33]:
import json
import os

output_combined_filename = "all_issues_transformed.jsonl"
input_files = [
    "spark_issues_transformed.jsonl",
    "hadoop_issues_transformed.jsonl",
    "kafka_issues_transformed.jsonl"
]

print(f"Starting to combine transformed data into {output_combined_filename}")

try:
    with open(output_combined_filename, 'w', encoding='utf-8') as outfile:
        for input_file in input_files:
            if os.path.exists(input_file):
                print(f"Reading from {input_file}...")
                with open(input_file, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        outfile.write(line)
            else:
                print(f"Warning: Input file not found: {input_file}. Skipping.")

    print(f"Successfully combined data into {output_combined_filename}")

except IOError as e:
    print(f"Error combining data: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Starting to combine transformed data into all_issues_transformed.jsonl
Reading from spark_issues_transformed.jsonl...
Reading from hadoop_issues_transformed.jsonl...
Reading from kafka_issues_transformed.jsonl...
Successfully combined data into all_issues_transformed.jsonl


In [34]:
import os

combined_file = "all_issues_transformed.jsonl"

print(f"Checking the size of the combined file: {combined_file}")

if os.path.exists(combined_file):
    size_in_bytes = os.path.getsize(combined_file)
    size_in_kb = size_in_bytes / 1024
    size_in_mb = size_in_kb / 1024
    print(f"- {combined_file}: {size_in_bytes} bytes ({size_in_kb:.2f} KB, {size_in_mb:.2f} MB)")
else:
    print(f"- {combined_file}: Not found")

Checking the size of the combined file: all_issues_transformed.jsonl
- all_issues_transformed.jsonl: 852142206 bytes (832170.12 KB, 812.67 MB)


In [35]:
import json

combined_file = "all_issues_transformed.jsonl"

print(f"\nDisplaying sample entries from {combined_file}:")

try:
    with open(combined_file, 'r', encoding='utf-8') as f:
        for i in range(5):
            line = f.readline()
            if not line:
                break
            try:
                issue_data = json.loads(line)
                print(f"\n--- Sample Issue {i+1} ---")
                print(json.dumps(issue_data, indent=4))
            except json.JSONDecodeError:
                print(f"Error decoding JSON on line {i+1}. Line content: {line.strip()}")

except FileNotFoundError:
    print(f"Error: The file {combined_file} was not found.")
except Exception as e:
    print(f"An unexpected error occurred while reading the file: {e}")


Displaying sample entries from all_issues_transformed.jsonl:

--- Sample Issue 1 ---
{
    "key": "SPARK-290",
    "summary": "Use SPARK_MASTER_IP if it is set in start-slaves.sh.",
    "description": "Also check to prompt user if it is not set and the script cannot figure out the master's ip.",
    "status": "Resolved",
    "reporter": "Reynold Xin",
    "created": "0012-10-19T00:09:00.000+0000",
    "updated": "2012-10-19T22:50:34.000+0000",
    "comments": [],
    "summarization_task": {
        "instruction": "Summarize the following Jira issue:",
        "input": "Summary: Use SPARK_MASTER_IP if it is set in start-slaves.sh.\nDescription: Also check to prompt user if it is not set and the script cannot figure out the master's ip.\nComments: ",
        "output": "Generated summary goes here."
    },
    "classification_task": {
        "instruction": "Classify the type of the following Jira issue:",
        "input": "Summary: Use SPARK_MASTER_IP if it is set in start-slaves.sh.\nD