In [12]:
import os 
import numpy as np 
import pandas as pd 
import requests 
import sys
from dotenv import load_dotenv
import json

In [13]:
# Directory (relative to the notebook) holding the input CSV and every
# artifact this notebook writes.
data_path = "data"

# Load the sampled issues; later cells read the `repository_url` and
# `issue_number` columns from this frame.
sampled_issues_csv = os.path.join(data_path, "sampled_issues_100.csv")
df = pd.read_csv(sampled_issues_csv)


In [14]:
def get_github_token():
    """
    Returns the GitHub API token from the ``GITHUB_TOKEN`` environment variable.

    ``load_dotenv()`` is called first so the variable can also be supplied
    through a ``.env`` file. If the variable is unset or empty, a message is
    printed and the process is terminated with ``sys.exit(1)``.

    Returns:
        str: the non-empty token value.
    """
    load_dotenv()
    token = os.getenv("GITHUB_TOKEN")
    # Treat an empty value (e.g. `GITHUB_TOKEN=` in .env) like a missing one:
    # it would otherwise be sent as a blank "Bearer " credential.
    if not token:
        print("GITHUB_TOKEN not found in environment variables.")
        sys.exit(1)
    return token

In [15]:
def create_github_link(repository_url, issue_number):
    """
    Builds the URL of a single issue under a repository URL.

    The result is ``<repository_url>/issues/<issue_number>``. In this notebook
    the repository URL is the REST API form
    (``https://api.github.com/repos/<owner>/<repo>``), so the result is the API
    endpoint for the issue rather than the browser (html) link.

    Args:
        repository_url: base repository URL; trailing slashes are ignored.
        issue_number: issue number, formatted as-is into the path.

    Returns:
        str: the issue URL.
    """
    # Drop trailing slashes so the result never contains "//issues/".
    base_url = str(repository_url).rstrip("/")
    return f"{base_url}/issues/{issue_number}"

In [16]:
def _fetch_all_pages(url, headers, label, timeout):
    """
    Collects every page of a paginated list endpoint into one list.

    Pages of 100 items are requested starting at page 1 until an empty page is
    returned. On a non-200 response, "Failed to fetch <label>: <status>" is
    printed and the items gathered so far are returned (best-effort).
    """
    items = []
    page = 1
    while True:
        response = requests.get(
            url,
            headers=headers,
            params={"page": page, "per_page": 100},
            timeout=timeout,
        )
        if response.status_code != 200:
            print(f"Failed to fetch {label}: {response.status_code}")
            break
        page_items = response.json()
        if not page_items:
            break
        items.extend(page_items)
        page += 1
    return items


def fetch_issue_contents(repository_url, issue_number, token, timeout=30):
    """
    Fetches comprehensive issue data including:
    - Basic issue details
    - All comments
    - All events (labels, assignments, etc.)
    - Timeline (unified view of comments and events)

    Args:
        repository_url: repository API URL the issue belongs to.
        issue_number: number of the issue within that repository.
        token: GitHub token sent as a Bearer credential.
        timeout: seconds passed to every ``requests.get`` call so a stalled
            connection raises instead of hanging the notebook.

    Returns:
        dict: keys ``'issue'``, ``'comments'``, ``'events'`` and ``'timeline'``.
        An empty dict is returned when the issue itself cannot be fetched. A
        failure while paging comments/events/timeline only truncates that
        list; it does not discard the rest of the result.
    """
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "User-Agent": "simple-github-issues-script",
    }

    # 1. Basic issue details; everything else hangs off this URL.
    issue_url = create_github_link(repository_url=repository_url, issue_number=issue_number)
    response = requests.get(issue_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch issue: {response.status_code}")
        return {}

    # 4. The timeline is requested with its own Accept header.
    # NOTE(review): this is a preview media type - confirm it is still required.
    timeline_headers = headers.copy()
    timeline_headers["Accept"] = "application/vnd.github.mockingbird-preview+json"

    # 2-4. Comments, events and timeline share the same pagination scheme.
    return {
        'issue': response.json(),
        'comments': _fetch_all_pages(f"{issue_url}/comments", headers, "comments", timeout),
        'events': _fetch_all_pages(f"{issue_url}/events", headers, "events", timeout),
        'timeline': _fetch_all_pages(f"{issue_url}/timeline", timeline_headers, "timeline", timeout),
    }


In [17]:
# Smoke test: fetch the first sampled issue and keep the raw payload on disk
# so the following cells can work from the file instead of the API.
issue = df.iloc[0]
content = fetch_issue_contents(
    issue['repository_url'],
    issue['issue_number'],
    get_github_token(),
)
print(content)

content_json_path = os.path.join(data_path, "issue_content_example.json")
serialized_content = json.dumps(content, indent=4)
with open(content_json_path, "w") as f:
    f.write(serialized_content)

{'issue': {'url': 'https://api.github.com/repos/SemBioProcess/SemGen/issues/17', 'repository_url': 'https://api.github.com/repos/SemBioProcess/SemGen', 'labels_url': 'https://api.github.com/repos/SemBioProcess/SemGen/issues/17/labels{/name}', 'comments_url': 'https://api.github.com/repos/SemBioProcess/SemGen/issues/17/comments', 'events_url': 'https://api.github.com/repos/SemBioProcess/SemGen/issues/17/events', 'html_url': 'https://github.com/SemBioProcess/SemGen/issues/17', 'id': 55046762, 'node_id': 'MDU6SXNzdWU1NTA0Njc2Mg==', 'number': 17, 'title': 'Process participant multipliers need to be doubles, not integers', 'user': {'login': 'maxneal', 'id': 3513016, 'node_id': 'MDQ6VXNlcjM1MTMwMTY=', 'avatar_url': 'https://avatars.githubusercontent.com/u/3513016?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/maxneal', 'html_url': 'https://github.com/maxneal', 'followers_url': 'https://api.github.com/users/maxneal/followers', 'following_url': 'https://api.github.com/users/maxn

In [18]:
# Reload the cached example payload written by the fetch cell.
content_json_path = os.path.join(data_path, "issue_content_example.json")
with open(content_json_path, "r") as f:
    issue_content = json.loads(f.read())

In [19]:
# Peek at the issue-level fields that the extraction helpers rely on.
issue_record = issue_content['issue']
print(issue_record['html_url'])
print(issue_record['user']['login'])
print(issue_record['author_association'])
print(issue_record['comments'])

https://github.com/SemBioProcess/SemGen/issues/17
maxneal
COLLABORATOR
4


In [20]:
# Same inspection for the first comment of the issue.
first_comment = issue_content['comments'][0]
print(first_comment['user']['login'])
print(first_comment['body'])
print(first_comment['author_association'])

thompsct
I've fixed this in the refactoring branch. Since I'm close to merging anyway, I'll bring them in with everything else.

COLLABORATOR


In [21]:
def extract_comment_info(issue_content):
    """
    Extracts relevant information from issue comments.

    Args:
        issue_content: dict with a ``'comments'`` list of comment objects, as
            produced by ``fetch_issue_contents``.

    Returns:
        list[dict]: one dict per comment, in input order, with the keys
        ``'commenter'`` (login, or ``None`` when the comment has no user),
        ``'comment_text'`` (body with surrounding whitespace stripped, ``''``
        when the body is missing) and ``'commenter_association'``.
    """
    comments_info = []
    for comment in issue_content['comments']:
        # `user` can be null (e.g. the account was deleted) and `body` can be
        # null/absent; neither should abort extraction of the whole issue.
        user = comment.get('user') or {}
        body = comment.get('body') or ''
        comments_info.append({
            'commenter': user.get('login'),
            'comment_text': body.strip(),
            'commenter_association': comment['author_association'],
        })
    return comments_info

In [22]:
def extract_issue_info(issue_content):
    """
    Extracts relevant information from fetched issue content.

    Args:
        issue_content: dict with an ``'issue'`` object and a ``'comments'``
            list, as produced by ``fetch_issue_contents``.

    Returns:
        dict: ``html_url``, ``issue_reporter`` (``None`` when the issue has no
        user), ``issue_reporter_association``, ``comment_count`` (number of
        fetched comments) and three parallel lists describing the comments:
        ``commenters``, ``comment_texts`` and ``commenter_associations``.
    """
    comment_info = extract_comment_info(issue_content)

    issue = issue_content['issue']
    # `user` can be null (e.g. the reporter's account was deleted).
    reporter = issue.get('user') or {}

    # Build the three per-comment lists in a single pass.
    commenters, comment_texts, commenter_associations = [], [], []
    for entry in comment_info:
        commenters.append(entry['commenter'])
        comment_texts.append(entry['comment_text'])
        commenter_associations.append(entry['commenter_association'])

    return {
        'html_url': issue['html_url'],
        'issue_reporter': reporter.get('login'),
        'issue_reporter_association': issue['author_association'],
        'comment_count': len(issue_content['comments']),
        'commenters': commenters,
        'comment_texts': comment_texts,
        'commenter_associations': commenter_associations,
    }

In [23]:
# Sanity-check the extraction helper on the issue loaded from the cached JSON.
issue_info = extract_issue_info(issue_content)
print("Extracted info:")
for key in issue_info:
    value = issue_info[key]
    print(f"{key}: {value}")

Extracted info:
html_url: https://github.com/SemBioProcess/SemGen/issues/17
issue_reporter: maxneal
issue_reporter_association: COLLABORATOR
comment_count: 4
commenters: ['thompsct', 'maxneal', 'thompsct', 'maxneal']
comment_texts: ["I've fixed this in the refactoring branch. Since I'm close to merging anyway, I'll bring them in with everything else.", 'If you are in the custom process editor and enter a double (as opposed to an integer) in the "Multiplier" field for a source, sink or mediator, you get an error message saying that it\'s an invalid entry.', "Sorry, I hadn't committed it yet. Try it now.", 'Yep, works now. Thanks.']
commenter_associations: ['COLLABORATOR', 'COLLABORATOR', 'COLLABORATOR', 'COLLABORATOR']


In [24]:
# Make sure every enrichment column exists before writing into it. Columns
# that are already present are left untouched.
for text_column in ('html_url', 'issue_reporter', 'issue_reporter_association'):
    if text_column not in df.columns:
        df[text_column] = None
        df[text_column] = df[text_column].astype('object')

if 'comment_count' not in df.columns:
    df['comment_count'] = None
    # Nullable integer dtype, so rows without a count can stay missing.
    df['comment_count'] = df['comment_count'].astype('Int64')

for list_column in ('commenters', 'comment_texts', 'commenter_associations'):
    if list_column not in df.columns:
        # One independent empty list per row.
        df[list_column] = [[] for _ in range(len(df))]

# Write the extracted fields into the row labelled 0 (the issue fetched above).
issue_info = extract_issue_info(issue_content)
for column_name, cell_value in issue_info.items():
    df.at[0, column_name] = cell_value

# Display the updated row
print(df.iloc[0])

repository_url                https://api.github.com/repos/SemBioProcess/SemGen
issue_number                                                                 17
title_processed               process participant multipliers need doubles n...
body_processed                data type process participant multipliers stoi...
label                                                                       bug
label_cat                                                                   bug
test_tag                                                                      0
valid                                                                     valid
html_url                      https://github.com/SemBioProcess/SemGen/issues/17
issue_reporter                                                          maxneal
issue_reporter_association                                         COLLABORATOR
comment_count                                                                 4
commenters                              

### Fetch and Update All Issues
Now we can fetch all issues and update the DataFrame with the new columns.

In [25]:
# Initialize new columns with appropriate data types
df['html_url'] = None
df['html_url'] = df['html_url'].astype('object')
df['issue_reporter'] = None
df['issue_reporter'] = df['issue_reporter'].astype('object')
df['issue_reporter_association'] = None
df['issue_reporter_association'] = df['issue_reporter_association'].astype('object')
df['comment_count'] = None
df['comment_count'] = df['comment_count'].astype('Int64')
df['commenters'] = [[] for _ in range(len(df))]
df['comment_texts'] = [[] for _ in range(len(df))]
df['commenter_associations'] = [[] for _ in range(len(df))]

# Fetch and update all issues
token = get_github_token()
failed_issues = []

for idx, row in df.iterrows():
    print(f"Fetching issue {idx + 1}/{len(df)}: {row['repository_url']} #{row['issue_number']}")
    
    try:
        issue_content = fetch_issue_contents(
            repository_url=row['repository_url'],
            issue_number=row['issue_number'],
            token=token
        )
        
        if issue_content:  # Check if fetch was successful
            issue_info = extract_issue_info(issue_content)
            for key, value in issue_info.items():
                df.at[idx, key] = value
            print(f"  ✓ Success: {issue_info['comment_count']} comments")
        else:
            failed_issues.append((idx, row['repository_url'], row['issue_number']))
            print(f"  ✗ Failed to fetch")
    except Exception as e:
        failed_issues.append((idx, row['repository_url'], row['issue_number']))
        print(f"  ✗ Error: {str(e)}")

print(f"\n\nCompleted: {len(df) - len(failed_issues)}/{len(df)} issues fetched successfully")
if failed_issues:
    print(f"Failed issues: {len(failed_issues)}")
    for idx, repo, issue_num in failed_issues:
        print(f"  - Index {idx}: {repo} #{issue_num}")

Fetching issue 1/100: https://api.github.com/repos/SemBioProcess/SemGen #17
  ✓ Success: 4 comments
Fetching issue 2/100: https://api.github.com/repos/MegaMek/megamek #273
  ✓ Success: 4 comments
Fetching issue 2/100: https://api.github.com/repos/MegaMek/megamek #273
  ✓ Success: 3 comments
Fetching issue 3/100: https://api.github.com/repos/eclipse/vorto #1809
  ✓ Success: 3 comments
Fetching issue 3/100: https://api.github.com/repos/eclipse/vorto #1809
  ✓ Success: 0 comments
Fetching issue 4/100: https://api.github.com/repos/JHUAPL/AccumuloGraph #9
  ✓ Success: 0 comments
Fetching issue 4/100: https://api.github.com/repos/JHUAPL/AccumuloGraph #9
  ✓ Success: 0 comments
Fetching issue 5/100: https://api.github.com/repos/spring-cloud/spring-cloud-netflix #673
  ✓ Success: 0 comments
Fetching issue 5/100: https://api.github.com/repos/spring-cloud/spring-cloud-netflix #673
  ✓ Success: 8 comments
Fetching issue 6/100: https://api.github.com/repos/wicketforge/wicketforge #146
  ✓ Success:

In [26]:
# Display summary statistics
print("Summary Statistics:")
print(f"Total issues: {len(df)}")
print(f"Issues with data: {df['html_url'].notna().sum()}")
print(f"\nComment count statistics:")
print(df['comment_count'].describe())
print(f"\nIssue reporter associations:")
print(df['issue_reporter_association'].value_counts())

Summary Statistics:
Total issues: 100
Issues with data: 94

Comment count statistics:
count        94.0
mean     3.202128
std      5.623644
min           0.0
25%           0.0
50%           1.0
75%           4.0
max          38.0
Name: comment_count, dtype: Float64

Issue reporter associations:
issue_reporter_association
NONE            38
CONTRIBUTOR     19
OWNER           16
COLLABORATOR    12
MEMBER           9
Name: count, dtype: int64


In [27]:
# Save the updated DataFrame
output_path = os.path.join(data_path, "sampled_issues_100_enhanced.csv")
df.to_csv(output_path, index=False)
print(f"Enhanced dataset saved to: {output_path}")

Enhanced dataset saved to: data\sampled_issues_100_enhanced.csv


In [28]:
# Reload the enriched dataset from disk and list its columns.
# NOTE(review): after the CSV round trip `comment_count` is displayed as a
# float further down, and the list-valued columns presumably come back as
# plain strings - confirm before relying on their types.
enhanced_csv_path = os.path.join(data_path, "sampled_issues_100_enhanced.csv")
df = pd.read_csv(enhanced_csv_path)
df.columns

Index(['repository_url', 'issue_number', 'title_processed', 'body_processed',
       'label', 'label_cat', 'test_tag', 'valid', 'html_url', 'issue_reporter',
       'issue_reporter_association', 'comment_count', 'commenters',
       'comment_texts', 'commenter_associations'],
      dtype='object')

In [31]:
# html links of the issues labelled "invalid" (single .loc lookup instead of
# chained indexing).
df.loc[df["valid"] == "invalid", "html_url"]

44    https://github.com/chorhatarahuduketuri/gc/iss...
45    https://github.com/MateuszKubuszok/JSDPU/issues/4
46              https://github.com/hi3g/freki/issues/13
47    https://github.com/dimagi/commcare-android/pul...
48    https://github.com/spring-projects/spring-petc...
49     https://github.com/doanduyhai/Achilles/issues/22
50    https://github.com/David-Bromell/Android-Devel...
51     https://github.com/cryptic-game/server/issues/31
52                                                  NaN
53    https://github.com/JamesPierce82/RPGClubApp/is...
54        https://github.com/eclipse-milo/milo/pull/643
55    https://github.com/opennetworkinglab/flowvisor...
56    https://github.com/thelivelock/subscribe-lambd...
57    https://github.com/rascarlo/ArchPackages/issues/2
58    https://github.com/neo4j/neo4j-java-driver-spr...
59    https://github.com/spring-cloud/spring-cloud-r...
60    https://github.com/mguenther/kafka-junit/issue...
61    https://github.com/BorderTech/wcomponents/

In [32]:
# Collect every html link as a plain list, print them one per line, then
# report how many there are.
all_links = df['html_url'].tolist()

print("All issue links:")
for url in all_links:
    print(url)

print(f"\nTotal links: {len(all_links)}")

All issue links:
https://github.com/SemBioProcess/SemGen/issues/17
https://github.com/MegaMek/megamek/issues/273
https://github.com/eclipse-vorto/vorto/issues/1809
https://github.com/JHUAPL/AccumuloGraph/issues/9
https://github.com/spring-cloud/spring-cloud-netflix/issues/673
https://github.com/wicketforge/wicketforge/issues/146
https://github.com/stefan-niedermann/nextcloud-deck/issues/169
https://github.com/bentocorp/android/issues/142
https://github.com/MarcusWolschon/osmeditor4android/issues/648
https://github.com/inaturalist/iNaturalistAndroid/issues/225
https://github.com/lokka30/PhantomCombat/issues/31
https://github.com/CyclopsMC/IntegratedDynamics/issues/166
https://github.com/mett29/ing-sw-2018-lambertucci-losavio-mancassola/issues/9
https://github.com/1908-aug19-java/Project2-Samuel-Chris-Kyle-BackEnd/pull/11
https://github.com/Rajawali/Rajawali/issues/993
https://github.com/imixs/imixs-workflow/issues/396
https://github.com/google/closure-compiler/issues/3178
https://github

In [37]:
# Access links for specific conditions: split the html links by the `valid`
# label. Rows whose fetch failed have no link and print as `nan`.
is_invalid = df["valid"] == "invalid"
is_valid = df["valid"] == "valid"

invalid_links = df.loc[is_invalid, "html_url"].tolist()
# The header previously claimed the list was also restricted to commented
# issues, but no such filter is applied; the list holds every invalid issue.
print("Invalid issue links:")
for url in invalid_links:
    print(url)

valid_links = df.loc[is_valid, "html_url"].tolist()
print(f"\nValid issues: {len(valid_links)}")
print(f"Invalid issues: {len(invalid_links)}")

Invalid and having commented issue links:
https://github.com/chorhatarahuduketuri/gc/issues/63
https://github.com/MateuszKubuszok/JSDPU/issues/4
https://github.com/hi3g/freki/issues/13
https://github.com/dimagi/commcare-android/pull/2124
https://github.com/spring-projects/spring-petclinic/pull/182
https://github.com/doanduyhai/Achilles/issues/22
https://github.com/David-Bromell/Android-Development-Project/issues/16
https://github.com/cryptic-game/server/issues/31
nan
https://github.com/JamesPierce82/RPGClubApp/issues/2
https://github.com/eclipse-milo/milo/pull/643
https://github.com/opennetworkinglab/flowvisor/issues/30
https://github.com/thelivelock/subscribe-lambda/pull/3
https://github.com/rascarlo/ArchPackages/issues/2
https://github.com/neo4j/neo4j-java-driver-spring-boot-starter/issues/7
https://github.com/spring-cloud/spring-cloud-release-tools/issues/5
https://github.com/mguenther/kafka-junit/issues/17
https://github.com/BorderTech/wcomponents/issues/179
https://github.com/Slee

In [34]:
# Look up individual fields of the first row by position.
first_row = df.iloc[0]
print(f"First issue link: {first_row['html_url']}")
print(f"Issue reporter: {first_row['issue_reporter']}")
print(f"Comment count: {first_row['comment_count']}")

First issue link: https://github.com/SemBioProcess/SemGen/issues/17
Issue reporter: maxneal
Comment count: 4.0
