# Data Extraction

## Issues

In [9]:
import logging
import os
import re
import time
from datetime import datetime, timedelta, timezone

import pandas as pd
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm

In [26]:
# Open the target repository through its organization.
# NOTE(review): `secret`, `org_name` and `repo_name` are not assigned in any
# visible cell — presumably set in an earlier/removed cell; TODO confirm.
g = Github(secret)
org = g.get_organization(org_name)
repo = org.get_repo(repo_name)

# Repository name with periods and underscores removed, lower-cased;
# used later when building output file names.
reponame_noperiod = repo_name.replace(".", "").replace("_", "").lower()

In [32]:
# Display the client's current rate-limit tuple
# (presumably (remaining, limit) — TODO confirm against the PyGithub docs).
g.rate_limiting

(3892, 5000)

### Functions

In [21]:
def check_rate_limit(g):
    """Report how much of the core API quota is left.

    Args:
        g: Client object exposing ``get_rate_limit()``.

    Returns:
        tuple: ``(remaining, reset)`` taken from ``g.get_rate_limit().core``;
        ``reset`` is passed through unchanged.
    """
    core = g.get_rate_limit().core
    return core.remaining, core.reset

def wait_if_needed(g, threshold=10):
    """Sleep until the rate limit resets when few API calls remain.

    Args:
        g: Client object forwarded to ``check_rate_limit``.
        threshold: Sleep only when fewer than this many calls remain.

    Returns:
        None. Progress is reported with ``print``.
    """
    remaining, reset_time = check_rate_limit(g)
    if remaining >= threshold:
        return

    # Compare in UTC. A naive reset time is assumed to already be UTC (older
    # PyGithub releases returned naive UTC values — TODO confirm for the
    # installed version); newer releases return timezone-aware datetimes,
    # which cannot be subtracted from a naive local "now" at all.
    if reset_time.tzinfo is None:
        reset_time = reset_time.replace(tzinfo=timezone.utc)
    now_utc = datetime.now(timezone.utc)

    # Seconds until the reset, plus a 60 second safety buffer.
    sleep_duration = (reset_time - now_utc).total_seconds() + 60

    if sleep_duration > 0:
        print(f"Approaching rate limit. Sleeping for {sleep_duration:.1f} seconds.")
        time.sleep(sleep_duration)
        print("Resuming execution.")


def extract_mentions(text):
    """Return the logins @-mentioned in ``text``, in order of appearance.

    Args:
        text: Comment or body text; may be ``None`` or empty.

    Returns:
        list[str]: Mentioned names without the leading ``@``; empty when
        ``text`` is falsy or contains no mention.
    """
    if not text:
        return []
    # Hyphens are part of the name (so "@foo-bar" is not cut to "foo"), and
    # an "@" directly after a word character is skipped so that e-mail
    # addresses such as "dev@example.com" do not count as mentions.
    return re.findall(r'(?<!\w)@([a-zA-Z0-9_-]+)', text)

# Walk every issue of the repository and record one row per interaction:
# the issue itself, each comment, and each timeline event.
issues = repo.get_issues(state="all")
interactions_list = []

for issue in tqdm(issues, desc="Processing issues", unit="issue", total=issues.totalCount):
    wait_if_needed(g)  # Check rate limit before processing each issue

    # Entries that carry pull-request data are handled in the Pulls section.
    if issue.pull_request:
        continue

    # The labels belong to the issue, so build the list once and reuse it
    # for every row derived from this issue.
    issue_labels = [label.name for label in issue.labels]

    # Add the initial issue as an interaction
    interactions_list.append({
        "issues_id": issue.number,
        "id": issue.id,
        "issues_title": issue.title,
        "body": issue.body,
        "created_at": issue.created_at,
        # The author may be missing (e.g. removed accounts); store None then.
        "created_by": issue.user.login if issue.user else None,
        "labels": issue_labels,
        "interaction_type": "issue_creation",
        "recipient_user": None,
    })

    # Process all comments on this issue
    wait_if_needed(g)  # Check before getting comments
    for comment in issue.get_comments():
        wait_if_needed(g)  # Check before processing each comment

        # Extract @mentions from comment body to identify recipients;
        # only the first mention is kept as the recipient.
        mentions = extract_mentions(comment.body)

        interactions_list.append({
            "issues_id": issue.number,
            "id": comment.id,
            "issues_title": issue.title,
            "body": comment.body,
            "created_at": comment.created_at,
            "created_by": comment.user.login if comment.user else None,
            "labels": issue_labels,
            "interaction_type": "issue_comment",
            "recipient_user": mentions[0] if mentions else None,
        })

    for event in issue.get_events():
        wait_if_needed(g)
        # Every event is captured regardless of its type. An event can come
        # without an actor, so guard the login lookup instead of crashing
        # the whole extraction with an AttributeError.
        interactions_list.append({
            "issues_id": issue.number,
            "id": event.id,
            "issues_title": issue.title,
            "body": event.commit_id,
            "created_at": event.created_at,
            "created_by": event.actor.login if event.actor else None,
            "labels": issue_labels,
            "interaction_type": event.event,
            "recipient_user": None,
        })


Processing issues:   1%|          | 4/747 [00:06<19:08,  1.55s/issue]


KeyboardInterrupt: 

In [31]:
# Route log records to both a log file ("github_api.log") and the stream
# handler so progress is visible in the notebook and kept on disk.
logging.basicConfig(
    handlers=[logging.FileHandler("github_api.log"), logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def check_rate_limit(g):
    """Check the GitHub API rate limit.

    Args:
        g: Client object exposing ``get_rate_limit()``.

    Returns:
        tuple: ``(remaining, reset_time)`` from ``g.get_rate_limit().core``.
        If the lookup fails, the error is logged and the conservative pair
        ``(0, <now in UTC + 1 hour>)`` is returned instead of raising.
    """
    try:
        core = g.get_rate_limit().core
        return core.remaining, core.reset
    except Exception as e:
        logging.error(f"Error checking rate limit: {e}")
        # Conservative fallback when the limit cannot be read. The timestamp
        # is timezone-aware UTC so callers can compare it safely.
        return 0, datetime.now(timezone.utc) + timedelta(hours=1)


def wait_if_needed(g, threshold=10, max_retries=3):
    """Wait for the rate limit to reset when few calls remain, with retries.

    Args:
        g: Client object forwarded to ``check_rate_limit``.
        threshold: Minimum number of remaining calls considered safe.
        max_retries: How many check/wait rounds to attempt.

    Returns:
        bool: ``True`` when at least ``threshold`` calls are available
        (immediately or after waiting); ``False`` when that was not reached
        within ``max_retries`` rounds.
    """
    for _ in range(max_retries):
        try:
            remaining, reset_time = check_rate_limit(g)
            if remaining >= threshold:
                return True  # No waiting needed

            # Compare in UTC. A naive reset time is assumed to already be
            # UTC (older PyGithub releases — TODO confirm); newer releases
            # return aware datetimes, which cannot be mixed with a naive
            # local "now".
            if reset_time.tzinfo is None:
                reset_time = reset_time.replace(tzinfo=timezone.utc)
            current_time = datetime.now(timezone.utc)

            # Seconds until the reset, plus a 5 second buffer.
            sleep_duration = (reset_time - current_time).total_seconds() + 5

            if sleep_duration > 0:
                logging.warning(f"Approaching rate limit. Sleeping until {reset_time} plus buffer (about {sleep_duration/60:.1f} minutes)")

                # For long waits, sleep in shorter intervals and log progress
                if sleep_duration > 300:  # If more than 5 minutes
                    chunks = min(int(sleep_duration / 60), 30)  # At most 30 chunks
                    chunk_time = sleep_duration / chunks

                    for i in range(chunks):
                        time.sleep(chunk_time)
                        logging.info(f"Still waiting... {((i+1)/chunks*100):.0f}% complete")
                else:
                    time.sleep(sleep_duration)

                logging.info("Wait complete. Verifying rate limit before resuming...")
                new_remaining, _reset = check_rate_limit(g)
                # Same boundary as the pre-wait check: exactly `threshold`
                # remaining calls counts as restored.
                if new_remaining >= threshold:
                    logging.info(f"Rate limit restored. {new_remaining} calls available.")
                    return True
                logging.warning(f"Rate limit not restored yet. Current remaining: {new_remaining}")
                # Falls through to the next retry round, if any is left.

        except Exception as e:
            logging.error(f"Error during rate limit wait: {e}")
            time.sleep(60)  # Sleep a bit on error

    logging.critical("Failed to restore rate limit after maximum retries")
    return False  # Indicate failure


In [None]:
# Get all issues (excluding PRs) and record one row per interaction:
# the issue itself, each comment, and each timeline event.
# Start from an empty list so a failed start cannot leave results from an
# earlier run in `interactions_list`.
interactions_list = []

if wait_if_needed(g, threshold=15):
    issues = repo.get_issues(state="all")

    for issue in tqdm(issues, desc="Processing issues", unit="issue", total=issues.totalCount):
        wait_if_needed(g)  # Check rate limit before processing each issue

        # Entries that carry pull-request data are handled in the Pulls section.
        if issue.pull_request:
            continue

        # The labels belong to the issue, so build the list once and reuse
        # it for every row derived from this issue.
        issue_labels = [label.name for label in issue.labels]

        # Add the initial issue as an interaction
        interactions_list.append({
            "issues_id": issue.number,
            "id": issue.id,
            "issues_title": issue.title,
            "body": issue.body,
            "created_at": issue.created_at,
            # The author may be missing (e.g. removed accounts); store None then.
            "created_by": issue.user.login if issue.user else None,
            "labels": issue_labels,
            "interaction_type": "issue_creation",
            "recipient_user": None,
        })

        # Process all comments on this issue
        wait_if_needed(g)  # Check before getting comments
        for comment in issue.get_comments():
            wait_if_needed(g)  # Check before processing each comment

            # Extract @mentions from comment body to identify recipients;
            # only the first mention is kept as the recipient.
            mentions = extract_mentions(comment.body)

            interactions_list.append({
                "issues_id": issue.number,
                "id": comment.id,
                "issues_title": issue.title,
                "body": comment.body,
                "created_at": comment.created_at,
                "created_by": comment.user.login if comment.user else None,
                "labels": issue_labels,
                "interaction_type": "issue_comment",
                "recipient_user": mentions[0] if mentions else None,
            })

        for event in issue.get_events():
            wait_if_needed(g)
            # Every event is captured regardless of its type. An event can
            # come without an actor, so guard the login lookup instead of
            # aborting the extraction with an AttributeError.
            interactions_list.append({
                "issues_id": issue.number,
                "id": event.id,
                "issues_title": issue.title,
                "body": event.commit_id,
                "created_at": event.created_at,
                "created_by": event.actor.login if event.actor else None,
                "labels": issue_labels,
                "interaction_type": event.event,
                "recipient_user": None,
            })
else:
    # Make the failure visible instead of silently skipping the extraction.
    logging.critical("Initial rate limit check failed, cannot proceed")


Processing issues:   7%|▋         | 455/6819 [06:17<1:27:59,  1.21issue/s]


KeyboardInterrupt: 

In [None]:
# Create DataFrame from interactions list
interactions_df = pd.DataFrame(interactions_list)

# Convert lists to strings for proper storage
interactions_df['labels'] = interactions_df['labels'].apply(lambda x: ', '.join(x) if x else '')
# Keep only printable ASCII (0x20-0x7E) in the body; this also drops
# newlines, tabs and any non-ASCII text.
interactions_df['body'] = interactions_df['body'].apply(lambda x: re.sub(r'[^\x20-\x7E]', '', str(x)) if pd.notnull(x) else x)

# Excel cannot store timezone-aware datetimes, so normalise to UTC and drop
# the timezone; naive values are treated as UTC and stay unchanged.
interactions_df['created_at'] = pd.to_datetime(interactions_df['created_at'], utc=True).dt.tz_localize(None)

# Save to Excel (create the output folder first so the write cannot fail on
# a missing directory)
os.makedirs("Files", exist_ok=True)
interactions_df.to_excel(f"Files/{reponame_noperiod}_issue_interactions.xlsx", index=False)
interactions_df

## Pulls

In [None]:


def extract_mentions(text):
    """Return the logins @-mentioned in ``text``, in order of appearance.

    Args:
        text: Comment or body text; may be ``None`` or empty.

    Returns:
        list[str]: Mentioned names without the leading ``@``; empty when
        ``text`` is falsy or contains no mention.
    """
    if not text:
        return []
    # An "@" directly after a word character is skipped so that e-mail
    # addresses such as "dev@example.com" are not reported as mentions.
    return re.findall(r'(?<!\w)@([a-zA-Z0-9_-]+)', text)

In [30]:
# Collect one row per pull-request interaction (PR creation and each commit).
# The list is created up front so it always exists — and never holds rows
# from an earlier run — even when the rate-limit check or the listing fails.
interactions_list_pulls = []

try:
    if wait_if_needed(g, threshold=15):
        pulls = repo.get_pulls(state='all')

        for pr in tqdm(pulls, desc="Processing PRs", unit="PR", total=pulls.totalCount):
            try:
                if not wait_if_needed(g):
                    logging.error(f"Rate limit wait failed for PR #{pr.number}, skipping")
                    continue

                # The labels belong to the PR, so build the list once and
                # reuse it for every row derived from this PR.
                pr_labels = [label.name for label in pr.labels]

                # PR creation
                interactions_list_pulls.append({
                    "pulls_id": pr.number,
                    "id": pr.id,
                    "pulls_title": pr.title,
                    "body": pr.body,
                    "created_at": pr.created_at,
                    # The author may be missing (e.g. removed accounts).
                    "created_by": pr.user.login if pr.user else None,
                    "labels": pr_labels,
                    "interaction_type": "pr_creation",
                    "recipient_user": None,
                })

                # PR commits: a failure here is logged and the PR row above
                # is kept; remaining commits of this PR are dropped.
                try:
                    commits = pr.get_commits()
                    for commit in commits:
                        if not wait_if_needed(g):
                            logging.error(f"Rate limit wait failed for commit in PR #{pr.number}, skipping remaining commits")
                            break
                        interactions_list_pulls.append({
                            "pulls_id": pr.number,
                            "id": commit.sha,
                            "pulls_title": pr.title,
                            "body": commit.commit.message,
                            "created_at": commit.commit.author.date,
                            "created_by": commit.commit.author.name,
                            "labels": pr_labels,
                            "interaction_type": "pr_commit",
                            "recipient_user": None,
                        })
                except Exception as e:
                    logging.error(f"Error processing commits for PR #{pr.number}: {e}")

                # Similar pattern for other sections...

            except Exception as e:
                logging.error(f"Error processing PR #{pr.number}: {e}")
                # Maybe wait a bit before continuing
                time.sleep(10)
    else:
        logging.critical("Initial rate limit check failed, cannot proceed")
except Exception as e:
    # Errors raised outside the per-PR handler (e.g. while listing or paging
    # through the pull requests) end the run; rows gathered so far are kept.
    logging.critical(f"Fatal error in PR processing: {e}")

Processing PRs:  15%|█▌        | 343/2270 [05:44<31:41,  1.01PR/s]  2025-03-06 02:30:07,981 - ERROR - Error processing commits for PR #6297: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Processing PRs:  15%|█▌        | 344/2270 [6:02:48<3438:53:29, 6427.83s/PR]2025-03-06 02:30:07,997 - ERROR - Error checking rate limit: HTTPSConnectionPool(host='api.github.com', port=443): Max retries exceeded with url: /rate_limit (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002B753C2D250>: Failed to resolve 'api.github.com' ([Errno 11001] getaddrinfo failed)"))
2025-03-06 06:32:26,055 - INFO - Still waiting... 3% complete
2025-03-06 06:34:28,057 - INFO - Still waiting... 7% complete
2025-03-06 06:36:30,059 - INFO - Still waiting... 10% complete
2025-03-06 06:38:32,061 - INFO - Still waiting... 13% complete
2025-03-06 06:40:34,063 - INFO - Still waiting... 17% complete
2025-03-06 06:42:36,065 - INFO - Still waiting..

KeyboardInterrupt: 

In [None]:
# Create DataFrame from interactions list
interactions_df = pd.DataFrame(interactions_list_pulls)

# Convert lists to strings for proper storage
interactions_df['labels'] = interactions_df['labels'].apply(lambda x: ', '.join(x) if x else '')
# Keep only printable ASCII (0x20-0x7E) in the body; this also drops
# newlines, tabs and any non-ASCII text.
interactions_df['body'] = interactions_df['body'].apply(lambda x: re.sub(r'[^\x20-\x7E]', '', str(x)) if pd.notnull(x) else x)

# Excel cannot store timezone-aware datetimes, so normalise to UTC and drop
# the timezone; naive values are treated as UTC and stay unchanged.
interactions_df['created_at'] = pd.to_datetime(interactions_df['created_at'], utc=True).dt.tz_localize(None)

# Save to Excel
interactions_df.to_excel(f"{reponame_noperiod}_interactions_pulls.xlsx", index=False)
interactions_df