# Data Extraction

## Issues

In [None]:
import pandas as pd
import re
import os
import time
import logging
import numpy as np
from dotenv import load_dotenv
from github import Github
from tqdm import tqdm
from datetime import datetime, timedelta
from collections import Counter


In [None]:
#sams key 2 
# 
# sams key 1
# 
secret= ""
org_name = "Rdatatable"
repo_name = "data.table"

In [53]:
g = Github(secret)
org = g.get_organization(org_name)
repo = org.get_repo(repo_name)

reponame_noperiod = repo_name.replace(".", "").replace("_", "").lower()


In [54]:
g.rate_limiting

(4965, 5000)

Functions

In [12]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("github_api.log"),
        logging.StreamHandler()
    ]
)
def check_rate_limit(g, retry_count=3, retry_delay=5):
    """Check GitHub API rate limit with network error handling"""
    for attempt in range(retry_count):
        try:
            rate_limit = g.get_rate_limit()
            remaining = rate_limit.core.remaining
            reset_time = rate_limit.core.reset
            return remaining, reset_time
        except Exception as e:
            if attempt < retry_count - 1:
                logging.warning(f"Network error checking rate limit (attempt {attempt+1}/{retry_count}): {e}")
                time.sleep(retry_delay * (attempt + 1))  # Exponential backoff
            else:
                logging.error(f"Failed to check rate limit after {retry_count} attempts: {e}")
                # Return conservative values if we can't check
                return 0, datetime.now() + timedelta(hours=1)

def wait_if_needed(g, threshold=50, max_retries=3):  # Increased threshold to 50
    """Wait if approaching rate limit with retry mechanism"""
    for attempt in range(max_retries):
        try:
            remaining, reset_time = check_rate_limit(g)
            
            # If we get a very low remaining count or error (returning 0)
            if remaining < threshold:
                # Convert current time to datetime for proper comparison
                current_time = datetime.now()
                
                # Calculate time difference in seconds
                sleep_duration = (reset_time - current_time).total_seconds() + 60  # Add 60 second buffer
                
                if sleep_duration > 0:
                    logging.warning(f"Approaching rate limit ({remaining} remaining). Sleeping until {reset_time} plus buffer (about {sleep_duration/60:.1f} minutes)")
                    
                    # For long waits, sleep in shorter intervals and log progress
                    if sleep_duration > 300:  # If more than 5 minutes
                        chunks = min(int(sleep_duration / 60), 30)  # At most 30 chunks
                        chunk_time = sleep_duration / chunks
                        
                        for i in range(chunks):
                            time.sleep(chunk_time)
                            logging.info(f"Still waiting... {((i+1)/chunks*100):.0f}% complete")
                    else:
                        time.sleep(sleep_duration)
                    
                    logging.info("Wait complete. Verifying rate limit before resuming...")
                    new_remaining, _ = check_rate_limit(g, retry_count=5)
                    if new_remaining > threshold:
                        logging.info(f"Rate limit restored. {new_remaining} calls available.")
                        return True
                    else:
                        logging.warning(f"Rate limit not restored yet. Current remaining: {new_remaining}")
                        # Will retry if under max_retries
            else:
                return True  # No waiting needed
                
        except Exception as e:
            logging.error(f"Error during rate limit wait: {e}")
            time.sleep(60)  # Sleep a bit on error
    
    logging.critical("Failed to restore rate limit after maximum retries")
    return False  # Indicate failure


def extract_mentions(text):
    if not text:
        return []
    return re.findall(r'@([a-zA-Z0-9_-]+)', text)

## Find Key Devolpers

In [None]:

# Load the CSV file with org and repo columns
df = pd.read_csv("repos.csv")

# Define what "recently active" means - e.g., last 3 months
cutoff_date = datetime.now() - timedelta(days=90)

# Prepare a list to store results
active_devs = []

# Iterate over each repo
for index, row in df.iterrows():
    org_name = row['org']
    repo_name = row['repo']
    
    try:
        print(f"Processing {org_name}/{repo_name}...")
        repo = g.get_repo(f"{org_name}/{repo_name}")
        
        # First get ALL contributors to make sure we don't miss top ones
        all_contributors = list(repo.get_contributors())
        all_contributors.sort(key=lambda x: x.contributions, reverse=True)
        
        # Get recent commits to check for activity
        recent_commits = repo.get_commits(since=cutoff_date)
        
        # Track active committers
        active_committers = set()
        for commit in tqdm(recent_commits, desc="Checking recent commits"):
            # Skip commits without author
            if not commit.author:
                continue
            active_committers.add(commit.author.login)
        
        print(f"Found {len(active_committers)} active committers in last 90 days")
        
        # Process top contributors (get top 10 regardless of recent activity)
        for contributor in all_contributors[:10]:
            is_active = contributor.login in active_committers
            active_devs.append({
                "org": org_name,
                "repo": repo_name,
                "username": contributor.login,
                "total_commits": contributor.contributions,
                "is_active": is_active,
                "is_top_10": True
            })
        
        # Add remaining active contributors who aren't in the top 10
        for contributor in all_contributors[10:]:
            if contributor.login in active_committers:
                active_devs.append({
                    "org": org_name,
                    "repo": repo_name,
                    "username": contributor.login,
                    "total_commits": contributor.contributions,
                    "is_active": True,
                    "is_top_10": False
                })
        
        # Avoid rate limiting
        time.sleep(2)
        
    except Exception as e:
        print(f"Error processing {org_name}/{repo_name}: {e}")

# Convert results to DataFrame and save
result_df = pd.DataFrame(active_devs)
result_df.to_excel("active_top_contributors.xlsx", index=False)

results = []

# Iterate over each row in the CSV file
for index, row in df.iterrows():
    org_name = row['org']
    repo_name = row['repo']
    try:
        print(f"Processing {org_name}/{repo_name}...")
        repo = g.get_repo(f"{org_name}/{repo_name}")
        
        # Get all contributors, sorted by number of contributions
        contributors = list(repo.get_contributors())
        contributors.sort(key=lambda x: x.contributions, reverse=True)
        
        # Take top contributors (let's say top 10)
        top_contributors = contributors[:10]
        
        # Store their information
        for contributor in top_contributors:
            results.append({
                "org": org_name,
                "repo": repo_name,
                "username": contributor.login,
                "commits": contributor.contributions,
            })
            
        # Avoid rate limiting
        time.sleep(2)
        
    except Exception as e:
        print(f"Error processing {org_name}/{repo_name}: {e}")

# Convert results to DataFrame and save
result_df = pd.DataFrame(results)
result_df.to_excel("top_contributors.xlsx", index=False)
result_df


Processing pandas-dev/pandas...


KeyboardInterrupt: 

## Issues

In [None]:
# Load the list of repositories and top contributors
contributors_df = pd.read_excel("top_contributors.xlsx")

interactions_list = []

# Iterate over each row in contributors_df
for _, row in contributors_df.iterrows():
    org, repo_name, username = row["org"], row["repo"], row["username"]
    
    try:
        repo = g.get_repo(f"{org}/{repo_name}")
        issues = repo.get_issues(state="all")
        
        for issue in tqdm(issues, desc=f"Processing {repo_name}", unit="issue", total=issues.totalCount):
            if not issue.pull_request and issue.user.login == username:
                interactions_list.append({
                    "org": org,
                    "repo": repo_name,
                    "user": username,
                    "interaction_type": "issue_creation",
                    "issues_id": issue.number,
                    "id": issue.id,
                    "issues_title": issue.title,
                    "body": issue.body,
                    "created_at": issue.created_at,
                    "created_by": issue.user.login,
                    "labels": [label.name for label in issue.labels],
                    "recipient_user": None,
                })

                # Process comments
                for comment in issue.get_comments():
                    if comment.user.login == username:
                        mentions = extract_mentions(comment.body)
                        interactions_list.append({
                            "org": org,
                            "repo": repo_name,
                            "user": username,
                            "interaction_type": "issue_comment",
                            "issues_id": issue.number,
                            "id": comment.id,
                            "issues_title": issue.title,
                            "body": comment.body,
                            "created_at": comment.created_at,
                            "created_by": comment.user.login,
                            "labels": [label.name for label in issue.labels],
                            "recipient_user": mentions[0] if mentions else None,
                        })
                
                # Capture events
                for event in issue.get_events():
                    if event.actor and event.actor.login == username:
                        interactions_list.append({
                            "org": org,
                            "repo": repo_name,
                            "user": username,
                            "interaction_type": event.event,
                            "issues_id": issue.number,
                            "id": event.id,
                            "issues_title": issue.title,
                            "body": event.commit_id,
                            "created_at": event.created_at,
                            "created_by": event.actor.login,
                            "labels": [label.name for label in issue.labels],
                            "recipient_user": None,
                        })
    
    except Exception as e:
        print(f"Error processing {org}/{repo_name}: {e}")



Processing pandas:   0%|          | 41/61173 [00:05<2:15:19,  7.53issue/s]


KeyboardInterrupt: 

In [None]:
# Convert to DataFrame and save to CSV
interactions_df = pd.DataFrame(interactions_list)
interactions_df.to_csv("developer_interactions.csv", index=False)
print("Data collection complete. Saved to developer_interactions.csv")

In [None]:
input_df = pd.read_excel("active_top_contributors.xlsx")

# Prepare a list to store all interactions
all_interactions = []

# Group by repository to process each repo once
for (org, repo_name), group_df in input_df.groupby(['org', 'repo']):
    try:
        print(f"Processing {org}/{repo_name}...")
        repo = g.get_repo(f"{org}/{repo_name}")
        
        # Get list of developers we're interested in for this repo
        developers = group_df['username'].tolist()
        
        # Get all issues (excluding PRs)
        issues = repo.get_issues(state="all")
        print(f"Found {issues.totalCount} issues to process")
        
        for issue in tqdm(issues, desc="Processing issues", unit="issue"):
            if hasattr(issue, 'pull_request') and issue.pull_request:
                continue  # Skip PRs
                
            # Filter by developer creation
            if issue.user and issue.user.login in developers:
                all_interactions.append({
                    "org": org,
                    "repo": repo_name,
                    "issues_id": issue.number,
                    "id": issue.id,
                    "issues_title": issue.title,
                    "body": issue.body,
                    "created_at": issue.created_at,
                    "created_by": issue.user.login,
                    "labels": [label.name for label in issue.labels],
                    "interaction_type": "issue_creation",
                    "recipient_user": None,
                })

                # Process comments on this issue
                try:
                    comments = issue.get_comments()
                    for comment in comments:
                        if comment.user and comment.user.login in developers:
                            mentions = extract_mentions(comment.body)
                            all_interactions.append({
                                "org": org,
                                "repo": repo_name,
                                "issues_id": issue.number,
                                "id": comment.id,
                                "issues_title": issue.title,
                                "body": comment.body,
                                "created_at": comment.created_at,
                                "created_by": comment.user.login,
                                "labels": [label.name for label in issue.labels],
                                "interaction_type": "issue_comment",
                                "recipient_user": mentions[0] if mentions else None,
                            })
                except Exception as e:
                    print(f"Error processing comments for issue #{issue.number}: {e}")

                # Capture any events linked to the issue
                try:
                    events = issue.get_events()
                    for event in events:
                        if event.actor and event.actor.login in developers:
                            all_interactions.append({
                                "org": org,
                                "repo": repo_name,
                                "issues_id": issue.number,
                                "id": event.id,
                                "issues_title": issue.title,
                                "body": event.commit_id if hasattr(event, 'commit_id') else None,
                                "created_at": event.created_at,
                                "created_by": event.actor.login,
                                "labels": [label.name for label in issue.labels],
                                "interaction_type": event.event,
                                "recipient_user": None,
                            })
                except Exception as e:
                    print(f"Error processing events for issue #{issue.number}: {e}")
            
            # Check comments and events even if issue wasn't created by a target developer
            else:
                # Process comments for developers' interactions on others' issues
                try:
                    comments = issue.get_comments()
                    for comment in comments:
                        if comment.user and comment.user.login in developers:
                            mentions = extract_mentions(comment.body)
                            all_interactions.append({
                                "org": org,
                                "repo": repo_name,
                                "issues_id": issue.number,
                                "id": comment.id,
                                "issues_title": issue.title,
                                "body": comment.body,
                                "created_at": comment.created_at,
                                "created_by": comment.user.login,
                                "labels": [label.name for label in issue.labels],
                                "interaction_type": "issue_comment",
                                "recipient_user": mentions[0] if mentions else None,
                            })
                except Exception as e:
                    print(f"Error processing comments for issue #{issue.number}: {e}")
                
                # Check events for developers
                try:
                    events = issue.get_events()
                    for event in events:
                        if event.actor and event.actor.login in developers:
                            all_interactions.append({
                                "org": org,
                                "repo": repo_name,
                                "issues_id": issue.number,
                                "id": event.id,
                                "issues_title": issue.title,
                                "body": event.commit_id if hasattr(event, 'commit_id') else None,
                                "created_at": event.created_at,
                                "created_by": event.actor.login,
                                "labels": [label.name for label in issue.labels],
                                "interaction_type": event.event,
                                "recipient_user": None,
                            })
                except Exception as e:
                    print(f"Error processing events for issue #{issue.number}: {e}")
        
        # Rate limit handling
        rate_limit = g.get_rate_limit()
        if rate_limit.core.remaining < 100:
            reset_timestamp = rate_limit.core.reset.timestamp()
            sleep_time = reset_timestamp - time.time() + 60  # Add 60 seconds buffer
            print(f"Rate limit almost reached. Sleeping for {sleep_time/60:.1f} minutes")
            time.sleep(max(1, sleep_time))
        else:
            time.sleep(2)  # Small delay between repos
            
    except Exception as e:
        print(f"Error processing repository {org}/{repo_name}: {e}")

# Convert to DataFrame and save results
interactions_df = pd.DataFrame(all_interactions)
interactions_df.to_csv("developer_interactions.csv", index=False)
print(f"Analysis complete! Found {len(interactions_df)} interactions across all repositories.")
print(f"Results saved to developer_interactions.csv")

Processing Homebrew/brew...
Analyzing interactions for users: MikeMcQuaid, reitermarkus, jacknagel, BrewTestBot, Bo98, dependabot[bot], adamv, carlocab, Rylan12, mxcl, issyl0, dduugg, p-linnane, nandahkrishna, EricFromCanada, samford, apainintheneck, ZhongRuoyu, woodruffw, SMillerDev, iMichka, cho-m, bevanjkay, alebcay, tyuwags, abitrolly, gromgit, colindean, Moisan, branchvincent, botantony, Kentzo, mislav, gibfahn, rrotter, osalbahr, infogrind, khipp, toobuntu, koddsson, zyoshoka, BingoKingo, heaths
Found 18810 issues to process


Processing issues: 158issue [00:13, 12.02issue/s]


KeyboardInterrupt: 

In [26]:
# Create DataFrame from interactions list
interactions_df = pd.DataFrame(interactions_list)

# Convert lists to strings for proper storage
interactions_df['labels'] = interactions_df['labels'].apply(lambda x: ', '.join(x) if x else '')
interactions_df['body'] = interactions_df['body'].apply(lambda x: re.sub(r'[^\x20-\x7E]', '', str(x)) if pd.notnull(x) else x)

# Save to Excel
interactions_df.to_excel(f"Files/{reponame_noperiod}_issue_interactions_Small.xlsx", index=False)

#find the uniue users in this df
unique_users = pd.Series(interactions_df['created_by'].unique()).to_frame(name='unique_users')
unique_users

Unnamed: 0,unique_users
0,hadley
1,gaborcsardi
2,jennybc
3,salim-b
4,krlmlr


## Pulls

In [12]:
try:
    if wait_if_needed(g, threshold=15):
        pulls = repo.get_pulls(state='all')
        interactions_list_pulls = []

        for pr in tqdm(pulls, desc="Processing PRs", unit="PR", total=pulls.totalCount):
            try:
                if not wait_if_needed(g):
                    logging.error(f"Rate limit wait failed for PR #{pr.number}, skipping")
                    continue
                
                # PR creation
                interactions_list_pulls.append({
                    "pulls_id": pr.number,
                    "id": pr.id,
                    "pulls_title": pr.title,
                    "body": pr.body,
                    "created_at": pr.created_at,
                    "created_by": pr.user.login,
                    "labels": [label.name for label in pr.labels],
                    "interaction_type": "pr_creation",
                    "recipient_user": None,
                })
                
                # PR commits
                try:
                    commits = pr.get_commits()
                    for commit in commits:
                        if not wait_if_needed(g):
                            logging.error(f"Rate limit wait failed for commit in PR #{pr.number}, skipping remaining commits")
                            break
                            
                        # Get the correct author identifier
                        try:
                            if commit.author:
                                author_login = commit.author.login
                            else:
                                author_login = commit.commit.author.name
                        except Exception as e:
                            logging.warning(f"Could not get author login for commit {commit.sha}: {e}")
                            author_login = commit.commit.author.name
                            
                        interactions_list_pulls.append({
                            "pulls_id": pr.number,
                            "id": commit.sha,
                            "pulls_title": pr.title,
                            "body": commit.commit.message,
                            "created_at": commit.commit.author.date,
                            "created_by": author_login,  # Using the corrected author login
                            "labels": [label.name for label in pr.labels],
                            "interaction_type": "pr_commit",
                            "recipient_user": None,
                        })
                except Exception as e:
                    logging.error(f"Error processing commits for PR #{pr.number}: {e}")

                
                # PR comments
                for comment in pr.get_comments():
                    wait_if_needed(g)
                    mentions = extract_mentions(comment.body)
                    interactions_list_pulls.append({
                        "pulls_id": pr.number,
                        "id": comment.id,
                        "pulls_title": pr.title,
                        "body": comment.body,
                        "created_at": comment.created_at,
                        "created_by": comment.user.login,
                        "labels": [label.name for label in pr.labels],
                        "interaction_type": "pr_comment",
                        "recipient_user": mentions[0] if mentions else None,
                    })

                # PR reviews
                for review in pr.get_reviews():
                    wait_if_needed(g)
                    interactions_list_pulls.append({
                        "pulls_id": pr.number,
                        "id": review.id,
                        "pulls_title": pr.title,
                        "body": review.body,
                        "created_at": review.submitted_at,
                        "created_by": review.user.login,
                        "labels": [label.name for label in pr.labels],
                        "interaction_type": f"pr_review_{review.state.lower()}",  # Review state (approved, changes_requested, etc.)
                        "recipient_user": None,
                    })

                for review in pr.get_reviews():
                    wait_if_needed(g)
                    mentions = extract_mentions(review.body)
                    if mentions:
                        for mention in mentions:
                            interactions_list_pulls.append({
                                "pulls_id": pr.number,
                                "id": review.id,
                                "pulls_title": pr.title,
                                "body": review.body,
                                "created_at": review.submitted_at,
                                "created_by": review.user.login,
                                "labels": [label.name for label in pr.labels],
                                "interaction_type": f"pr_review_{review.state.lower()}",  # Review state (approved, changes_requested, etc.)
                                "recipient_user": mention,
                            })
                    else:
                        interactions_list_pulls.append({
                            "pulls_id": pr.number,
                            "id": review.id,
                            "pulls_title": pr.title,
                            "body": review.body,
                            "created_at": review.submitted_at,
                            "created_by": review.user.login,
                            "labels": [label.name for label in pr.labels],
                            "interaction_type": f"pr_review_{review.state.lower()}",
                            "recipient_user": None,
                        })
            except Exception as e:
                logging.error(f"Error processing PR #{pr.number}: {e}")
                # Maybe wait a bit before continuing
                time.sleep(10)
    else:
        logging.critical("Initial rate limit check failed, cannot proceed")
except Exception as e:
    logging.critical(f"Fatal error in PR processing: {e}")

Processing PRs: 100%|██████████| 227/227 [06:55<00:00,  1.83s/PR]


In [14]:
# Create DataFrame from interactions list
interactions_df = pd.DataFrame(interactions_list_pulls)

# Convert lists to strings for proper storage
interactions_df['labels'] = interactions_df['labels'].apply(lambda x: ', '.join(x) if x else '')
interactions_df['body'] = interactions_df['body'].apply(lambda x: re.sub(r'[^\x20-\x7E]', '', str(x)) if pd.notnull(x) else x)

# Save to Excel
interactions_df.to_excel(f"{reponame_noperiod}_interactions_pulls.xlsx", index=False)
interactions_df

Unnamed: 0,pulls_id,id,pulls_title,body,created_at,created_by,labels,interaction_type,recipient_user
0,744,2238515808,Configurable file hyperlinks,This PR makes it possible for Positron (or any...,2024-12-17 00:20:45,jennybc,,pr_creation,
1,744,6bb0b21cda4b43f2eb70506e52d2648b0b20d894,Configurable file hyperlinks,Make file hyperlinks configurable,2024-12-13 22:27:51,jennybc,,pr_commit,
2,744,55c2eb3dc6adc8f0ecba4b97d18e8c2a2d540546,Configurable file hyperlinks,Increment version number to 3.6.3.9002,2024-12-16 23:40:32,jennybc,,pr_commit,
3,744,4e31755a84f4a404c5b6a971a8f4e0c771913847,Configurable file hyperlinks,Test helpers that parse and interpolate file l...,2024-12-17 22:10:26,jennybc,,pr_commit,
4,744,f4e7c06b41f0cbbd9cef26c7e34dfccdc2fb2fef,Configurable file hyperlinks,Add docs,2024-12-17 22:55:15,jennybc,,pr_commit,
...,...,...,...,...,...,...,...,...,...
1347,11,40e04868d28e1cccb38a9fd62b7a1e1695be96e0,Respect RStudio console width,Respect RStudio console widthFixes #9,2017-10-20 17:24:01,hadley,,pr_commit,
1348,11,70916231,Respect RStudio console width,,2017-10-20 17:42:03,gaborcsardi,,pr_review_approved,
1349,11,70916231,Respect RStudio console width,,2017-10-20 17:42:03,gaborcsardi,,pr_review_approved,
1350,7,124712329,"README, closes #4",,2017-06-08 18:14:29,gaborcsardi,,pr_creation,


### Survival Anyasis data Transforamtion