In [1]:
from github import Github
from dotenv import load_dotenv
import os
import json
# Run python-dotenv's loader before reading the token from the environment
# (presumably this picks up a local .env file — confirm project setup).
load_dotenv()

# Read the access token from the environment rather than hardcoding it.
# NOTE(review): os.getenv returns None when the variable is unset, in which
# case the client below is constructed without a token — confirm whether a
# missing token should fail fast here instead.
GITHUB_ACCESS_TOKEN=os.getenv("GITHUB_ACCESS_TOKEN")
# Module-level client; get_github_data() reads this global `g`.
g = Github(GITHUB_ACCESS_TOKEN)


In [2]:
def get_repo_data(repo):
    """Collect a repository's metadata, issues, and per-branch commit history.

    Args:
        repo: Object exposing ``name``, ``description``, ``html_url``,
            ``get_issues(state=...)``, ``get_branches()`` and
            ``get_commits(sha=...)`` (presumably a PyGithub ``Repository`` —
            confirm against the caller).

    Returns:
        dict: Keys ``name``, ``description``, ``url``, ``branches`` and
        ``issues``. ``issues`` is a list of dicts (title, number, state,
        created_at, updated_at, body, url); ``branches`` is a list of
        ``{'name', 'commits'}`` dicts where each commit is a dict (sha,
        message, date, author, url). Timestamps are ISO-8601 strings; the
        issue body is stored exactly as provided (it is not normalised here).
    """

    def _issue_record(item):
        # One serialisable record per issue.
        return {
            'title': item.title,
            'number': item.number,
            'state': item.state,
            'created_at': item.created_at.isoformat(),
            'updated_at': item.updated_at.isoformat(),
            'body': item.body,
            'url': item.html_url,
        }

    def _commit_record(entry):
        # Author name and date are read from the nested commit's author object.
        return {
            'sha': entry.sha,
            'message': entry.commit.message,
            'date': entry.commit.author.date.isoformat(),
            'author': entry.commit.author.name,
            'url': entry.html_url,
        }

    # Basic repository info is read first, before any listing calls.
    name = repo.name
    description = repo.description
    url = repo.html_url

    # Every issue regardless of state (state='all').
    issue_records = [_issue_record(item) for item in repo.get_issues(state='all')]

    # For each branch, walk the commits returned when starting from the
    # branch's head commit SHA. No limit is applied to the number of commits.
    branch_records = [
        {
            'name': branch.name,
            'commits': [
                _commit_record(entry)
                for entry in repo.get_commits(sha=branch.commit.sha)
            ],
        }
        for branch in repo.get_branches()
    ]

    return {
        'name': name,
        'description': description,
        'url': url,
        'branches': branch_records,
        'issues': issue_records,
    }


# Entry point: gather data for all repositories
def get_github_data():
    """Build the corpus for every repository reachable through the client ``g``.

    Iterates ``g.get_user().get_repos()`` (``g`` is the module-level client)
    and converts each repository with ``get_repo_data``.

    Returns:
        list: One ``get_repo_data`` result per repository, in iteration order.
    """
    return [get_repo_data(repository) for repository in g.get_user().get_repos()]


In [3]:
# Build the full corpus: one entry per repository returned by get_github_data().
# NOTE(review): this walks every issue and every commit of every branch, so a
# re-run repeats all of those requests — consider loading the saved JSON
# instead when it already exists.
github_corpus = get_github_data()

# Save the corpus to a JSON file (UTF-8 file, 4-space indented) in the
# current working directory; an existing file of that name is overwritten.
with open('github_corpus.json', 'w',encoding="utf-8") as f:
    json.dump(github_corpus, f, indent=4)

In [4]:
def flatten_repo_data(all_repos_data):
    """Flatten repository data into newline-separated ``key:value`` records.

    Each issue and each commit becomes one line of comma-joined ``key:value``
    fields, prefixed with the owning repository's name and description.
    For every repository, issue lines are emitted first, followed by the
    commit lines of each branch in order.

    Args:
        all_repos_data: Iterable of dicts with keys ``name``, ``description``,
            ``issues`` (dicts with ``title``, ``number``, ``state``,
            ``created_at``, ``updated_at``, ``body``) and ``branches`` (dicts
            with ``name`` and ``commits``; each commit has ``message``,
            ``date``, ``author``).

    Returns:
        str: All record lines joined with ``"\\n"``; an empty string when
        there are no issues or commits.

    Notes:
        * A ``body``/``message`` of ``None`` is rendered as an empty value
          instead of raising ``AttributeError``.
        * Field values are not escaped, so a comma inside a value is
          indistinguishable from a field separator.
        * A ``None`` repository description is rendered as the text ``None``.
    """
    lines = []  # Collected record lines, joined once at the end

    for repo_data in all_repos_data:
        repo_name = f"repo_name:{repo_data['name']}"
        repo_description = f"repo_description:{repo_data['description']}"

        # Handle issues for the repository
        for issue in repo_data['issues']:
            # `body` may be None: get_repo_data stores issue.body unmodified,
            # and an issue can have no description. Treat that as empty text
            # so the newline clean-up cannot fail.
            issue_body_cleaned = (issue['body'] or '').replace('\n', ' ')
            issue_line = [
                repo_name,
                repo_description,
                f"issue_title:{issue['title']}",
                f"issue_number:{issue['number']}",
                f"issue_state:{issue['state']}",
                f"created_at:{issue['created_at']}",
                f"updated_at:{issue['updated_at']}",
                f"issue_body:{issue_body_cleaned}"
            ]
            lines.append(','.join(issue_line))

        # Handle commits for each branch in the repository
        for branch in repo_data['branches']:
            branch_name = f"branch_name:{branch['name']}"
            for commit in branch['commits']:
                # Same None guard as for issue bodies; keeps each record on one line.
                commit_message_cleaned = (commit['message'] or '').replace('\n', ' ')
                commit_line = [
                    repo_name,
                    repo_description,
                    branch_name,
                    f"commit_message:{commit_message_cleaned}",
                    f"commit_date:{commit['date']}",
                    f"commit_author:{commit['author']}"
                ]
                lines.append(','.join(commit_line))

    # Join all the lines into a single text string
    return "\n".join(lines)

# Flatten the corpus held in `github_corpus` into a single newline-separated text string
text = flatten_repo_data(github_corpus)
