In [1]:
from graphqlclient import GraphQLClient
import json
import csv
from datetime import datetime
import time
import requests
from dotenv import DotEnv
import re
import logging

In [2]:
environment = DotEnv()
gh_api_token = environment.get('GITHUB_TOKEN')
client = GraphQLClient('https://api.github.com/graphql')
client.inject_token(f'token {gh_api_token}')

In [3]:
def get_gh_query(repo_owner, repo_name, gh_cursor=None):
    str_cursor = 'null'
    if gh_cursor:
        str_cursor = f'"{gh_cursor}"'

    query = '''
        query {
            repository(owner: "%s", name: "%s") {
                owner{
                    ... on Organization {
                        name
                        url
                        email
                        login
                    }
                    ... on User {
                        name
                        url
                        email
                        login
                    }
                }
                issues(first: 100, labels: ["help wanted", "code.gov"], after: %s) {
                    nodes {
                        title
                        bodyText
                        url
                        state
                        createdAt
                        lastEditedAt
                        publishedAt
                        updatedAt
                        labels(first:20) {
                            nodes {
                                name
                            }
                        }
                        locked
                        participants {
                            totalCount
                        }
                    }
                    pageInfo {
                        hasNextPage
                        endCursor
                    }
                }
            }
            rateLimit {
                limit
                cost
                remaining
                resetAt
            }
        }
    '''
    return query % (repo_owner, repo_name, str_cursor)

In [4]:
def get_gh_issues(repo_owner, repo_name, gh_cursor=None):

    query = get_gh_query(repo_owner, repo_name, gh_cursor)

    rate_limit = get_rate_limit()
    handle_rate_limit(rate_limit)

    response = client.execute(query)
    json_response = json.loads(response)

    if json_response is None:
        error_msg = f'[ERROR] while getting issues for {repo_owner}/{repo_name}. Error: Not Found'
        logging.error(error_msg)
        raise Exception(error_msg)

    if 'errors' in json_response:
        errors = json_response['errors']
        error_msg = f'[ERROR] while getting issues for {repo_owner}/{repo_name}. Errors: {errors}'
        logging.error(error_msg)
        raise Exception(error_msg)
    
    issues = json_response['data']['repository']['issues']
    repository_owner_data = json_response['data']['repository']['owner']
    return_issues = []

    if issues['nodes']:
        for issue in issues['nodes']:
            return_issues.append({
                'repo_name': repo_name,
                'repo_owner_name': repository_owner_data['name'],
                'repo_owner_email': repository_owner_data['email'],
                'repo_owner_user_name': repository_owner_data['login'],
                'repo_owner_profile_url': repository_owner_data['url'],
                'title': issue['title'],
                'bodyHTML': issue['bodyHTML'],
                'url': issue['url'],
                'state': issue['state'],
                'createdAt': issue['createdAt'],
                'lastEditedAt': issue['lastEditedAt'],
                'publishedAt': issue['publishedAt'],
                'updatedAt': issue['updatedAt'],
                'labels': [node['name'] for node in issue['labels']['nodes']],
                'is_locked': issue['locked'],
                'total_participants': issue['participants']['totalCount'],
            })

        hasNext = issues['pageInfo']['hasNextPage']

        if hasNext:
            cursor = issues['pageInfo']['endCursor']
            
            get_gh_issues(gh_api_token, repo_owner, repo_name, cursor)
            
            return return_issues.extend()
        else:
            return return_issues
    else:
        logging.debug(f'No issues found for {repo_owner}/{repo_name}')
        return []

In [5]:
def get_repo_owner_and_name(gh_url):
    if re.match(r'(https|http)://github.com', gh_url):
        url_split = gh_url.split('/')
        
        # Naively Verify that the url is complete and correct.
        # Split should have 5 items if the github url includes the owner and repo name
        if len(url_split) > 4:
            repo_name = url_split[-1]
            owner = url_split[-2]
            # Removes the .git suffix if it is present
            if '.git' in repo_name:
                repo_name = repo_name[:-4]

            return owner, repo_name
        else:
            owner = url_split[-1]
            return owner, None

    if re.match(r'git@github.com', gh_url):
        url_split = gh_url.split(':')

        owner_repo = url_split[-1].split('/')

        if len(owner_repo) > 1:
            owner = owner_repo[0]
            repo_name = owner_repo[1]

            # Removes the .git suffix if it is present
            if '.git' in repo_name:
                repo_name = repo_name[:-4]

            return owner, repo_name
        else:
            return owner_repo[0], None

    logging.info(f'URL: {gh_url} is not a valida Github URL')
    return None, None

In [6]:
def get_repos_from_code_gov(api_token):
    logging.info('Getting repos from Code.gov')
    headers = {
        'X-API-KEY': api_token,
        'Content-Type': 'application/json',
    }
    params = {
        'permissions.usageType': 'opensource',
        'size': 5000
    }
    response = requests.get('https://api.code.gov/repos', headers=headers, params=params)
    json_response = response.json()

    return json_response['repos']

In [7]:
def create_csv(file_name, data, fields):
    logging.info('Creating issues csv file')
    with open(file_name, 'w') as issues_csv:
        
        writer = csv.DictWriter(issues_csv, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data)

In [18]:
def handle_rate_limit(rate_limit):
    remaining = rate_limit['remaining']
    limit = rate_limit['limit']
    percent_remaining = remaining / limit
    reset_at = rate_limit['resetAt']
    if percent_remaining < 0.15:
        reset_at = datetime.strptime(reset_at, '%Y-%m-%dT%H:%M:%SZ')
        current_time = datetime.now()
        time_diff = reset_at - current_time
        seconds = time_diff.total_seconds()
        time.sleep(seconds)
    else:
        time.sleep(1)

In [9]:
def get_rate_limit():
    query = '''query {
        rateLimit {
            limit
            remaining
            resetAt
        }
    }'''
    response = client.execute(query)
    json_response = json.loads(response)
    return json_response['data']['rateLimit']

In [10]:
def create_github_issues_csv(data):
    fields = [
        "agency",
        'repo_name',
        'repo_owner_name',
        'repo_owner_email',
        'repo_owner_user_name',
        'repo_owner_profile_url',
        'title',
        'bodyHTML',
        'url',
        'state',
        'createdAt',
        'lastEditedAt',
        'publishedAt',
        'updatedAt',
        'labels',
        'is_locked',
        'total_participants',
    ]
    
    create_csv(f'github_issues-{datetime.now()}.csv', data, fields)

In [11]:
def create_repos_with_errors_csv(data):
    fields = [
        'repo_name',
        'repo_url',
        'errors',
    ]
    
    create_csv(f'repos_with_error-{datetime.now()}.csv', data, fields)

In [12]:
def add_agency_to_data(issues, agency):
    modified_issues = []
    for issue in issues:
        issue['agency'] = agency
        modified_issues.append(issue)
    return modified_issues

In [19]:
start_time = datetime.now()
logging.info(f'Execution started: {start_time}')

repos = get_repos_from_code_gov(environment.get('CODE_GOV_API_TOKEN'))

github_repos = [repo for repo in repos if 'github.com' in repo['repositoryURL']]
# github_repos = list(filter(lambda repo: re.match(r'(https:\/\/||git@)github.com', repo['repositoryURL']), repos))

github_issues = []
repos_with_errors = []

logging.info('Getting Github Issues')
for repo in github_repos:
    repo_url = repo['repositoryURL']

    repo_owner, repo_name = get_repo_owner_and_name(repo_url)
    
    if repo_owner and repo_name:
        try:
            issues = get_gh_issues(repo_owner, repo_name)

            github_issues.extend(
                add_agency_to_data(issues, repo['agency']['acronym'])
            )
        except Exception as errors:
            repos_with_errors.append({
                'repo_name': repo['name'], 
                'repo_url': repo_url,
                'errors': errors
            })
    else:
        repos_with_errors.append({
            'repo_name': repo['name'], 
            'repo_url': repo_url,
            'errors': f'Owner: {repo_owner} or Repo name: {repo_name} as missing'
        })

finish_time = datetime.now()
delta = finish_time - start_time

logging.info(f'Execution finished: {finish_time}')
logging.info(f'Execution took {delta.seconds} seconds')

ERROR:root:[ERROR] while getting issues for USEPA/QA-SDMP-Project. Errors: [{'message': "Could not resolve to a Repository with the name 'QA-SDMP-Project'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for losalamos/NHPP-for-FRBs. Errors: [{'message': "Could not resolve to a Repository with the name 'NHPP-for-FRBs'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for ktoddbrown/soils-long-tail-recovery. Errors: [{'message': "Could not resolve to a Repository with the name 'soils-long-tail-recovery'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for USEPA/LCI-Primer. Errors: [{'message': "Could not resolve to a Repository with the name 'LCI-Primer'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERR

ERROR:root:[ERROR] while getting issues for losalamos/Draco. Errors: [{'message': "Could not resolve to a Repository with the name 'Draco'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for losalamos/GlobalSums. Errors: [{'message': "Could not resolve to a Repository with the name 'GlobalSums'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for ornl-sava/gopcap. Errors: [{'message': "Could not resolve to a Repository with the name 'gopcap'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for slaclab/lightpath. Errors: [{'message': "Could not resolve to a Repository with the name 'lightpath'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for sandialabs/BioComp

ERROR:root:[ERROR] while getting issues for slaclab/HXRSnD. Errors: [{'message': "Could not resolve to a Repository with the name 'HXRSnD'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for slaclab/pcds-devices. Errors: [{'message': "Could not resolve to a Repository with the name 'pcds-devices'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for master/uq4sim. Errors: [{'message': "Could not resolve to a Repository with the name 'uq4sim'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for NASA-DEVELOP/SAVDT. Errors: [{'message': "Could not resolve to a Repository with the name 'SAVDT'.", 'type': 'NOT_FOUND', 'path': ['repository'], 'locations': [{'line': 3, 'column': 13}]}]
ERROR:root:[ERROR] while getting issues for GSA/cfpb.githu. Errors

KeyboardInterrupt: 

In [14]:
create_github_issues_csv(github_issues)

In [15]:
create_repos_with_errors_csv(repos_with_errors)

In [16]:
print(github_issues)

[]


In [17]:
print(repos_with_errors)

[{'repo_name': 'fec-pattern-library', 'repo_url': 'https://github.com/fecgov/fec-pattern-library', 'errors': KeyError('bodyHTML')}, {'repo_name': 'NARA Scripts', 'repo_url': 'https://github.com/usnationalarchives/nara-scripts', 'errors': KeyError('bodyHTML')}, {'repo_name': 'Intelligent Transportation Systems Public Data Hub', 'repo_url': 'https://github.com/usdot-its-jpo-data-portal/microsite/', 'errors': 'Owner: microsite or Repo name:  as missing'}, {'repo_name': 'usepa-harmonization', 'repo_url': 'https://github.com/USEPA', 'errors': 'Owner: USEPA or Repo name: None as missing'}, {'repo_name': 'QA-SDMP-Project', 'repo_url': 'https://github.com/USEPA/QA-SDMP-Project.git', 'errors': Exception('[ERROR] while getting issues for USEPA/QA-SDMP-Project. Errors: [{\'message\': "Could not resolve to a Repository with the name \'QA-SDMP-Project\'.", \'type\': \'NOT_FOUND\', \'path\': [\'repository\'], \'locations\': [{\'line\': 3, \'column\': 13}]}]')}, {'repo_name': 'NHPP for FRBs, Version 