In [1]:
import requests
from graphqlclient import GraphQLClient
import json
import csv
from datetime import datetime
import time
from dotenv import DotEnv
import os
from IPython.display import clear_output

In [2]:
# Read the GitHub API token through DotEnv
# (presumably backed by a local .env file -- confirm against the dotenv package in use).
environment = DotEnv()
github_token = environment.get('GITHUB_TOKEN')
# Shared GraphQL client pointed at GitHub's GraphQL endpoint; the query
# helpers in this notebook call `client.execute(...)` on it.
client = GraphQLClient('https://api.github.com/graphql')
# Hand the token to the client, prefixed with the literal "token ".
client.inject_token(f'token {github_token}')

In [3]:
# GitHub owner/organisation logins to harvest. The driver loop walks this
# list in order and each entry ends up in the `org:` search qualifier.
owner_list = [
    '18f',
    'cfpb',
    'cmsgov',
    'commercegov',
    'doecode',
    'fcc',
    'fda',
    'fecgov',
    'gsa',
    'nasa',
    'SSAgov',
    'usdepartmentoflabor',
    'USDeptVeteransAffairs',
    'usdoj',
    'usds',
    'whitehouse',
]

In [22]:
def get_gh_query(org, date_from, size=100, cursor=None):
    """
    Build the GraphQL search query text for one page of results.

    org:       owner login placed in the `org:` qualifier.
    date_from: lower bound placed in the `created:>` qualifier.
    size:      requested page size; values above 100 are clamped to 100.
    cursor:    pagination cursor for `after:`; a falsy value yields `null`.

    Returns the fully substituted query string.
    """
    page_size = min(size, 100)
    # The cursor must be sent as a quoted GraphQL string, or the bare
    # keyword null when there is no previous page.
    if cursor:
        after_arg = f'"{cursor}"'
    else:
        after_arg = 'null'

    template = '''
        query {
            search(first:%s, query:"org:%s created:>%s", type: ISSUE, after:%s){
                issueCount
                edges {
                    node {
                        __typename
                        ... on Issue {
                            title
                            createdAt
                            lastEditedAt
                            state
                            updatedAt
                            repository {
                                name
                                owner {
                                    login
                                }
                                forks {
                                    totalCount
                                }
                                stargazers {
                                    totalCount
                                }
                                watchers {
                                    totalCount
                                }
                                forkCount
                                nameWithOwner
                                createdAt
                                isPrivate
                            }
                        }
                        ... on PullRequest {
                            title
                            createdAt
                            lastEditedAt
                            state
                            updatedAt
                            repository {
                                name
                                owner {
                                    login
                                }
                                forks {
                                    totalCount
                                }
                                stargazers {
                                    totalCount
                                }
                                watchers {
                                    totalCount
                                }
                                forkCount
                                nameWithOwner
                                createdAt
                                isPrivate
                            }
                        }
                    }
                }
                pageInfo {
                    endCursor
                    hasNextPage
                }
            }
        }
    '''
    return template % (page_size, org, date_from, after_arg)

In [21]:
def get_github_data(owner, date_form, query_limit, cursor):
    """
    Fetch and parse one page of search results for the supplied owner/org.

    Parameters:
        owner:       owner login used in the `org:` search qualifier.
        date_form:   lower bound for the `created:>` qualifier. (The
                     parameter spelling is kept so existing callers still work.)
        query_limit: requested page size, forwarded to get_gh_query.
        cursor:      `endCursor` of the previous page, or None for the first page.

    Returns:
        (issues, repos, has_next, cursor) -- the parsed issue dicts, the
        parsed repo dicts (one per issue, so duplicates are expected), whether
        another page exists, and the cursor to request it with.

    Raises:
        RuntimeError: if the response carries no `data.search` payload.
    """
    # Use the argument that was actually passed in; the previous body read an
    # unrelated name and ignored this parameter.
    query = get_gh_query(owner, date_form, query_limit, cursor)
    results = json.loads(client.execute(query))

    search = (results.get('data') or {}).get('search')
    if search is None:
        # Surface the API's own error payload instead of an opaque
        # KeyError/TypeError further down.
        raise RuntimeError(
            f"GitHub search for {owner} returned no data: {results.get('errors')}"
        )

    issues = []
    repos = []
    for edge in search['edges'] or []:
        issue, repo = parse_data(edge['node'])
        repo_name = repo['full_name']
        print(f'Processed repo: {repo_name}')

        issues.append(issue)
        repos.append(repo)

    page_info = search['pageInfo']
    return issues, repos, page_info['hasNextPage'], page_info['endCursor']

In [20]:
def get_rate_limit():
    """
    Ask the API for the token's current rate-limit state.

    Returns the `rateLimit` object of the response: a dict with the
    `limit`, `remaining` and `resetAt` fields requested below.
    """
    rate_limit_query = '''query {
        rateLimit {
            limit
            remaining
            resetAt
        }
    }'''
    payload = json.loads(client.execute(rate_limit_query))
    return payload['data']['rateLimit']

In [7]:
def handle_rate_limit(rate_limit, threshold=0.15):
    """
    Sleep until the rate-limit window resets when the budget is nearly spent.

    Parameters:
        rate_limit: dict with `remaining`, `limit` and `resetAt`
                    (`resetAt` formatted as '%Y-%m-%dT%H:%M:%SZ').
        threshold:  fraction of the budget below which the call blocks
                    (default 0.15, the value previously hard-coded).

    Returns None. Returns immediately when enough budget remains or when the
    reset time has already passed.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    remaining = rate_limit['remaining']
    limit = rate_limit['limit']
    # A zero limit means there is no budget at all; treat it as exhausted
    # rather than dividing by zero.
    if limit and remaining / limit >= threshold:
        return

    # The trailing "Z" marks the timestamp as UTC, so compare it against the
    # current UTC time -- comparing with local time skews the wait by the
    # machine's UTC offset.
    reset_at = datetime.strptime(
        rate_limit['resetAt'], '%Y-%m-%dT%H:%M:%SZ'
    ).replace(tzinfo=timezone.utc)
    seconds = (reset_at - datetime.now(timezone.utc)).total_seconds()

    # time.sleep rejects negative values; a reset in the past needs no wait.
    if seconds > 0:
        time.sleep(seconds)

In [8]:
def parse_data(node):
    """
    Split one search-result node into an issue record and a repo record.

    Returns a tuple `(issue, repo)` of flat dicts whose keys match the CSV
    columns used elsewhere in this notebook.
    """
    repository = node['repository']
    repo_name = repository['name']
    owner_login = repository['owner']['login']

    repo = dict(
        name=repo_name,
        owner=owner_login,
        forks=repository['forks']['totalCount'],
        stargazers=repository['stargazers']['totalCount'],
        watchers=repository['watchers']['totalCount'],
        forkCount=repository['forkCount'],
        full_name=repository['nameWithOwner'],
        created_at=repository['createdAt'],
        isPrivate=repository['isPrivate'],
    )

    issue = dict(
        type=node['__typename'],
        owner=owner_login,
        repo_name=repo_name,
        title=node['title'],
        created_at=node['createdAt'],
        last_edit_date=node['lastEditedAt'],
        state=node['state'],
        updated_at=node['updatedAt'],
    )
    return issue, repo

In [16]:
def create_csv(file_name, data, fields):
    """
    Write an iterable of dicts to a CSV file with a header row.

    Parameters:
        file_name: path of the file to create (overwritten if it exists).
        data:      iterable of dicts, each keyed by the names in `fields`.
        fields:    column names, in output order.
    """
    print(f'Creating {file_name}')
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets an extra blank row after every record); utf-8 keeps non-ASCII
    # text, such as issue titles, writable regardless of the platform locale.
    with open(file_name, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data)

In [17]:
def write_issues_csv(owner, data):
    """
    Dump the issue records for one owner to a timestamped CSV file.

    The column order mirrors the keys of the issue dicts built by parse_data.
    """
    issue_columns = [
        'type',
        'owner',
        'repo_name',
        'title',
        'created_at',
        'last_edit_date',
        'state',
        'updated_at',
    ]
    target = f'github-{owner}-issues-data-{datetime.now()}.csv'
    create_csv(target, data, issue_columns)

In [18]:
def write_repos_csv(owner, data):
    """
    Dump the repository records for one owner to a timestamped CSV file.

    The column order mirrors the keys of the repo dicts built by parse_data.
    """
    repo_columns = [
        'name',
        'owner',
        'forks',
        'stargazers',
        'watchers',
        'forkCount',
        'full_name',
        'created_at',
        'isPrivate',
    ]
    target = f'github-{owner}-repos-data-{datetime.now()}.csv'
    create_csv(target, data, repo_columns)

In [19]:
def filter_repos(repos):
    """
    Collapse repeated repo records down to one per `full_name`.

    When a name occurs more than once the last record wins, while the
    position of its first occurrence is kept. Returns a dict values view.
    """
    unique_by_name = {entry['full_name']: entry for entry in repos}
    return unique_by_name.values()

In [13]:
# Only issues / pull requests created after this date are requested.
date_from = '2016-08-01'

processed_owners = []
for owner in owner_list:
    has_next = True
    cursor = None
    issues = []
    repos = []

    print(f'Fetching data for {owner}')

    # Page through the search results until the API reports no further page.
    # NOTE(review): GitHub search is documented to cap results per query
    # (1,000 at time of writing) -- confirm whether large orgs get truncated.
    while has_next:
        # Check the remaining API budget before every page request.
        rate_limit = get_rate_limit()
        handle_rate_limit(rate_limit)

        result_issues, result_repos, has_next, cursor = get_github_data(owner, date_from, 100, cursor)
        issues.extend(result_issues)
        repos.extend(result_repos)

    if repos:
        # Every issue carries a copy of its repository record, so collapse
        # them to one row per repository before writing.
        write_repos_csv(owner, filter_repos(repos))

    if issues:
        write_issues_csv(owner, issues)

    # Record the owner only once its data has actually been fetched and written.
    processed_owners.append(owner)
    clear_output()
    print(f'processed {processed_owners}')

processed ['18f', 'cfpb', 'cmsgov', 'commercegov', 'doecode', 'fcc', 'fda', 'fecgov', 'gsa', 'nasa', 'SSAgov', 'usdepartmentoflabor', 'USDeptVeteransAffairs', 'usdoj', 'usds', 'whitehouse']
