In [1]:
import csv
import json
import os
import time
from datetime import datetime
from datetime import timezone

from graphqlclient import GraphQLClient

In [2]:
# Build the GitHub GraphQL client.
#
# The access token is read from the GITHUB_TOKEN environment variable instead
# of being embedded in the notebook: anything committed here is visible to
# every reader of the file. The token that used to be hard-coded on this line
# must be treated as compromised and revoked in the GitHub settings.
github_token = os.environ.get('GITHUB_TOKEN')
if not github_token:
    raise RuntimeError(
        'Set the GITHUB_TOKEN environment variable to a GitHub access token '
        'before running this notebook.'
    )

client = GraphQLClient('https://api.github.com/graphql')
client.inject_token(github_token)

In [3]:
# GraphQL query template for the FIRST page of an owner's repositories.
#
# Has a single `%s` placeholder: the owner login (user or organization).
# For each of up to 100 repositories it requests:
#   - owner name / url / email (via User and Organization fragments),
#   - nameWithOwner, createdAt, forkCount, isFork,
#   - watcher, stargazer and pull-request totals,
#   - up to 100 issues labelled "help wanted" or "good first issue",
#   - up to 20 languages.
# `pageInfo` supplies hasNextPage / endCursor for pagination and `rateLimit`
# reports the remaining API budget for the token.
# NOTE(review): the login is spliced into the query with `%`-formatting, so it
# must not contain a double quote; consider GraphQL variables if owner names
# ever come from untrusted input.
query_initial = '''query {
    repositoryOwner(login: "%s") {
      repositories(first: 100) {
        edges {
          node {
            owner {
              ... on User {
                name
                url
                email
              }
              ... on Organization {
                name
                url
                email
              }
            }
            nameWithOwner
            createdAt
            forkCount
            isFork
            watchers {
              totalCount
            }
            stargazers {
              totalCount
            }
            pullRequests {
              totalCount
            }
            issues(labels: ["help wanted", "good first issue"], first: 100) {
              nodes {
                locked
                publishedAt
                title
                activeLockReason
                lastEditedAt
                url
                body
                bodyHTML
                bodyText
                participants {
                  totalCount
                }
                comments {
                  totalCount
                }
              }
            }
            languages(first: 20) {
              nodes {
                name
              }
            }
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    rateLimit {
      limit
      cost
      remaining
      resetAt
    }
  }'''

In [4]:
# GraphQL query template for every page AFTER the first.
#
# Identical to `query_initial` except that `repositories` also takes an
# `after:` cursor. It therefore has two `%s` placeholders, filled in order
# with (owner login, endCursor of the previous page).
# NOTE(review): the chained assignment also binds the module-level name
# `query` to this template; the fetch loop reassigns `query` before every
# request, so the alias looks like a leftover rather than something relied on.
query_after = query = '''query {
    repositoryOwner(login: "%s") {
      repositories(first: 100, after: "%s") {
        edges {
          node {
            owner {
              ... on User {
                name
                url
                email
              }
              ... on Organization {
                name
                url
                email
              }
            }
            nameWithOwner
            createdAt
            forkCount
            isFork
            watchers {
              totalCount
            }
            stargazers {
              totalCount
            }
            pullRequests {
              totalCount
            }
            issues(labels: ["help wanted", "good first issue"], first: 100) {
              nodes {
                locked
                publishedAt
                title
                activeLockReason
                lastEditedAt
                url
                body
                bodyHTML
                bodyText
                participants {
                  totalCount
                }
                comments {
                  totalCount
                }
              }
            }
            languages(first: 20) {
              nodes {
                name
              }
            }
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    rateLimit {
      limit
      cost
      remaining
      resetAt
    }
  }'''

In [13]:
# Below this fraction of the hourly rate-limit budget, pause until the budget
# resets instead of continuing to spend it.
LOW_RATE_LIMIT_FRACTION = 0.15
# Courtesy pause between consecutive requests, in seconds.
REQUEST_PAUSE_SECONDS = 2


def _repo_row(node):
    """Flatten one repository node of the GraphQL response into a flat dict.

    The keys match the `fieldnames` used when the rows are written to CSV;
    `issues` keeps the nested structure returned by the API.
    """
    owner = node['owner']
    return {
        'owner_name': owner['name'],
        'owner_email': owner['email'],
        'owner_url': owner['url'],
        'name_with_owner': node['nameWithOwner'],
        'created_at': node['createdAt'],
        'fork_count': node['forkCount'],
        'is_fork': node['isFork'],
        'watchers': node['watchers']['totalCount'],
        'stargazers': node['stargazers']['totalCount'],
        'pull_request': node['pullRequests']['totalCount'],
        'issues': node['issues'],
    }


def _seconds_until_reset(reset_at):
    """Return how many seconds remain until `reset_at`, never less than zero.

    `reset_at` is the `rateLimit.resetAt` string in the form
    'YYYY-MM-DDTHH:MM:SSZ'. The trailing 'Z' means UTC, so the comparison is
    made against the current UTC time rather than the local clock. The result
    is clamped at zero because `time.sleep` rejects negative values.
    """
    reset_time = datetime.strptime(reset_at, '%Y-%m-%dT%H:%M:%SZ')
    reset_time = reset_time.replace(tzinfo=timezone.utc)
    remaining = (reset_time - datetime.now(timezone.utc)).total_seconds()
    return max(remaining, 0)


# Accumulates one flat dict per repository across all owners below.
total_repos = []
# Owners (users or organizations) to harvest; toggle entries by (un)commenting.
owner_list = [
#     'cfpb',
#     'usdoj',
#     'USDepartmentofLabor',
#     'USFRA',
#     'RTICWDT',
#     'usedgov',
#     'GSA',
    '18F',
#     'presidential-innovation-fellows',
#     'HHS',
#     'usnationalarchives',
#     'NASA-Tensegrity-Robotics-Toolkit',
#     'cboshuizen',
#     'KeplerGO',
#     'nasa',
#     'pkolano',
#     'NASARace',
#     'geocam',
#     'JustinSGray',
#     'nasa-gibs',
#     'vightel',
#     'wkiri',
#     'dataplumber',
#     'Open-MBEE',
#     'JPL-IMCE',
#     'kleb',
#     'ceos-seo',
#     'NASA-DEVELOP',
#     'NASAWorldWind',
#     'NASAWorldWindResearch',
#     'nsf-open',
#     'ssagov',
#     'fedspendingtransparency',
#     'NAVADMC',
#     'USDA',
#     'USDA-ARS-AGIL',
#     'USDA-ERS',
#     'USDA-FSA',
#     'USDA-VS',
#     'FDA',
#     'iadgov',
#     'dhs-ncats',
#     'USAID-MCIO',
]
for owner in owner_list:
    has_next = True
    cursor = None
    while has_next:
        # The first page has no cursor; later pages resume after the last one.
        if cursor:
            query = query_after % (owner, cursor)
        else:
            query = query_initial % owner

        json_response = json.loads(client.execute(query))

        # Error-only responses may omit `data`, and an unknown login yields a
        # null `repositoryOwner`. Report it and move on to the next owner
        # rather than crashing or silently dropping the information.
        data = json_response.get('data')
        if not data or not data.get('repositoryOwner'):
            print('No repository data for %s; errors: %s'
                  % (owner, json_response.get('errors')))
            break

        repositories = data['repositoryOwner']['repositories']
        total_repos.extend(_repo_row(edge['node']) for edge in repositories['edges'])

        page_info = repositories['pageInfo']
        has_next = page_info['hasNextPage']
        cursor = page_info['endCursor']

        # Throttle: wait for the budget to reset when it is nearly exhausted,
        # otherwise just pause briefly between requests.
        rate_limit = data['rateLimit']
        if rate_limit['remaining'] / rate_limit['limit'] < LOW_RATE_LIMIT_FRACTION:
            time.sleep(_seconds_until_reset(rate_limit['resetAt']))
        else:
            time.sleep(REQUEST_PAUSE_SECONDS)

In [14]:
# Sanity check: number of repository records collected by the fetch loop.
print(len(total_repos))

1033


In [15]:
# Column order for repos.csv; these are exactly the keys of each dict in
# `total_repos`.
fieldnames = [
    'owner_name',
    'owner_email',
    'owner_url',
    'name_with_owner',
    'created_at',
    'fork_count',
    'is_fork',
    'watchers',
    'stargazers',
    'pull_request',
    'issues',
]

# `issues` is a nested dict. Serialize it as JSON so the cell can be parsed
# back reliably, instead of letting csv write the Python repr of the dict.
# Copies are made so `total_repos` itself is left untouched.
csv_rows = [dict(repo, issues=json.dumps(repo['issues'])) for repo in total_repos]

# Append so several runs (e.g. one per owner) accumulate in the same file.
# newline='' is what the csv module requires to avoid blank rows on Windows,
# and an explicit encoding keeps non-ASCII issue text portable.
with open('repos.csv', 'a', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    # Only a new or empty file gets the header; re-running the cell must not
    # insert another header row in the middle of the data.
    if csvfile.tell() == 0:
        csv_writer.writeheader()
    csv_writer.writerows(csv_rows)

In [16]:
# Display the first collected record to check the row structure by eye.
total_repos[0]

{'owner_name': '18F',
 'owner_email': '18f@gsa.gov',
 'owner_url': 'https://github.com/18F',
 'name_with_owner': '18F/api.data.gov',
 'created_at': '2013-07-17T05:21:11Z',
 'fork_count': 21,
 'is_fork': False,
 'watchers': 138,
 'stargazers': 45,
 'pull_request': 59,
 'issues': {'nodes': []}}