In [1]:
# This notebook is meant to be run with Python 3 and is a quick example of how to use the GitHub v4 (GraphQL) API
# Please fork this repo to add your own code.
# If you'd like to make a contribution, please take a look at our README.md and submit a Pull Request.
# For instructions on how to run this Jupyter Notebook please take a look at our README.md

import csv
import json
import os
import time
from datetime import datetime

from graphqlclient import GraphQLClient

In [6]:
client = GraphQLClient('https://api.github.com/graphql')

# You need a token to use the GitHub API in an efficient manner.
# Please take a look at https://blog.github.com/2013-05-16-personal-api-tokens/ to see how to generate your own.
# Export it as the GITHUB_TOKEN environment variable before starting Jupyter so the
# secret is never saved inside (or committed with) the notebook. If the variable is
# not set, the `your-token` placeholder below is used and must be replaced by hand.
github_token = os.environ.get('GITHUB_TOKEN', 'your-token')
client.inject_token('bearer ' + github_token)

In [7]:
# Our initial GraphQL query: fetches the first page (up to 100) of an owner's repositories.
# The single `%s` placeholder is the owner's login; it is filled in with the `%` operator
# by the pagination loop further down.
# For each repository it requests owner details, counters (forks, watchers, stargazers,
# pull requests), up to 100 issues labelled "help wanted" or "good first issue", and up to
# 20 languages. `pageInfo` drives pagination and `rateLimit` drives throttling.
# Any updates done here need to also be made to the query_after query in the next cell.
query_initial = '''query {
    repositoryOwner(login: "%s") {
      repositories(first: 100) {
        edges {
          node {
            owner {
              ... on User {
                name
                url
                email
              }
              ... on Organization {
                name
                url
                email
              }
            }
            nameWithOwner
            createdAt
            forkCount
            isFork
            watchers {
              totalCount
            }
            stargazers {
              totalCount
            }
            pullRequests {
              totalCount
            }
            issues(labels: ["help wanted", "good first issue"], first: 100) {
              nodes {
                locked
                publishedAt
                title
                activeLockReason
                lastEditedAt
                url
                body
                bodyHTML
                bodyText
                participants {
                  totalCount
                }
                comments {
                  totalCount
                }
              }
            }
            languages(first: 20) {
              nodes {
                name
              }
            }
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    rateLimit {
      limit
      cost
      remaining
      resetAt
    }
  }'''

In [8]:
# GraphQL query that uses a cursor: same selection as query_initial, but resumes the
# repository listing after a given cursor to fetch the following page.
# It takes two `%s` placeholders, in order: the owner's login and the `endCursor`
# value returned in the previous page's `pageInfo`.
# Keep the selected fields in sync with query_initial.
# NOTE(review): the extra `query` name bound here looks unintentional - the pagination
# loop reassigns `query` before reading it. Confirm nothing else relies on it, then drop it.
query_after = query = '''query {
    repositoryOwner(login: "%s") {
      repositories(first: 100, after: "%s") {
        edges {
          node {
            owner {
              ... on User {
                name
                url
                email
              }
              ... on Organization {
                name
                url
                email
              }
            }
            nameWithOwner
            createdAt
            forkCount
            isFork
            watchers {
              totalCount
            }
            stargazers {
              totalCount
            }
            pullRequests {
              totalCount
            }
            issues(labels: ["help wanted", "good first issue"], first: 100) {
              nodes {
                locked
                publishedAt
                title
                activeLockReason
                lastEditedAt
                url
                body
                bodyHTML
                bodyText
                participants {
                  totalCount
                }
                comments {
                  totalCount
                }
              }
            }
            languages(first: 20) {
              nodes {
                name
              }
            }
          }
        }
        pageInfo {
          hasNextPage
          endCursor
        }
      }
    }
    rateLimit {
      limit
      cost
      remaining
      resetAt
    }
  }'''

In [9]:
# Accumulator for the results: the pagination loop below appends one dict per repository.
# It is reset here, so re-running this cell starts the collection from scratch.
total_repos = []

# GitHub logins (users or organizations) whose repositories will be fetched.
# Each entry is substituted into the `login` argument of the GraphQL queries.
# This list is the U.S. Federal subset found here:
# https://github.com/github/government.github.com/blob/gh-pages/_data/governments.yml
owner_list = [
    '18f',
    'adl-aicc',
    'adlnet',
    'afrl',
    'arcticlcc',
    'BBGInnovate',
    'bfelob',
    'blue-button',
    'businessusa',
    'CA-CST-Library',
    'CA-CST-SII',
    'ccmc',
    'CDCgov',
    'cfpb',
    'cmsgov',
    'commercedataservice',
    'commercegov',
    'defense-cyber-crime-center',
    'demand-driven-open-data',
    'department-of-veterans-affairs',
    'deptofdefense',
    'dhs-ncats',
    'didsr',
    'digital-analytics-program',
    'doecode',
    'EEOC',
    'energyapps',
    'erdc-cm',
    'eregs',
    'fcc',
    'fda',
    'fecgov',
    'Federal-Aviation-Administration',
    'federaltradecommission',
    'fedspendingtransparency',
    'GIST-ORNL',
    'globegit',
    'gopleader',
    'government-services',
    'GreatSmokyMountainsNationalPark',
    'gsa',
    'gsa-oes',
    'hhs',
    'HHS-AHRQ',
    'HHSDigitalMediaAPIPlatform',
    'HHSIDEAlab',
    'historyatstate',
    'IIP-Design',
    'IMDProjects',
    'imls',
    'informaticslab',
    'Innovation-Toolkit',
    'internationaltradeadministration',
    'ioos',
    'IRSgov',
    'keplergo',
    'libraryofcongress',
    'M-O-S-E-S',
    'mcc-gov',
    'missioncommand',
    'nasa',
    'nasa-develop',
    'nasa-gibs',
    'NASA-rdt',
    'nasa-tournament-lab',
    'nasaworldwind',
    'NationalGuard',
    'nationalparkservice',
    'NCRN',
    'NeoGeographyToolkit',
    'nesii',
    'ngds',
    'nhanes',
    'nidcd',
    'NIEM',
    'NMB-Dev',
    'NMML',
    'noaa-gfdl',
    'NOAA-ORR-ERD',
    'ntia',
    'ombegov',
    'Open-Sat',
    'opengovplatform',
    'ozoneplatform',
    'peacecorps',
    'petsc',
    'presidential-innovation-fellows',
    'project-open-data',
    'radiofreeasia',
    'redhawksdr',
    'regulationsgov',
    'sbstusa',
    'SelectUSA',
    'SERVIR',
    'SSAgov',
    'state-hiu',
    'sunpy',
    'us-bea',
    'US-CBP',
    'usagov',
    'usaid',
    'usajobs',
    'usasearch',
    'usbr',
    'uscensusbureau',
    'uscis',
    'usda',
    'usda-ars-agil',
    'USDA-ERS',
    'USDA-FSA',
    'usda-vs',
    'usdepartmentoflabor',
    'USDeptVeteransAffairs',
    'usdoj',
    'usds',
    'usdot-jpo-ode',
    'useda',
    'usepa',
    'USFWS',
    'usg-scope',
    'USGCRP',
    'usgpo',
    'usgs',
    'USGS-Astrogeology',
    'USGS-CIDA',
    'usgs-cmg',
    'usgs-eros',
    'USGS-OWI',
    'USGS-R',
    'USGS-WiM',
    'usindianaffairs',
    'usinterior',
    'usnationalarchives',
    'USPS',
    'USPTO',
    'USSBA',
    'usstatedept',
    'ustaxcourt',
    'VHAINNOVATIONS',
    'virtual-world-framework',
    'visionworkbench',
    'WFMRDA',
    'whitehouse',
]

# Throttling knobs for the loop below.
RATE_LIMIT_FLOOR = 0.15     # wait for the quota reset once less than 15% of it remains
REQUEST_PAUSE_SECONDS = 2   # polite pause between consecutive requests

# Walk every owner's repositories page by page and append one flat record per
# repository to `total_repos`.
for owner in owner_list:
    hasNext = True
    cursor = None
    while hasNext:
        # The first page has no cursor; later pages resume after the previous endCursor.
        if cursor:
            query = query_after % (owner, cursor)
        else:
            query = query_initial % owner

        response = client.execute(query)
        json_response = json.loads(response)

        # A failed request carries no usable `data` (missing or null): report it and
        # move on to the next owner instead of crashing the whole run.
        data = json_response.get('data')
        if not data:
            print(f"Skipping {owner}: no data returned ({json_response.get('errors')})")
            break

        repository_owner = data.get('repositoryOwner')
        if repository_owner is None:
            # The login could not be resolved to a user or organization (e.g. renamed
            # or deleted account), so there is nothing to paginate for this owner.
            print(f'Skipping {owner}: owner not found')
            hasNext = False
        else:
            repositories = repository_owner['repositories']
            for item in repositories['edges']:
                node = item['node']
                # These fields are later used to create the CSV
                total_repos.append({
                    'owner_name': node['owner']['name'],
                    'owner_email': node['owner']['email'],
                    'owner_url': node['owner']['url'],
                    'name_with_owner': node['nameWithOwner'],
                    'created_at': node['createdAt'],
                    'fork_count': node['forkCount'],
                    'is_fork': node['isFork'],
                    'watchers': node['watchers']['totalCount'],
                    'stargazers': node['stargazers']['totalCount'],
                    'pull_request': node['pullRequests']['totalCount'],
                    'issues': node['issues'],
                })

            page_info = repositories['pageInfo']
            hasNext = page_info['hasNextPage']
            cursor = page_info['endCursor']

        # Throttle based on the quota reported alongside every response.
        rate_limit = data['rateLimit']
        remaining = rate_limit['remaining']
        limit = rate_limit['limit']
        percent_remaining = remaining / limit
        if percent_remaining < RATE_LIMIT_FLOOR:
            # `resetAt` is a UTC timestamp such as 2018-03-01T20:11:22Z. Parse it as a
            # timezone-aware value and compare it with the current time in the same
            # zone, so the local machine's timezone cannot skew the wait.
            reset_at = datetime.strptime(
                rate_limit['resetAt'].replace('Z', '+0000'), '%Y-%m-%dT%H:%M:%S%z'
            )
            seconds = (reset_at - datetime.now(reset_at.tzinfo)).total_seconds()
            # Never hand a negative duration to sleep (the reset may already have passed).
            time.sleep(max(seconds, 0))
        else:
            time.sleep(REQUEST_PAUSE_SECONDS)

In [10]:
# Report how many repositories were collected across every owner in the list.
collected_count = len(total_repos)
print(collected_count)

5870


In [11]:
# Column order for the CSV; these are the same keys used for each record in `total_repos`.
fieldnames = [
    'owner_name',
    'owner_email',
    'owner_url',
    'name_with_owner',
    'created_at',
    'fork_count',
    'is_fork',
    'watchers',
    'stargazers',
    'pull_request',
    'issues',
]
# Open in write mode so re-running the cell regenerates the file instead of appending
# a second header and a duplicate copy of every row.
# newline='' is what the csv module requires to avoid blank lines between rows on
# Windows; UTF-8 keeps non-ASCII names and issue text writable on every platform.
with open('repos.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(total_repos)

In [38]:
# Example of what we can do with this.
# Let's filter out only the repos that are from NASA
# Then we'll filter out the repos that have issues
# Then we'll print out the full name of the repositories

# Make a (shallow) copy of our data set so that adding or removing entries while
# experimenting does not touch `total_repos`. The repo dicts themselves are shared.
repos_to_play_with = list(total_repos)

# To filter our data set to only get the repos that are from NASA we can use a list comprehension.
# More info here: https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
nasa_repos = [repo for repo in repos_to_play_with if repo['owner_name'] == 'NASA']
print(f'Total NASA repos: {len(nasa_repos)}')

# Or for a more functional way (some say more "correct").
# For filters more info here: https://docs.python.org/3/library/functions.html#filter
# For lambda more info here: https://docs.python.org/3/tutorial/controlflow.html#lambda-expressions
nasa_repos2 = list(filter(lambda repo: repo['owner_name'] == 'NASA', repos_to_play_with))
# Report the size of the filter() result itself, so the two approaches are really compared.
print(f'Total NASA repos using filter function: {len(nasa_repos2)}')

Total NASA repos: 189
Total NASA repos using filter function: 189


In [43]:
# Keep only the NASA repos that returned at least one matching issue
# (the query only asks for "help wanted" / "good first issue" labels).
nasa_repos_with_issues = [
    candidate
    for candidate in nasa_repos
    if len(candidate['issues']['nodes']) > 0
]

# Print the owner/name identifier of each of those repos, one per line.
for repo in nasa_repos_with_issues:
    full_name = repo['name_with_owner']
    print(f"Repo Full Name: {full_name}")

Repo Full Name: nasa/openmct
Repo Full Name: nasa/astrobot
Repo Full Name: nasa/RHEAS
Repo Full Name: nasa/podaacpy
Repo Full Name: nasa/sitepod
Repo Full Name: nasa/utm-apis
