In [None]:
import os

import requests

This guide aims to help you understand how to use pagination to get the data you want. We will use the [GitHub API](https://developer.github.com/v3/) as an example. The GitHub API provides a [search API](https://developer.github.com/v3/search/) to search for repositories, issues, users, commits, and code, and a [REST API](https://developer.github.com/v3/) to get the data of these resources. The search API has a [rate limit](https://developer.github.com/v3/search/#rate-limit) of 30 requests per minute, so to get more data we need to request the results one page at a time. The code in this guide sends a GraphQL query to `https://api.github.com/graphql` and pages through the issues of a repository (and the comments on each issue) using cursors. We will use the [requests](http://docs.python-requests.org/en/master/) library to send HTTP requests to the GitHub API. The JSON data returned by the GitHub API is decoded with `response.json()`; the standard [json](https://docs.python.org/3/library/json.html) library can be used for any further processing. The [time](https://docs.python.org/3/library/time.html) library can be used to sleep between requests to avoid hitting the rate limit.

In [3]:
# Relative path of the file holding the GraphQL query used throughout this guide
query_file = "query/query.graphql"

def get_query(query):
    """Read a GraphQL query document from disk and return its text.

    Reading the returned text shows which variables the query declares and
    what data it requests.

    Args:
        query: Path of the file that holds the query. The parameter keeps its
            original name for compatibility; it is a file path, not query text.

    Returns:
        str: The complete contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Previously the argument was ignored and the global `query_file` was
    # always read; honour the path that was actually passed in.
    with open(query, 'r', encoding='utf-8') as f:
        return f.read()

# Load the query text and print it so the variables it declares can be read in the output below
query = get_query(query_file)
print(query)



query ($counter: Int!, $owner: String!, $repo: String!, $issuecursor: String, $commcursor: String) {
  repository(owner: $owner, name: $repo) {
    id
    issues(first: $counter, after: $issuecursor) {
      totalCount
      pageInfo {
        hasNextPage
        endCursor
      }
      edges {
        node {
          id
          body
          title
          state
          number
          comments(first: $counter, after: $commcursor) {
            pageInfo {
              endCursor
              hasNextPage
            }
            edges {
              node {
                body
                id
              }
            }
            totalCount
          }
        }
      }
    }
  }
}


- The parameters you can use are:
`$counter: Int!, $owner: String!, $repo: String!, $issuecursor: String, $commcursor: String`

- The counter is the number of issues you want to get per page; the same value is also used as the number of comments fetched for each issue. It takes an integer value between 1 and 100. 
- The owner is the owner of the repository. You can find the owner of a repository in the URL of the repository. For example, github.com/microsoft/vscode, the owner is microsoft. 
- The repo is the name of the repository. You can find the name of a repository in the URL of the repository. For example, github.com/microsoft/vscode, the name is vscode.
- The issuecursor is the cursor of the issues. You can use the cursor to get the next page of the issues. You can obtain the cursor by parsing the JSON data returned by the GitHub API when you make the first call with no cursor.
- The commcursor is the cursor of the comments. You can use the cursor to get the next page of the comments. You can obtain the cursor by parsing the JSON data returned by the GitHub API when you make the first call with no cursor.
- Make sure the params dictionary has the format {"owner": owner, "repo": repo, "counter": counter, "issuecursor": issuecursor, "commcursor": commcursor}. The keys must match the variable names declared by the query.
- issuecursor and commcursor are optional. They can be empty strings or None when you are making the first call. Every time you make a call, you should update the cursor with the cursor returned by the previous call.

In [12]:
import requests

def headers(token):
    """Build the HTTP headers for a GitHub GraphQL API request.

    Args:
        token: GitHub access token; it is converted with ``str()`` and sent
            as a bearer credential.

    Returns:
        dict: Header names mapped to their values.
    """
    return {
        # The request body is sent with ``requests.post(..., json=...)`` and is
        # therefore JSON. An explicitly supplied Content-Type takes precedence
        # over the one requests would set, so it has to say JSON as well.
        'Content-Type': 'application/json',
        'Authorization': f"bearer {str(token)}",
        'Accept': 'application/vnd.github.v3+json',
    }


import os

owner, repo = "tensorflow", "tensorflow"

# Read the token from the environment; a token pasted into the notebook ends up
# in every saved or shared copy of it. A missing GITHUB_TOKEN raises KeyError
# here instead of sending an invalid "bearer None" credential.
token = os.environ['GITHUB_TOKEN']

header = headers(token)
query = get_query(query_file)
# The keys must match the variables the query declares ($issuecursor and
# $commcursor); None asks for the first page.
params = {"owner": owner, "repo": repo, "counter": 3, "issuecursor": None, "commcursor": None}
response = requests.post('https://api.github.com/graphql', json={'query': query, 'variables': params}, headers=header)

In [13]:
# HTTP status code of the POST request sent in the previous cell
response.status_code

200

In [14]:
# Decode the JSON body of the response; the bare `data` expression displays it as the cell output
data = response.json()
data

{'data': {'repository': {'id': 'MDEwOlJlcG9zaXRvcnk0NTcxNzI1MA==',
   'issues': {'totalCount': 38326,
    'pageInfo': {'hasNextPage': True, 'endCursor': 'Y3Vyc29yOnYyOpHOBuhxUA=='},
    'edges': [{'node': {'id': 'MDU6SXNzdWUxMTU4ODYzMDI=',
       'body': 'Currently we only support Python 2.7, but we should support Python 3.\n',
       'title': 'Add support for Python 3.x',
       'state': 'CLOSED',
       'number': 1,
       'comments': {'pageInfo': {'endCursor': 'Y3Vyc29yOnYyOpHOCT6V6A==',
         'hasNextPage': True},
        'edges': [{'node': {'body': 'Main things this involves: `print -> print()`, handle `__floordiv__` / `__truediv__` / `__div__` correctly.\n',
           'id': 'MDEyOklzc3VlQ29tbWVudDE1NTA3NzU5NQ=='}},
         {'node': {'body': ':+1:  to this issue\n',
           'id': 'MDEyOklzc3VlQ29tbWVudDE1NTA4NzcxMw=='}},
         {'node': {'body': ':+1:\n',
           'id': 'MDEyOklzc3VlQ29tbWVudDE1NTA5NjU1Mg=='}}],
        'totalCount': 48}}},
     {'node': {'id': 'MDU6SX

- Look at the `hasNextPage` and `endCursor` fields inside `pageInfo` in the response.
- If `hasNextPage` is true, then there is a next page.
- If `hasNextPage` is false, then there is no next page.
- If `hasNextPage` is true, then the `endCursor` field in the response contains the cursor for the next page.
- Pass the cursor as a variable to the query to get the next page.
- This can be done by updating the params dictionary with the cursor (for issues, set `params["issuecursor"]`).
- The scraper and the parser have to work together to get the data.
- The parser extracts the data from page 1 and checks whether there is a next page.
- If there is a next page, the parser passes the cursor to the scraper.
- The scraper uses the cursor to get the next page.
- The parser extracts the data from the next page.
- And so on, until there is no next page.



In [24]:
import requests

class GitHubParser:
    """Builds GitHub GraphQL requests and unpacks one page of issues from a response."""

    def __init__(self, token):
        """Store the access token used to authenticate requests.

        Args:
            token: GitHub access token; sent as a bearer credential by ``headers``.
        """
        self.token = token

    def headers(self):
        """Return the HTTP headers for a GraphQL request made with ``self.token``.

        Returns:
            dict: Header names mapped to their values.
        """
        return {
            # The body is posted with ``json=...``; an explicit Content-Type
            # overrides the one requests would set, so it must say JSON.
            'Content-Type': 'application/json',
            'Authorization': f"bearer {str(self.token)}",
            'Accept': 'application/vnd.github.v3+json'
        }

    def get_query(self, query_file):
        """Read the GraphQL query stored at ``query_file`` and return its text.

        Args:
            query_file: Path of the file holding the query.

        Returns:
            str: The complete contents of the file.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(query_file, 'r', encoding='utf-8') as f:
            return f.read()

    def parse_response(self, response):
        """Extract the issue page and its pagination info from a response.

        Args:
            response: Object whose ``json()`` method returns the decoded
                response body (for example a ``requests.Response``).

        Returns:
            dict: ``total_count`` (0 if absent), ``has_next_page`` (False if
            absent), ``end_cursor`` (None if absent) and ``issues`` (the list
            of issue edges, empty if absent).
        """
        payload = response.json() or {}
        # `.get(key, {})` only falls back when the key is missing. An error
        # response can carry an explicit null for "data" or "repository",
        # so fall back with `or` to cover None as well.
        repository = (payload.get('data') or {}).get('repository') or {}
        issues = repository.get('issues') or {}
        page_info = issues.get('pageInfo') or {}

        return {
            'total_count': issues.get('totalCount', 0),
            'has_next_page': page_info.get('hasNextPage', False),
            'end_cursor': page_info.get('endCursor', None),
            'issues': issues.get('edges') or [],
        }

def github_scraper(parser, params, max_issue_number=22):
    """Fetch issues page by page and print the number of every issue seen.

    The query is read from the module-level ``query_file`` path. After each
    page, ``params['issuecursor']`` is updated in place with the page's end
    cursor so that the next request returns the following page.

    Args:
        parser: Object providing ``get_query(path)``, ``headers()`` and
            ``parse_response(response)`` (for example a ``GitHubParser``).
        params: Dict of GraphQL variables sent with the query. It is modified
            in place: ``issuecursor`` is overwritten for every further page.
        max_issue_number: Demo limit. Paging stops once the last issue of a
            page has a number greater than this value.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    # The query text and the headers are the same for every page; build them once.
    query = parser.get_query(query_file)
    header = parser.headers()

    # Loop instead of recursing so a long result set cannot exhaust the call stack.
    while True:
        response = requests.post('https://api.github.com/graphql',
                                 json={'query': query, 'variables': params},
                                 headers=header)
        # Fail loudly on an HTTP error instead of parsing it as an empty page.
        response.raise_for_status()

        parsed_data = parser.parse_response(response)
        print(parsed_data.keys())
        issues = parsed_data['issues']
        for issue in issues:
            print(f"Issue Number: {issue['node']['number']}")

        # Stop the scraper early for demo purposes. The `issues` guard covers
        # an empty page, which has no last issue to compare against.
        if issues and int(issues[-1]['node']['number']) > max_issue_number:
            return

        # No further page: done.
        if not parsed_data['has_next_page']:
            return

        # Ask for the page that follows the one just processed.
        params['issuecursor'] = parsed_data['end_cursor']

# Example usage
# The token is read from the environment so that no credential is stored in
# the notebook; a missing GITHUB_TOKEN raises KeyError right here.
token = os.environ["GITHUB_TOKEN"]
parser = GitHubParser(token)
owner, repo = "tensorflow", "tensorflow"
counter = 3
cursor = None  # None asks for the first page
# The keys must match the variables the query declares ($issuecursor, $commcursor).
params = {"owner": owner, "repo": repo, "counter": counter, "issuecursor": cursor, "commcursor": cursor}
github_scraper(parser, params)


dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 1
Issue Number: 2
Issue Number: 3
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 4
Issue Number: 5
Issue Number: 6
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 7
Issue Number: 8
Issue Number: 9
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 10
Issue Number: 11
Issue Number: 12
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 14
Issue Number: 15
Issue Number: 16
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 17
Issue Number: 18
Issue Number: 19
dict_keys(['total_count', 'has_next_page', 'end_cursor', 'issues'])
Issue Number: 20
Issue Number: 21
Issue Number: 22


We have now demonstrated how to use pagination to get the data you want. You can try setting the counter to 90 to get 90 issues / comments from the API at a time, which makes the scraping more efficient and helps ensure you are not hitting the rate limit, which is based on the number of calls you make to the API. 
