In [34]:
from pymongo import MongoClient # https://pymongo.readthedocs.io/en/stable/tutorial.html
import requests
import requests_cache # https://requests-cache.readthedocs.io/en/stable/
import datetime
from typing import List, Dict, Tuple, Optional
import json
import time

In [27]:
stack_session = requests_cache.CachedSession('.cache/stack_cache', cache_control=True, stale_if_error=True backend='filesystem')
requests_cache.install_cache(stack_session)
assert requests_cache.patcher.is_installed()

requests_cache.patcher.get_cache()

<SQLiteCache(name=<CachedSession(cache=<FileCache(name=.cache/stack_cache)>, expire_after=-1, urls_expire_after=None, allowable_codes=(200,), allowable_methods=('GET', 'HEAD'), stale_if_error=False, cache_control=True)>)>

# Getting StackOverflow Posts

Questions are procured from the [StackOverflow REST API](https://api.stackexchange.com/docs), specifically the [/questions endpoint](https://api.stackexchange.com/docs/questions#order=desc&sort=activity&tagged=c%3Bc%2B%2B&filter=default&site=stackoverflow).

In [33]:
def get_stackoverflow_questions(**kwargs):
    
    pagesize = kwargs.get('pagesize', 100) # How many questions to return per page
    assert 1 <= pagesize <= 100            # Stack allows [0, 100] but why waste API calls?

    page = kwargs.get('page', 1)           # Starting page index, 1-indexed
    assert page >= 1                       

    maxpages = kwargs.get('maxpages', 10)  # Max number of pages to return
    assert maxpages >= 1

    question_boundary_younger = datetime.datetime(2021, 12, 4) # No questions posted more recently than this will be returned
    done = False # Set to True if we hit our request quota or no more question data is available
    requests_made = 0

    while not done and requests_made < maxpages:
        query_params = {
            'site': 'stackoverflow',
            'sort': 'activity',
            'order': 'desc',
            'tagged': 'c;c++',
            'page': page,
            'pagesize': pagesize,
            'todate': int(question_boundary_younger.timestamp())
        }

        # Returns a Common Wrapper Object
        # https://api.stackexchange.com/docs/wrapper
        r = requests.get('https://api.stackexchange.com/2.3/questions', params=query_params)
        r.raise_for_status()
        assert 'json' in r.headers['content-type'] # We're expecting JSON back

        requests_made += 1
        page += 1

        # Yield each question in the response
        body = r.json()
        assert 'items' in body
        assert isinstance(body['items'], list)
        yield from body['items']

        # Check if we're done
        done = body['has_more'] or body['quota_remaining'] <= 0

        # Check if we need to back off before sending more requests. Only necessary if we're not done.
        if not done:
            backoff = body.get('backoff', 0)
            if backoff > 0:
                print(f'Sleeping for {backoff} seconds')
                time.sleep(backoff)


requests left: 250


In [21]:
# Ensure the request was successful
assert r.status_code == 200

# Assert that the response is JSON
content_type: str = r.headers['content-type']
assert content_type is not None and 'json' in content_type

body = r.json() # https://docs.python-requests.org/en/latest/user/quickstart/#json-response-content
print(json.dumps(body, indent=4))

{
    "items": [
        {
            "tags": [
                "c++",
                "c"
            ],
            "owner": {
                "account_id": 7828538,
                "reputation": 81,
                "user_id": 5919420,
                "user_type": "registered",
                "profile_image": "https://lh6.googleusercontent.com/-rRpxaKDpB1g/AAAAAAAAAAI/AAAAAAAAAJs/h7rcFLQxHJs/photo.jpg?sz=256",
                "display_name": "ayushi grover",
                "link": "https://stackoverflow.com/users/5919420/ayushi-grover"
            },
            "is_answered": true,
            "view_count": 5365,
            "accepted_answer_id": 35367222,
            "answer_count": 4,
            "score": 6,
            "last_activity_date": 1638807991,
            "creation_date": 1455293589,
            "question_id": 35367208,
            "content_license": "CC BY-SA 3.0",
            "link": "https://stackoverflow.com/questions/35367208/why-can-i-call-a-function-in-c-withou

In [16]:
body = r.json()
items: List[Dict] = body['items']
has_more: bool = body['has_more']
quota_max: int = body['quota_max']
quota_remaining: int = body['quota_remaining']

In [17]:
r.headers

{'cache-control': 'private', 'content-length': '5779', 'content-type': 'application/json; charset=utf-8', 'content-encoding': 'gzip', 'strict-transport-security': 'max-age=15552000', 'access-control-allow-origin': '*', 'access-control-allow-methods': 'GET, POST', 'access-control-allow-credentials': 'false', 'x-content-type-options': 'nosniff', 'x-request-guid': 'b8eb5ee1-9826-4423-86f1-eed8cd9a705b', 'set-cookie': 'prov=6770896c-e821-4846-a4f5-1dc2d2c05b8c; expires=Fri, 01 Jan 2055 00:00:00 GMT; domain=.stackexchange.com; path=/; secure; samesite=none; httponly', 'content-security-policy': "upgrade-insecure-requests; frame-ancestors 'self' https://stackexchange.com", 'date': 'Mon, 06 Dec 2021 19:32:06 GMT'}