In [30]:
import datetime
import json
import os
import time
from typing import List, Dict, Tuple, Optional

import dns
import requests
import requests_cache # https://requests-cache.readthedocs.io/en/stable/
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pymongo import MongoClient, UpdateOne # https://pymongo.readthedocs.io/en/stable/tutorial.html
from pymongo.errors import ConnectionFailure


# Load environment variables from the local .env file. The trailing ';' keeps
# the notebook from echoing load_dotenv()'s return value as the cell output.
load_dotenv();

In [8]:
# Connect to MongoDB
# The connection string is read from the MONGO_URI environment variable
# (e.g. supplied through the .env file loaded by load_dotenv() above).
# Expected form:
#   mongodb+srv://<user>:<password>@<cluster-host>/<database>?retryWrites=true&w=majority
# Never print the URI or paste it into the notebook: it embeds the password.
_mongo_uri = os.getenv('MONGO_URI')
if not _mongo_uri:
    raise RuntimeError('MONGO_URI not set')

client = MongoClient(_mongo_uri, tls=True)
db = client['stackOverflowDB']
questions = db['questions']  # collection that stores the scraped questions

In [9]:
# Shared HTTP session with on-disk response caching under .cache/stack_cache.
# Used both for the StackOverflow API requests and for scraping question pages.
session = requests_cache.CachedSession(
    '.cache/stack_cache',
    backend='filesystem',
    cache_control=True,
    stale_if_error=True,
)

# Getting StackOverflow Questions

Questions are procured from the [Stack Exchange REST API](https://api.stackexchange.com/docs), specifically the [/questions endpoint](https://api.stackexchange.com/docs/questions#order=desc&sort=activity&tagged=c%3Bc%2B%2B&filter=default&site=stackoverflow) with `site=stackoverflow`. For simplicity, we limit our search to questions tagged with both `c` and `c++` (this endpoint treats the semicolon-delimited `tagged` list as an AND constraint), since we are after C/C++ code snippets.

In [25]:
def get_stackoverflow_questions(**kwargs):
    """Yield pages of StackOverflow questions fetched from the Stack Exchange API.

    This is a generator. Each iteration performs one GET against the
    ``/2.3/questions`` endpoint through the module-level ``session`` and
    yields that response's ``items`` list, i.e. one *page* of question dicts
    (not individual questions).

    Iteration stops when ``maxpages`` requests have been made, when the API
    reports ``has_more`` as false, or when ``quota_remaining`` drops to 0.
    If the response carries a positive ``backoff`` and more requests will
    follow, the generator sleeps for that many seconds before continuing.

    Keyword Args:
        pagesize: Questions requested per page, 1-100 (default 100).
        page: 1-indexed page to start from (default 1).
        maxpages: Maximum number of requests/pages (default 10).

    Yields:
        list: The ``items`` list of each API response.

    Raises:
        AssertionError: If an argument is out of range, or the response is
            not JSON / lacks a list-valued ``items`` field.
        Exception: Whatever ``raise_for_status()`` raises for an HTTP error
            status.
    """
    pagesize = kwargs.get('pagesize', 100) # How many questions to request per page
    assert 1 <= pagesize <= 100            # Stack allows [0, 100] but why waste API calls?

    page = kwargs.get('page', 1)           # Starting page index, 1-indexed
    assert page >= 1

    maxpages = kwargs.get('maxpages', 10)  # Max number of pages to return
    assert maxpages >= 1

    # No questions posted more recently than this will be returned.
    # NOTE: this is a naive datetime, so .timestamp() interprets it in the
    # local timezone of the machine running the notebook.
    question_boundary_younger = datetime.datetime(2021, 12, 4)
    todate = int(question_boundary_younger.timestamp())  # same for every request

    done = False # Set to True if we hit our request quota or no more question data is available
    requests_made = 0

    while not done and requests_made < maxpages:
        query_params = {
            'site': 'stackoverflow',
            'sort': 'activity',
            'order': 'desc',
            'tagged': 'c;c++',
            'page': page,
            'pagesize': pagesize,
            'todate': todate
        }

        # Returns a Common Wrapper Object
        # https://api.stackexchange.com/docs/wrapper
        r = session.get('https://api.stackexchange.com/2.3/questions', params=query_params)
        r.raise_for_status()
        assert 'json' in r.headers['content-type'] # We're expecting JSON back

        body = r.json()
        assert 'items' in body
        items = body['items']
        assert isinstance(items, list)

        # Remember what was actually fetched *before* advancing the page
        # counter and before the consumer gets a chance to mutate the list,
        # so the progress message below reports the right page and count.
        fetched_page = page
        fetched_count = len(items)

        requests_made += 1
        page += 1

        # Hand the whole page of questions to the consumer
        yield items

        # Check if we're done
        quota_remaining = body['quota_remaining']
        quota_max = body['quota_max']
        done = not body['has_more'] or quota_remaining <= 0

        print(f'Got {fetched_count} questions from page #{fetched_page} (quota: {quota_remaining}/{quota_max})', end='\r')

        # Check if we need to back off before sending more requests. Only necessary if we're not done.
        backoff = body.get('backoff', 0)
        if not done and backoff > 0:
            print(f'Backoff requested, sleeping for {backoff} seconds')
            time.sleep(backoff)


In [28]:
# This takes a while, is expensive, and is only necessary once. This flag
# lets you skip this step if you've already run it.
should_scrape = False

if should_scrape:
    # Make sure we connected to the database. Fail fast here rather than
    # spending API quota on pages that could not be stored anyway.
    try:
        client.admin.command('ping')
    except ConnectionFailure:
        print('Failed to connect to MongoDB')
        raise

    # Scrape each page, bulk upserting each one into mongo
    for fetched_page in get_stackoverflow_questions(page=1, maxpages=100):
        # Defensive: accept a single question dict as a one-element page
        if not isinstance(fetched_page, list):
            assert isinstance(fetched_page, dict)
            fetched_page = [fetched_page]

        # Only keep questions that have at least one answer. Upserting on
        # question_id makes re-running the scrape idempotent.
        upserts = [
            UpdateOne({'_id': q['question_id']}, {'$set': q}, upsert=True)
            for q in fetched_page
            if q['answer_count'] > 0
        ]

        # bulk_write() raises InvalidOperation when given no operations,
        # which happens whenever a page has no answered questions.
        if upserts:
            questions.bulk_write(upserts)


# Scraping StackOverflow Answers

In [29]:
def get_questions(**kwargs):
    """Yield one page of stored question documents from the ``questions`` collection.

    Keyword Args:
        pagesize: Maximum number of documents in the page, >= 1 (default 100).
        page: 1-indexed page number to return (default 1).

    Yields:
        dict: Each document on the requested page, ordered by ``_id``.

    Raises:
        AssertionError: If ``pagesize`` or ``page`` is smaller than 1.
    """
    pagesize = kwargs.get('pagesize', 100) # How many questions to return per page
    assert 1 <= pagesize

    page = kwargs.get('page', 1)           # Starting page index, 1-indexed
    assert page >= 1

    # Number of documents that belong to the preceding pages
    skips = pagesize * (page - 1)

    # Sort on _id so that skip/limit windows are deterministic: consecutive
    # pages neither overlap nor miss documents.
    cursor = questions.find().sort('_id', 1).skip(skips).limit(pagesize)
    yield from cursor

In [47]:
def scrape_stackoverflow_page(url: str) -> List[Dict]:
    """Scrape the answers shown on a StackOverflow question page.

    Args:
        url: URL of the question page; fetched through the module-level
            ``session``.

    Returns:
        One dict per ``.answer`` element, in document order, with the keys
        ``snippets`` (text of every ``pre > code`` block inside the answer's
        ``.answercell``), ``score``, ``answer_id``, ``page_pos``,
        ``is_highest_scored``, ``question_has_highest_accepted_answer``,
        ``is_accepted`` and ``source`` (the ``href`` of the answer's share
        link, or ``None`` when the link is missing).

    Raises:
        Exception: Whatever ``raise_for_status()`` raises for an HTTP error
            status.
        KeyError: If an answer element lacks one of the expected ``data-*``
            attributes.
    """
    r = session.get(url)
    # Don't silently parse an error page (404, rate limiting, ...) as
    # "this question has no answers".
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    answers_parsed = []
    # Equivalent of: document.querySelectorAll('.answer')
    for answer in soup.select('.answer'):
        # Code snippets live in the answer body; tolerate a missing body
        answer_cell = answer.select_one('.answercell')
        snippet_elems = answer_cell.select('pre > code') if answer_cell is not None else []

        # The share link carries the answer's short URL (e.g. /a/<answer_id>)
        share_link = answer.select_one('a.js-share-link')
        href = share_link.get('href') if share_link is not None else None

        answers_parsed.append({
            'snippets': [code_block.text for code_block in snippet_elems],
            'score': int(answer['data-score']),
            'answer_id': int(answer['data-answerid']),
            'page_pos': int(answer['data-position-on-page']),
            'is_highest_scored': answer['data-highest-scored'] == '1',
            'question_has_highest_accepted_answer': answer['data-question-has-accepted-highest-score'] == '1',
            'is_accepted': 'accepted-answer' in answer['class'],
            'source': href.strip() if href is not None else None,
        })

    return answers_parsed

# Smoke test on a single known question; the parsed answers are displayed as the cell output.
scrape_stackoverflow_page('https://stackoverflow.com/questions/69729326/endless-sine-generation-in-c')

[{'snippets': ['static const double a = 2 * M_PI * 280 * 30e-6;\nstatic const double dx = cos(a);\nstatic const double dy = sin(a);\ndouble x = 1, y = 0; // complex x + iy\nint counter = 0;\n\nvoid control_loop() {\n    double xx = dx*x - dy*y;\n    double yy = dx*y + dy*x;\n    x = xx, y = yy;\n\n    // renormalize once in a while, based on\n    // https://www.gamedev.net/forums/topic.asp?topic_id=278849\n    if((counter++ & 0xff) == 0) {\n        double d = 1 - (x*x + y*y - 1)/2;\n        x *= d, y *= d;\n    }\n\n    double sine = y; // this is your sine\n}\n',
   'xx = cos((n+1)*a) = cos(n*a)*cos(a) - sin(n*a)*sin(a) = x*dx - y*dy\nyy = sin((n+1)*a) = sin(n*a)*cos(a) + cos(n*a)*sin(a) = y*dx + x*dy\n',
   'double d = 1/sqrt(x*x + y*y);\nx *= d, y *= d;\n',
   'd = 1 - (x*x + y*y - 1)/2\n'],
  'score': 110,
  'answer_id': 69729390,
  'page_pos': 1,
  'is_highest_scored': True,
  'question_has_highest_accepted_answer': False,
  'is_accepted': False,
  'source': '/a/69729390'},
 {'sni