In [165]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import math
from tqdm import tqdm
import sys
import csv
import concurrent.futures
import time
# Upper bound on the number of worker threads used per listing page when
# question pages are fetched concurrently.
MAX_THREADS = 30

In [166]:
def tag_scrape(tag_keyword):
    """
    Look up the Stack Overflow tags that match ``tag_keyword``.

    The keyword is posted to the tag filter endpoint through the shared
    session ``s``. The text of every ``a.post-tag`` element in the reply is
    collected, printed, and returned as a list of strings.
    """
    form_fields = {
        'filter': tag_keyword,
        'tab': 'Name',
        'fkey': 'StackExchange.options.user.fkey',
    }
    reply = s.post("https://stackoverflow.com/filter/tags-for-index", data=form_fields)
    markup = BeautifulSoup(reply.content, 'html.parser')

    # One entry per matching tag link, in document order.
    tags_list = [anchor.text for anchor in markup.find_all("a", class_="post-tag")]

    print(f'\nFound {len(tags_list)} tag(s):\n{tags_list}')

    return tags_list

In [167]:
def page_scrape(page_count, sort_param="Newest", filter_b=False, tag_keyword=None):
    """
    Scrape up to ``page_count`` listing pages (50 questions each).

    The first listing page is fetched to read the total number of questions,
    which caps ``page_count``. Every question link found on each listing page
    is then handed to ``qn_scrape`` on a thread pool.

    Parameters
    ----------
    page_count : int
        Number of listing pages requested.
    sort_param : str
        Value sent as the ``tab`` query parameter.
    filter_b : bool
        When true, only questions tagged ``tag_keyword`` are listed.
    tag_keyword : str or None
        Tag to filter on; required when ``filter_b`` is true.

    Returns
    -------
    None
        Results are collected by ``qn_scrape``. Any error is printed rather
        than raised, so a failed run never interrupts the interactive session.
    """
    try:
        # '#' would start a URL fragment, so it is percent-encoded for the
        # request path. The readable tag is kept for the progress message.
        url_tag = tag_keyword.replace("#", "%23") if filter_b else None

        def listing_url(page_no):
            """Build the listing URL for one page of the current query."""
            if filter_b:
                return f'https://stackoverflow.com/questions/tagged/{url_tag}?tab={sort_param}&page={page_no}&pagesize=50'
            return f'https://stackoverflow.com/questions?tab={sort_param}&page={page_no}&pagesize=50'

        response = s.get(listing_url(1), allow_redirects=True)
        first_soup = BeautifulSoup(response.content, 'html.parser')

        qns_t = first_soup.find_all("div", class_="fs-body3 flex--item fl1 mr12 sm:mr0 sm:mb12")
        digits = re.sub('[^0-9]', "", qns_t[0].text.replace(",", "")) if qns_t else ""
        if not digits:
            # Without the counter the page is not a normal listing (blocked
            # request, unknown tag, changed markup), so there is nothing to do.
            print(f'\nCould not read the question count from {listing_url(1)} (HTTP {response.status_code}); nothing scraped')
            return

        max_qns = int(digits)
        max_pages = math.ceil(max_qns/50)

        if(page_count < max_pages):
            max_qns = page_count * 50

        page_count = min(page_count, max_pages)

        if(filter_b):
            print(f'\nScraping {page_count} page(s) or {max_qns} questions for tag {tag_keyword}')
        else:
            print(f'\nScraping {page_count} page(s) or {max_qns} questions')

        with tqdm(total=page_count, file=sys.stdout) as pbar:
            for page_no in range(1, page_count+1):
                if page_no == 1:
                    # Page 1 was already downloaded to read the counter.
                    soup = first_soup
                else:
                    resp_page = s.get(listing_url(page_no), allow_redirects=True)
                    soup = BeautifulSoup(resp_page.content, 'html.parser')

                qn_links = soup.select("a[class=question-hyperlink]")

                # ThreadPoolExecutor rejects max_workers=0 with ValueError, which
                # would abort every remaining page; skip empty pages instead.
                if qn_links:
                    threads = min(MAX_THREADS, len(qn_links))
                    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                        executor.map(qn_scrape, qn_links)
                pbar.update(1)
    except Exception as e:
        print(e)

In [168]:
def qn_scrape(qn_link):
    """
    Fetch one question page and append its data to the shared ``dataset``.

    ``qn_link`` is the anchor element of a question on a listing page; its
    ``href`` is resolved against https://stackoverflow.com. On success a row
    ``[title, body, [tag, ...], tag_count]`` is appended to ``dataset``.

    This runs on worker threads, so it never raises. A question that cannot
    be fetched or parsed is skipped, and the reason is reported through
    ``tqdm.write`` so that it does not corrupt the progress bar.
    """
    href = None
    try:
        href = qn_link["href"]
        url = f'https://stackoverflow.com{href}'
        qn_page = s.get(url, allow_redirects=True)
        qn_soup = BeautifulSoup(qn_page.content, 'html.parser')

        qn = qn_soup.find("div", class_="postcell")
        qn_title = qn_soup.find("a", href=href)
        qn_body = qn.find("div", class_="s-prose") if qn is not None else None

        if qn is None or qn_title is None or qn_body is None:
            # Typically an error or throttling page rather than a question.
            tqdm.write(f'Skipped {url}: question markup not found (HTTP {qn_page.status_code})')
            return

        qn_tags = qn.find_all("a", class_="post-tag")

        dataset.append([str(qn_title.text), str(qn_body.text), [tag.text for tag in qn_tags], len(qn_tags)])
    except Exception as exc:
        # Best effort: one bad question must not stop the others, but the
        # loss is made visible instead of being silently swallowed.
        tqdm.write(f'Skipped question {href!r}: {exc}')

In [169]:
def scraper():
    """
    Interactive front end for one scraping round.

    Asks for the sort order, whether to filter by tag, and how many pages to
    fetch. It then calls ``page_scrape`` once for an unfiltered run, or once
    per tag returned by ``tag_scrape`` for a filtered run.
    """
    def ask_page_count(prompt):
        """Re-prompt until the user enters a whole number greater than zero."""
        while True:
            answer = input(prompt)
            try:
                pages = int(answer)
            except ValueError:
                print(f'"{answer}" is not a whole number, please try again.')
                continue
            if pages > 0:
                return pages
            print("Please enter a number greater than zero.")

    print("\n----------------Stackoverflow Scraper----------------\n")
    sort_param = input("Sort questions based on - Newest (Default), Active, Bounties, Unanswered, Frequent, Votes\n>")
    # An empty answer selects the advertised default instead of sending an
    # empty "tab=" value.
    sort_param = sort_param.strip().capitalize() or "Newest"

    filter_b = (input("Filter questions by tag [Y/n] >").strip().lower() == "y")

    tag_keyword = None
    if(filter_b):
        # Keep searching until the user accepts the list of matching tags.
        while(True):
            tag_keyword = input("Enter a keyword to find all matching tags >")
            tags_list = tag_scrape(tag_keyword)

            if(input("Proceed with the above tags [Y/n] >").lower() == "y"):
                break

        page_count = ask_page_count("Enter number of pages to scrape per tag >")
        for tag in tags_list:
            page_scrape(page_count=page_count, sort_param=sort_param, filter_b=filter_b, tag_keyword=tag)
    else:
        page_count = ask_page_count("Enter number of pages to scrape >")
        page_scrape(page_count=page_count, sort_param=sort_param, filter_b=filter_b)

In [170]:
if __name__ == "__main__":
    # Shared HTTP session and result accumulator: the scraping functions
    # read `s` and append their rows to `dataset`.
    s = requests.session()
    dataset = []

    # Run scraping rounds until the user answers "y" to the quit prompt.
    quit_requested = False
    while not quit_requested:
        scraper()
        quit_requested = input("Quit scraper [Y/n] >").lower() == "y"

    # Write everything collected to a ';'-separated CSV whose index column
    # is labelled 'ID'.
    file_name = input("Enter output filename (e.g. output.csv): ")
    df = pd.DataFrame(data=dataset, columns=['Title', 'Body', 'Tags', 'Tag Count'])
    df.to_csv(file_name, sep=";", encoding='utf-8-sig', index_label='ID')


----------------Stackoverflow Scraper----------------

Sort questions based on - Newest (Default), Active, Bounties, Unanswered, Frequent, Votes
>
Filter questions by tag [Y/n] >n
Enter number of pages to scrape >3

Scraping 3 page(s) or 150 questions
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:53<00:00, 17.94s/it]
Quit scraper [Y/n] >Y
Enter output filename (e.g. output.csv): test.csv


In [171]:
# Read the exported file back (';' matches the separator used when it was
# written) and preview the first rows.
data = pd.read_csv(file_name, delimiter=";")
data.head()

Unnamed: 0,ID,Title,Body,Tags,Tag Count
0,0,Can I run react-native app on a real ios devic...,\nI have a window machine and an Iphone and I ...,"['ios', 'react-native']",2
1,1,How to select with conditions in relation in T...,"\nI want to find row in Friendship, where frie...","['javascript', 'typescript', 'typeorm']",3
2,2,How to pass value of a form to another form PHP?,\nI have a form A in index.php which I need to...,"['php', 'model-view-controller']",2
3,3,Why does C# compiler generate anonymous delega...,\nConsider the following code creating and run...,"['c#', 'delegates', 'anonymous-methods']",3
4,4,Why l got this error when i try to change conn...,"\nSalem,\nI'm new in laravel 8 and i'm using s...","['php', 'laravel-8', 'laravel-blade', 'sql-ser...",4


In [172]:
# Per-column flag: True if the column contains at least one missing value.
data.isnull().any()

ID           False
Title        False
Body         False
Tags         False
Tag Count    False
dtype: bool

In [173]:
# Number of rows read back from the exported file.
data.shape[0]

148