In [60]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re
import math
from tqdm import tqdm
import sys
import csv

In [61]:
def tag_scrape(tag_keyword):
    """
    Look up Stack Overflow tags that match a keyword.

    Posts `tag_keyword` as the `filter` form field to the tag-filter
    endpoint, collects the text of every `a.post-tag` anchor in the returned
    HTML, prints how many tags were found and returns them.

    Returns:
        list of tag-name strings, in the order they appear in the response
        (empty when no matching anchor is present).
    """
    endpoint = "https://stackoverflow.com/filter/tags-for-index"
    form_fields = {
        'filter': tag_keyword,
        'tab': 'Name',
        'fkey': 'StackExchange.options.user.fkey',
    }

    reply = requests.post(endpoint, data=form_fields)
    markup = BeautifulSoup(reply.content, 'html.parser')

    # Every matching tag is rendered as an <a class="post-tag"> element.
    tags_list = [anchor.text for anchor in markup.find_all("a", class_="post-tag")]

    print(f'\nFound {len(tags_list)} tag(s):\n{tags_list}')
    return tags_list

In [62]:
def _listing_url(page_no, sort_param, tag_slug=None):
    """
    Build the URL of one 50-question page of the question listing.

    `tag_slug` is the already URL-escaped tag name; when it is None the
    untagged listing is addressed instead.
    """
    if tag_slug is None:
        return f'https://stackoverflow.com/questions?tab={sort_param}&page={page_no}&pagesize=50'
    return f'https://stackoverflow.com/questions/tagged/{tag_slug}?tab={sort_param}&page={page_no}&pagesize=50'


def question_scrape(page_count, sort_param="Newest", filter_b=False, tag_keyword=None):
    """
    Scrape questions from the listing pages and append them to `dataset`.

    Fetches the first listing page to read the total question count, caps
    `page_count` at the number of pages that count implies, then visits each
    listing page and every question linked from it. For each question a row
    `[title, body, [tag, ...], tag_count]` is appended to the module-level
    list `dataset`, which must exist before this function is called.

    Args:
        page_count: maximum number of listing pages (50 questions each) to visit.
        sort_param: value sent as the `tab` query parameter.
        filter_b: when true, only questions tagged `tag_keyword` are listed.
        tag_keyword: tag name; required (a string) when `filter_b` is true.

    Returns:
        None. Questions that cannot be fetched or parsed are skipped and
        counted; the count is printed after the progress bar when non-zero.
    """
    # Keep the raw keyword for messages; only the URL gets the escaped form.
    # '#' must be escaped or it would start a URL fragment.
    tag_slug = tag_keyword.replace("#", "%23") if filter_b else None

    response = requests.get(_listing_url(1, sort_param, tag_slug), allow_redirects=True)
    soup = BeautifulSoup(response.content, 'html.parser')

    # The total question count is read from the first element with this class string.
    qns_t = soup.find_all("div", class_="fs-body3 flex--item fl1 mr12 sm:mr0 sm:mb12")
    count_digits = re.sub('[^0-9]', "", qns_t[0].text) if qns_t else ""
    if not count_digits:
        # Missing/changed markup or a blocked request: give up on this listing
        # instead of raising, so a multi-tag run can continue with the next tag.
        print('\nCould not read the question count from the listing page; nothing scraped')
        return

    max_qns = int(count_digits)
    max_pages = math.ceil(max_qns / 50)

    if page_count < max_pages:
        max_qns = page_count * 50
    page_count = min(page_count, max_pages)

    if filter_b:
        print(f'\nScraping {page_count} page(s) or {max_qns} questions for tag {tag_keyword}')
    else:
        print(f'\nScraping {page_count} page(s) or {max_qns} questions')

    skipped = 0
    with tqdm(total=max_qns, file=sys.stdout) as pbar:
        for page_no in range(1, page_count + 1):
            resp_page = requests.get(_listing_url(page_no, sort_param, tag_slug), allow_redirects=True)
            soup = BeautifulSoup(resp_page.content, 'html.parser')

            for qn_link in soup.select("a[class=question-hyperlink]"):
                try:
                    href = qn_link["href"]
                    qn_page = requests.get(f'https://stackoverflow.com{href}', allow_redirects=True)
                    qn_soup = BeautifulSoup(qn_page.content, 'html.parser')

                    qn = qn_soup.find("div", class_="postcell")
                    # The title is the anchor on the question page that links to itself.
                    qn_title = qn_soup.find("a", href=href)
                    qn_body = qn.find("div", class_="s-prose")
                    qn_tags = qn.find_all("a", class_="post-tag")

                    dataset.append([str(qn_title.text), str(qn_body.text), [tag.text for tag in qn_tags], len(qn_tags)])
                except Exception:
                    # Best effort: one broken question must not stop the run.
                    # `Exception` (not a bare except) lets Ctrl-C / SystemExit through.
                    skipped += 1
                pbar.update(1)

    if skipped:
        print(f'Skipped {skipped} question(s) that could not be fetched or parsed')

In [63]:
def scraper():
    """
    Run one interactive scraping session.

    Asks for the sort order and whether to filter by tag. Without a filter it
    asks for a page count and scrapes the general listing. With a filter it
    keeps asking for a keyword (looking up matching tags each time) until the
    user confirms with "y", then scrapes the requested number of pages for
    every matching tag.
    """
    print("\n----------------Stackoverflow Scraper----------------\n")
    sort_param = input(
        "Sort questions based on - Newest (Default), Active, Bounties, Unanswered, Frequent, Votes\n>"
    ).capitalize()

    use_tag_filter = input("Filter questions by tag [Y/n] >").capitalize() == "Y"

    if not use_tag_filter:
        pages = int(input("Enter number of pages to scrape >"))
        question_scrape(page_count=pages, sort_param=sort_param, filter_b=use_tag_filter)
        return

    # Repeat the tag lookup until the user accepts the list that was printed.
    confirmed = False
    while not confirmed:
        keyword = input("Enter a keyword to find all matching tags >")
        matching_tags = tag_scrape(keyword)
        confirmed = input("Proceed with the above tags [Y/n] >").lower() == "y"

    pages = int(input("Enter number of pages to scrape per tag >"))
    for matched_tag in matching_tags:
        question_scrape(
            page_count=pages,
            sort_param=sort_param,
            filter_b=use_tag_filter,
            tag_keyword=matched_tag,
        )

In [64]:
if __name__ == "__main__":
    # Shared accumulator: question_scrape() appends one row per scraped question.
    dataset = []

    # Always run one session, then repeat until the user answers "y" to quit.
    scraper()
    while input("Quit scraper [Y/n] >").lower() != "y":
        scraper()

    # Persist everything collected across all sessions as a ';'-separated CSV.
    file_name = input("Enter output filename (e.g. output.csv): ")
    df = pd.DataFrame(
        dataset,
        columns=['Title', 'Body', 'Tags', 'Tag Count'],
    )
    df.to_csv(
        file_name,
        sep=";",
        encoding='utf-8-sig',
        index_label='ID',
    )


----------------Stackoverflow Scraper----------------

Sort questions based on - Newest (Default), Active, Bounties, Unanswered, Frequent, Votes
>Votes
Filter questions by tag [Y/n] >y
Enter a keyword to find all matching tags >angular1

Found 5 tag(s):
['angular1.6', 'angular1.x', 'angular10', 'angular11', 'angular12']
Proceed with the above tags [Y/n] >y
Enter number of pages to scrape per tag >1

Scraping 1 page(s) or 50 questions for tag angular1.6
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:52<00:00,  1.05s/it]

Scraping 1 page(s) or 50 questions for tag angular1.x
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:25<00:00,  1.70s/it]

Scraping 1 page(s) or 50 questions for tag angular10
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:56<00:00,  1.13s/it]

Scraping 1 page(s) or 50 questions for tag angular11
100%|██████████████

In [65]:
# Sanity check: read the CSV saved under `file_name` back in (same ";" separator)
# and preview the first rows.
data = pd.read_csv(file_name, sep=";")
data.head()

Unnamed: 0,ID,Title,Body,Tags,Tag Count
0,0,how to upload files with angular using CORS,\nI'm using restangular 1.51 with angular 1.6....,"['angularjs', 'laravel', 'laravel-5', 'restang...",5
1,1,webpack uglify error: Unexpected token: keywor...,\nI am trying to run npm run build but I can't...,"['javascript', 'babeljs', 'webpack-2', 'uglify...",5
2,2,component communication angularjs 1.6,\nI need to implement component communication ...,"['angularjs', 'angular-components', 'angular1.6']",3
3,3,Monaco editor does not work in angular tab,"\nI am using Monaco editor with angular 1.6, I...","['angularjs', 'monaco-editor', 'angular1.6']",3
4,4,How to enable backspace on autocomplete field ...,\nI have some autocomplete fields in a form th...,"['angularjs', 'autocomplete', 'angularjs-mater...",4
