In [2]:
import requests
import time
import pandas as pd
from urllib import parse
from bs4 import BeautifulSoup

In [3]:
# Browser-like HTTP request headers. make_request() passes this dict as the
# `headers=` argument of every requests.get() call.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    # NOTE(review): this advertises "sdch" and "br" in addition to gzip/deflate.
    # Presumably requests only decodes brotli when a brotli package is
    # installed -- confirm for this environment, otherwise a "br" response
    # would reach BeautifulSoup still compressed.
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "en-US,en;q=0.8",
    # Adjacent string literals are concatenated into one User-Agent value.
    "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) "
                   "AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/51.0.2704.103 Safari/537.36")
}

In [4]:
def format_url(url):
    """Normalise a URL and drop every query parameter that is not whitelisted.

    The scheme defaults to ``https`` when the input has none. Host and path
    are kept as parsed. Only the ``search``, ``q`` and ``page`` query
    parameters survive, in their original order; fragments and all other
    parameters are discarded. The resulting URL is printed (this is the
    crawl's progress log) and returned.

    Args:
        url: The URL string to normalise.

    Returns:
        The normalised URL string.
    """
    u = parse.urlparse(url)
    scheme = u.scheme or "https"

    kept = []
    for piece in u.query.split("&"):
        # partition() never raises: values may themselves contain "="
        # (e.g. "q=a=b") and pieces may have no "=" at all (e.g. a bare
        # flag or the empty piece left by a trailing "&").
        k, sep, v = piece.partition("=")
        if not sep:
            continue
        if k in ('search', 'q', 'page'):
            kept.append("{}={}".format(k, v))
    query = "?" + "&".join(kept) if kept else ""

    formatted = "{}://{}{}{}".format(scheme, u.netloc, u.path, query)
    print(formatted)
    return formatted

In [13]:
def make_request(url, retries=3, timeout=30):
    """Fetch a URL and return its body parsed with BeautifulSoup.

    The URL is first normalised with ``format_url``. The GET request is sent
    with the module-level ``headers``. A failed request is retried, with a
    short increasing pause between attempts; if every attempt fails the last
    exception is re-raised so the caller can decide what to do.

    A non-200 response is not treated as an error: a warning is printed and
    whatever body was returned is still parsed.

    Args:
        url: The URL to fetch.
        retries: Total number of attempts (values below 1 are treated as 1).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` document built from the response content with
        the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: when the final attempt fails.
    """
    url = format_url(url)
    attempts = max(1, retries)

    for attempt in range(1, attempts + 1):
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            break
        except requests.exceptions.RequestException as e:
            if attempt == attempts:
                print("WARNING: {}\nRequest for {} failed, giving up."
                      .format(e, url))
                raise
            print("WARNING: {}\nRequest for {} failed, trying again."
                  .format(e, url))
            # Simple linear back-off before the next attempt.
            time.sleep(attempt)

    if r.status_code != 200:
        print("WARNING: Got a {} status code for URL: {}"
              .format(r.status_code, url))

    return BeautifulSoup(r.content, "html.parser")

In [6]:
def parse_model(page):
    """Extract the text of every ``post-text`` element on a parsed page.

    ``page`` must offer ``find_all(class_=...)``; each match's ``.text`` is
    whitespace-stripped and collected in document order.

    Failures are reported, not raised: on any exception, or on a keyboard
    interrupt, a warning is printed and whatever was collected up to that
    point is returned.

    Returns:
        A list of stripped text strings (possibly empty or partial).
    """
    collected = []
    try:
        for node in page.find_all(class_="post-text"):
            stripped = node.text.strip()
            collected.append(stripped)
    except KeyboardInterrupt:
        # Not a subclass of Exception, so it needs its own handler.
        print('\n')
        print("WARNING: Fetching docs interrupted by user.")
    except Exception as err:
        print('\n')
        print("WARNING: {}".format(err))
    return collected

In [15]:
def crawl_stack_exchange(search_query, page_number = 338, all_posts = None, delay = 0):
    """Crawl Stack Exchange search results and collect the text of each hit.

    Starting at ``page_number``, every search-result page for
    ``search_query`` is fetched with ``make_request``. Each element with
    class ``result-link`` is followed through its first anchor that has an
    ``href``, and the texts returned by ``parse_model`` for that page are
    appended to ``all_posts``. Crawling stops at the first results page
    that contains no ``result-link`` elements, on an unexpected error, or
    when the user interrupts; in all cases the posts gathered so far are
    returned.

    Pages are walked in a loop, so the crawl depth is not limited by the
    recursion limit and no helper defined in another cell is required.

    Args:
        search_query: Free-text query; it is URL-encoded before use.
        page_number: First results page to fetch. The default of 338 is kept
            for backward compatibility -- NOTE(review): it looks like a
            resume point from an earlier run; pass 1 for a full crawl.
        all_posts: Optional list to extend in place (e.g. to resume a
            crawl). A fresh list is created when omitted, so separate calls
            never share state.
        delay: Seconds to sleep after each fetched post; 0 disables it.

    Returns:
        The list of collected post texts (``all_posts`` itself when given).
    """
    if all_posts is None:
        all_posts = []
    try:
        while True:
            # quote_plus turns spaces into "+" and escapes characters such
            # as "&" or "=" that would otherwise corrupt the query string.
            url = ('https://stackexchange.com/search?q={}&page={}'
                   .format(parse.quote_plus(search_query), page_number))
            page = make_request(url)
            result_divs = page.find_all(class_='result-link')
            if not result_divs:
                break
            for div in result_divs:
                anchor = div.find('a', href=True)
                if anchor is None:
                    # Result without a usable link: nothing to follow.
                    continue
                try:
                    all_posts += parse_model(make_request(anchor['href']))
                except Exception as e:
                    # One unreachable post must not end the whole crawl.
                    print("WARNING: skipping {}: {}".format(anchor['href'], e))
                    continue
                print('Posts found so far:', len(all_posts))
                if delay:
                    time.sleep(delay)
            page_number += 1
    except Exception as e:
        print('\n')
        print("WARNING: {}".format(e))
    except KeyboardInterrupt:
        print('WARNING: Fetching posts interrupted by user.')
        print('Posts found:', len(all_posts))
    return all_posts
# Run the crawl for the query 'agile scrum' (starting at the function's
# default page) and keep the collected post texts in a module-level list.
# This performs live network requests and can run for a long time.
all_posts = crawl_stack_exchange(search_query='agile scrum')
# Persist the posts, one per row, to a CSV file in the current working
# directory. to_csv is called with its defaults, so the index column and a
# header row are written as well.
pd.DataFrame(all_posts).to_csv('imposters3.0.csv')

https://stackexchange.com/search?q=agile+scrum&page=338
https://sqa.stackexchange.com/questions/14136/different-testing-approaches-for-different-teams/14314
Posts found so far: 7
https://softwareengineering.stackexchange.com/questions/345679/is-daily-commit-and-push-necessary-for-managers-and-supervisors-to-check-my-prog/345683
Posts found so far: 13
https://softwareengineering.stackexchange.com/questions/228969/fixing-bugs-may-lead-to-delayed-feedback/228971
Posts found so far: 17
https://softwareengineering.stackexchange.com/questions/188059/i-want-to-adopt-an-agile-methodology-in-a-company-that-has-no-process-at-all/188062
Posts found so far: 20
https://softwareengineering.stackexchange.com/questions/343573/in-scrum-agile-how-to-deliver-a-validation-rules-engine-incrementally/343580
Posts found so far: 24
https://softwareengineering.stackexchange.com/questions/80751/how-can-we-make-agile-enjoyable-for-developers-that-like-to-personally-independ/246026
Posts found so far: 36
https://

Posts found so far: 377
https://stackexchange.com/search?q=agile+scrum&page=342
https://stackoverflow.com/questions/1720662/adopting-software-project-management-and-testing-protocol-from-scratch/1720693
Posts found so far: 382
https://stackoverflow.com/questions/307248/how-does-a-scrum-master-manage-an-out-of-control-product-owner/307720
Posts found so far: 393
https://stackoverflow.com/questions/28298641/gherkin-how-do-you-write-an-unambiguous-test-case-in-english/28303414
Posts found so far: 396
https://stackoverflow.com/questions/45844490/methodologies-for-requirement-analysis-database-development/45847903
Posts found so far: 398
https://stackoverflow.com/questions/13061757/what-is-story-point/13061839
Posts found so far: 402
https://stackoverflow.com/questions/2850916/agile-and-code-release/2850968
Posts found so far: 406
https://stackoverflow.com/questions/1237515/story-estimates-in-scrum/1237628
Posts found so far: 411
https://pm.stackexchange.com/questions/17540/scrum-client-wit

Posts found so far: 809
https://pt.stackoverflow.com/questions/3274/quando-usar-waterfall-e-quando-usar-scrum/3359
Posts found so far: 817
https://softwareengineering.stackexchange.com/questions/226613/what-development-process-encourages-frequent-releases-rolling-code-to-a-live-si/227718
Posts found so far: 819
https://softwareengineering.stackexchange.com/questions/247154/is-there-any-software-development-methodologies-for-small-teams/247159
Posts found so far: 821
https://stackoverflow.com/questions/4417835/what-does-the-approved-state-for-a-work-item-signify-in-team-foundation-server-2/4550537
Posts found so far: 825
https://pm.stackexchange.com/questions/12668/how-to-display-time-tracking-fields-in-issue-detail-view-using-kanban-board-in-j
Posts found so far: 830
https://softwareengineering.stackexchange.com/questions/316323/how-the-time-of-agile-sprint-is-calculated/316327
Posts found so far: 833
https://stackoverflow.com/questions/2234851/time-tracking-and-agile-methodology/17325

https://stackoverflow.com/questions/1891742/dual-bandwidth-agile-team/1891785
Posts found so far: 1146
https://stackoverflow.com/questions/1891742/dual-bandwidth-agile-team/1891785
Posts found so far: 1152
https://stackoverflow.com/questions/1219241/how-do-you-draw-the-line-between-agile-development-and-scope-creep/1219374
Posts found so far: 1162
https://softwareengineering.stackexchange.com/questions/251464/in-general-should-an-organization-adopt-a-single-methodology-or-decide-on-a-per/251509
Posts found so far: 1167
https://stackoverflow.com/questions/2164588/ways-to-improve-communication-between-members-on-a-software-team/2164829
Posts found so far: 1177
https://stackoverflow.com/questions/4362090/how-to-deploy-web-apps-in-the-agile-way/4362266
Posts found so far: 1180
https://stackoverflow.com/questions/13141785/jira-issue-with-done-tasks/13733831
Posts found so far: 1183
https://stackoverflow.com/questions/3094679/feasibility-of-scrum-with-certain-modifications-to-the-philosophy/

Posts found so far: 1521
https://stackexchange.com/search?q=agile+scrum&page=354
https://stackoverflow.com/questions/29104/requirements-gathering/881170
Posts found so far: 1542
https://softwareengineering.stackexchange.com/questions/229536/scrum-for-embedded-system-devices/229630
Posts found so far: 1546
https://softwareengineering.stackexchange.com/questions/56447/good-books-about-scrum-and-xp/56535
Posts found so far: 1552
https://softwareengineering.stackexchange.com/questions/209708/does-scrum-make-sense-when-implementing-a-new-compiler-backend/209716
Posts found so far: 1558
https://pm.stackexchange.com/questions/11532/is-there-room-for-good-tested-design-in-agile-setting/12870
Posts found so far: 1564
https://stackoverflow.com/questions/1007007/what-should-i-write-in-order-to-become-a-better-developer/1846479
Posts found so far: 1584
https://pm.stackexchange.com/questions/23088/agile-for-infrastructure-projects/23092
Posts found so far: 1586
https://pm.stackexchange.com/question

Posts found so far: 1874
https://softwareengineering.stackexchange.com/questions/66502/project-closures-in-scrum/66520
Posts found so far: 1879
https://stackexchange.com/search?q=agile+scrum&page=358
https://stackoverflow.com/questions/2774708/agile-approach-for-wcm/5126113
Posts found so far: 1882
https://softwareengineering.stackexchange.com/questions/156771/how-to-get-good-design-when-using-agile-methods/156808
Posts found so far: 1889
https://pm.stackexchange.com/questions/21723/best-most-agile-way-to-develop-multiple-products-concurrently
Posts found so far: 1894
https://pm.stackexchange.com/questions/8838/what-is-the-name-for-this-type-of-chart/8884
Posts found so far: 1897
https://softwareengineering.stackexchange.com/questions/173125/rewriting-software-using-agile-methodologies/173128
Posts found so far: 1904
https://pm.stackexchange.com/questions/21524/how-best-to-conduct-a-sprint-review-when-the-sprint-had-work-for-multiple-client
Posts found so far: 1908
https://pm.stackexch

Posts found so far: 2168
https://stackexchange.com/search?q=agile+scrum&page=362
https://softwareengineering.stackexchange.com/questions/65758/how-does-scrum-manage-an-enviroment-where-team-members-are-shared/78720
Posts found so far: 2175
https://softwareengineering.stackexchange.com/questions/370133/product-backlog-vs-huge-features/370149
Posts found so far: 2178
https://softwareengineering.stackexchange.com/questions/151301/how-do-i-know-if-i-am-using-scrum-methodologies/151363
Posts found so far: 2182
https://softwareengineering.stackexchange.com/questions/272610/how-can-user-stories-not-contain-requirements-when-written-on-a-card-and-still/272615
Posts found so far: 2192
https://pm.stackexchange.com/questions/20630/how-to-get-sprint-progress-from-complexity-based-estimation/20632
Posts found so far: 2196
https://pm.stackexchange.com/questions/11136/how-to-estimate-employee-resources
Posts found so far: 2201
https://gamedev.stackexchange.com/questions/62941/do-you-plan-before-or-im

Posts found so far: 2481
https://softwareengineering.stackexchange.com/questions/37495/how-do-you-measure-the-value-of-your-software/37502
Posts found so far: 2488
https://softwareengineering.stackexchange.com/questions/37495/how-do-you-measure-the-value-of-your-software/37506
Posts found so far: 2495
https://softwareengineering.stackexchange.com/questions/159115/should-agile-teams-deliver-new-features-daily/159122
Posts found so far: 2503
https://stackexchange.com/search?q=agile+scrum&page=366
https://softwareengineering.stackexchange.com/questions/214056/can-a-team-that-uses-scrum-achieve-co-dev-and-domain-expertise-if-it-handles-man/214058
Posts found so far: 2505
https://softwareengineering.stackexchange.com/questions/185160/velocity-does-not-plateau-over-time-why/185905
Posts found so far: 2511
https://softwareengineering.stackexchange.com/questions/349312/all-full-stack-developers-vs-backend-and-frontend-developers/349315
Posts found so far: 2515
https://softwareengineering.stack

Posts found so far: 2846
https://softwareengineering.stackexchange.com/questions/228405/some-team-members-dont-actively-participate-in-sprint-planning/228501
Posts found so far: 2852
https://softwareengineering.stackexchange.com/questions/36925/weeding-out-real-agile-from-buzzword-agile-in-an-interview/36970
Posts found so far: 2869
https://softwareengineering.stackexchange.com/questions/50168/what-roles-do-people-take-after-scrum-master-technical-lead/50226
Posts found so far: 2877
https://softwareengineering.stackexchange.com/questions/103418/any-best-practices-for-a-distributed-agile-team/103421
Posts found so far: 2882
https://softwareengineering.stackexchange.com/questions/131540/cfos-expectations-and-scrum-development/132596
Posts found so far: 2887
https://softwareengineering.stackexchange.com/questions/224914/how-to-introduce-scrum-in-a-company-accustomed-to-scrumbutt-or-scrumfall/224927
Posts found so far: 2892
https://softwareengineering.stackexchange.com/questions/204529/mul

Posts found so far: 3209
https://softwareengineering.stackexchange.com/questions/37495/how-do-you-measure-the-value-of-your-software/110410
Posts found so far: 3216
https://stackexchange.com/search?q=agile+scrum&page=373
https://softwareengineering.stackexchange.com/questions/260190/in-an-agile-environment-who-is-responsible-for-software-architecture/260290
Posts found so far: 3225
https://softwareengineering.stackexchange.com/questions/269478/fixed-scope-and-resources-but-i-am-being-asked-for-an-exact-release-date/269516
Posts found so far: 3229
https://softwareengineering.stackexchange.com/questions/269478/fixed-scope-and-resources-but-i-am-being-asked-for-an-exact-release-date
Posts found so far: 3233
https://softwareengineering.stackexchange.com/questions/103418/any-best-practices-for-a-distributed-agile-team/103791
Posts found so far: 3238
https://softwareengineering.stackexchange.com/questions/191884/do-your-stories-include-tasks-across-disciplines-how-do-you-do-capacity-plannin/

Posts found so far: 3540
https://softwareengineering.stackexchange.com/questions/36925/weeding-out-real-agile-from-buzzword-agile-in-an-interview/36958
Posts found so far: 3557
https://softwareengineering.stackexchange.com/questions/185160/velocity-does-not-plateau-over-time-why
Posts found so far: 3563
https://softwareengineering.stackexchange.com/questions/210111/can-a-daily-scrum-meeting-be-replaced-by-a-status-email/210114
Posts found so far: 3567
https://softwareengineering.stackexchange.com/questions/260190/in-an-agile-environment-who-is-responsible-for-software-architecture/260201
Posts found so far: 3576
https://softwareengineering.stackexchange.com/questions/332557/what-should-be-the-input-of-a-scrum-team/333122
Posts found so far: 3586
https://softwareengineering.stackexchange.com/questions/204529/multiple-scrum-teams-moving-to-single-backlog/206214
Posts found so far: 3592
https://softwareengineering.stackexchange.com/questions/350364/where-to-put-backlog-items-like-as-a-dev