In [1]:
import requests
from bs4 import BeautifulSoup
from queue import Queue, Empty
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

In [10]:
class MultiThreadScraper:
    """Same-site link crawler that fetches pages on a thread pool.

    The main thread (``run_scraper``) pulls URLs from ``to_crawl``, records
    them in ``scraped_pages`` and submits a fetch job to ``pool``. When a job
    finishes, ``post_scrape_callback`` parses the page and pushes newly found
    same-site links back onto ``to_crawl``.

    Attributes:
        base_url: URL the crawl starts from.
        root_url: ``scheme://netloc`` of ``base_url``; links are resolved
            against it and must stay on its host.
        pool: ``ThreadPoolExecutor`` running the HTTP fetches.
        scraped_pages: set of URLs already handled (pre-seeded from
            ``seen_file``).
        to_crawl: ``Queue`` of URLs waiting to be fetched.
        url_id: unused by this class; kept for backward compatibility.
        output_file: path that matching URLs are appended to.
    """

    def __init__(self, base_url, seen_file='NUT_links.txt',
                 output_file='guardian_links.txt', max_workers=150):
        """Set up the crawl state.

        Args:
            base_url: first URL to crawl.
            seen_file: text file with one already-scraped URL per line. A
                missing file (or ``None``) means "nothing scraped yet".
            output_file: file that ``run_scraper`` appends matching URLs to.
            max_workers: size of the fetch thread pool.
        """
        self.base_url = base_url
        parsed = urlparse(self.base_url)
        self.root_url = '{}://{}'.format(parsed.scheme, parsed.netloc)
        self.output_file = output_file
        self.pool = ThreadPoolExecutor(max_workers=max_workers)
        self.scraped_pages = self._load_seen(seen_file)
        self.to_crawl = Queue()
        self.to_crawl.put(self.base_url)
        self.url_id = set()

    @staticmethod
    def _load_seen(path):
        """Return the set of non-blank lines in ``path`` (empty if absent)."""
        if path is None:
            return set()
        try:
            # ``with`` guarantees the handle is closed; the output file is
            # written as UTF-8, so read history the same way.
            with open(path, encoding='utf-8') as handle:
                return {line.strip() for line in handle if line.strip()}
        except FileNotFoundError:
            return set()

    def _resolve_link(self, href):
        """Return the absolute same-site URL for ``href``, or ``None``.

        Only site-relative links (leading ``/``) and links starting with
        ``root_url`` are considered. The resolved host must equal the root
        host: this rejects protocol-relative links (``//other.host/x``) and
        look-alike hosts (``https://site.com.evil.test``) that would pass a
        plain prefix check.
        """
        if not (href.startswith('/') or href.startswith(self.root_url)):
            return None
        absolute = urljoin(self.root_url, href)
        if urlparse(absolute).netloc != urlparse(self.root_url).netloc:
            return None
        return absolute

    def parse_links(self, html):
        """Queue every not-yet-scraped same-site link found in ``html``."""
        soup = BeautifulSoup(html, 'html.parser')
        for anchor in soup.find_all('a', href=True):
            url = self._resolve_link(anchor['href'])
            if url is not None and url not in self.scraped_pages:
                self.to_crawl.put(url)

    def scrape_info(self, html):
        """Hook for extracting data from a fetched page; currently a no-op."""
        return

    def post_scrape_callback(self, res):
        """Done-callback for a fetch job: process the page on HTTP 200.

        Args:
            res: the ``Future`` returned by ``pool.submit(scrape_page, ...)``.
        """
        try:
            result = res.result()
        except Exception as e:
            # scrape_page only swallows RequestException; report anything
            # else instead of letting it escape from the callback.
            print(e)
            return
        if result is not None and result.status_code == 200:
            self.parse_links(result.text)
            self.scrape_info(result.text)

    def scrape_page(self, url):
        """Fetch ``url``; return the response, or ``None`` on a request error.

        Uses a 3 s connect / 30 s read timeout.
        """
        try:
            return requests.get(url, timeout=(3, 30))
        except requests.RequestException:
            return None

    def run_scraper(self, idle_timeout=60):
        """Drain ``to_crawl`` until it stays empty for ``idle_timeout`` seconds.

        Each new URL is printed, marked as scraped, appended to
        ``output_file`` when it contains ``'2019'``, and submitted to the
        pool for fetching.

        Args:
            idle_timeout: seconds to wait for a new URL before stopping.
        """
        # Open the output once for the whole crawl rather than per URL.
        with open(self.output_file, 'a', encoding="utf-8") as file:
            while True:
                try:
                    target_url = self.to_crawl.get(timeout=idle_timeout)
                except Empty:
                    return
                if target_url in self.scraped_pages:
                    continue
                try:
                    print("Scraping URL: {}".format(target_url))
                    self.scraped_pages.add(target_url)
                    if '2019' in target_url:
                        file.write(target_url + '\n')
                        # Flush so an interrupted crawl keeps what it found.
                        file.flush()
                    job = self.pool.submit(self.scrape_page, target_url)
                    job.add_done_callback(self.post_scrape_callback)
                except Exception as e:
                    print(e)
                    continue

In [11]:
if __name__ == '__main__':
    # Seed article for the crawl; every other page is reached by following
    # links found from here.
    start_url = 'https://www.theguardian.com/us-news/2019/aug/27/purdue-pharma-sackler-oycontin-opioids-drugs'
    s = MultiThreadScraper(start_url)
    s.run_scraper()

Scraping URL: https://www.theguardian.com/us-news/2019/aug/27/purdue-pharma-sackler-oycontin-opioids-drugs
Scraping URL: https://www.theguardian.com/international
Scraping URL: https://www.theguardian.com/preference/edition/int
Scraping URL: https://www.theguardian.com/preference/edition/uk
Scraping URL: https://www.theguardian.com/preference/edition/us
Scraping URL: https://www.theguardian.com/preference/edition/au
Scraping URL: https://www.theguardian.com/uk/commentisfree
Scraping URL: https://www.theguardian.com/uk/sport
Scraping URL: https://www.theguardian.com/uk/culture
Scraping URL: https://www.theguardian.com/uk/lifeandstyle
Scraping URL: https://www.theguardian.com/world
Scraping URL: https://www.theguardian.com/uk-news
Scraping URL: https://www.theguardian.com/science
Scraping URL: https://www.theguardian.com/cities
Scraping URL: https://www.theguardian.com/global-development
Scraping URL: https://www.theguardian.com/football
Scraping URL: https://www.theguardian.com/uk/techn

Scraping URL: https://www.theguardian.com/politics/2019/aug/27/brexit-party-nigel-farage-non-aggression-pact-boris-johnson-no-deal
Scraping URL: https://www.theguardian.com/politics/2019/aug/26/world-leaders-create-their-own-realities-at-the-g7-summit
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/26/bury-britain-gigg-lane-brexit
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/26/ben-stokes-ashes-brexit
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/26/dominic-cummings-yanis-varoufakis-2015-crisis
Scraping URL: https://www.theguardian.com/commentisfree/picture/2019/aug/27/steve-bell-on-boris-johnson-nigel-farage-and-a-non-aggression-pact-cartoon
Scraping URL: https://www.theguardian.com/politics/commentisfree/picture/2019/aug/26/martin-rowson-post-brexit-trade-deals-pork-pie-licence-fee-cartoon
Scraping URL: https://www.theguardian.com/commentisfree/picture/2019/aug/25/ben-jennings-on-boris-johnson-at-the-g7-summit-cartoon
Scraping U

Scraping URL: https://www.theguardian.com/world/2019/aug/26/g7-summit-biarritz-trump-theater-looms-next-year
Scraping URL: https://www.theguardian.com/world/2019/aug/25/g7-the-west-takes-its-eyes-off-africa-at-its-peril
Scraping URL: https://www.theguardian.com/world/2019/aug/24/trump-greenland-gambit-sad-sign-arctic-up-for-grabs
Scraping URL: https://www.theguardian.com/world/2019/aug/24/its-2019-we-need-to-talk-about-why-most-bras-are-still-so-terrible
Scraping URL: https://www.theguardian.com/world/video/2019/aug/27/macron-and-bolsonaros-war-of-words-over-amazon-fires-aid-and-their-wives-video-report
Scraping URL: https://www.theguardian.com/world/gallery/2019/aug/27/amazon-rainforest-fires-an-environmental-catastrophe-in-pictures
Scraping URL: https://www.theguardian.com/us-news/video/2019/aug/26/donald-trump-pitches-his-own-miami-resort-as-next-g7-venue-video
Scraping URL: https://www.theguardian.com/us-news/video/2019/aug/26/donald-trump-calls-iran-number-one-nation-of-terror-vid

Scraping URL: https://www.theguardian.com/world/2019/aug/10/bahrain-activist-jailed-for-criticising-grand-prix-released
Scraping URL: https://www.theguardian.com/stage/2019/aug/27/jordan-brookes-winner-edinburgh-comedy-award-interview-2019
Scraping URL: https://www.theguardian.com/guardian-masterclasses
Scraping URL: https://www.theguardian.com/music/2019/aug/16/eugene-onegin-review-edinburgh-festival-theatre-2019-asmik-grigorian
Scraping URL: https://www.theguardian.com/sport/nfl
Scraping URL: https://www.theguardian.com/environment/wildlife
Scraping URL: https://www.theguardian.com/uk/media
Scraping URL: https://www.theguardian.com/sport/cricket/2019/aug/27/all
Scraping URL: https://www.theguardian.com/sport/afl
Scraping URL: https://www.theguardian.com/lifeandstyle/series/sudoku
Scraping URL: https://www.theguardian.com/business/economics
Scraping URL: https://www.theguardian.com/business/banking
Scraping URL: https://www.theguardian.com/index/contributors/1-9
Scraping URL: https://

Scraping URL: https://www.theguardian.com/music/2019/aug/27/gary-crosby-godfather-british-jazz-renaissance-bassist-ella-fitzgerald
Scraping URL: https://www.theguardian.com/weekly?INTCMP=gdnwb_mawns_editorial_gweekly_GW_TopNav_UK
Scraping URL: https://www.theguardian.com/music/2019/aug/15/why-has-the-opera-world-rallied-round-placido-domingo
Scraping URL: https://www.theguardian.com/fashion/2019/aug/27/russell-tovey-pink-is-the-opposite-of-emasculating-not-shy-about-wearing
Scraping URL: https://www.theguardian.com/sport/nhl
Scraping URL: https://www.theguardian.com/us/film
Scraping URL: https://www.theguardian.com/environment/2019/aug/27/a-record-hot-summer-burned-the-first-fruit-of-my-apple-tree-and-left-a-bad-taste-in-my-mouth
Scraping URL: https://www.theguardian.com/sport/2019/aug/27/all-blacks-hope-beauden-barrett-gamble-pays-off-to-avoid-repeat-of-history
Scraping URL: https://www.theguardian.com/books/audio/2019/aug/27/elif-shafak-on-turkeys-treatment-of-novelists-books-podcast

Scraping URL: https://www.theguardian.com/books/2019/aug/22/richard-booth-obituary
Scraping URL: https://www.theguardian.com/politics/2019/jan/07/shaun-bailey-accused-of-worst-kind-of-casual-sexism-and-misogyny-2007-interview-abortion
Scraping URL: https://www.theguardian.com/au/lifeandstyle/relationships
Scraping URL: https://www.theguardian.com/commentisfree/video/2019/aug/14/ibram-x-kendi-racist-ideas-have-always-been-murderous
Scraping URL: https://www.theguardian.com/membership/2019/aug/27/the-inside-story-of-guardian-cities-a-new-kind-of-journalism
Scraping URL: https://www.theguardian.com/stage/2019/aug/26/edinburgh-rightwing-comedians-titania-mcgrath-brexit-party-edinburgh-fringe
Scraping URL: https://www.theguardian.com/business/2019/aug/27/dont-be-fooled-this-is-a-false-dawn-for-the-property-market
Scraping URL: https://www.theguardian.com/index/contributors/g
Scraping URL: https://www.theguardian.com/artanddesign/2019/aug/26/baroque-petworth-beauties-to-have-their-severed-le

Scraping URL: https://www.theguardian.com/crosswords/crossword-blog
Scraping URL: https://www.theguardian.com/stage/2019/aug/16/timmy-creed-spliced-edinburgh-festival-hurling
Scraping URL: https://www.theguardian.com/index/contributors/i
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/26/women-orgasm-sexism-pleasure-convict-rape
Scraping URL: https://www.theguardian.com/football/competitions
Scraping URL: https://www.theguardian.com/artanddesign/2019/aug/24/excellent-essex-gillian-darley-in-praise-of-britains-most-misunderstood-county
Scraping URL: https://www.theguardian.com/football/2019/aug/27/crewe-aston-villa-carabao-cup-match-report
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/08/trump-wanted-gamers-to-support-him-now-hes-blaming-them-for-gun-massacres
Scraping URL: https://www.theguardian.com/sport/2019/aug/26/serena-williams-maria-sharapova-us-open-first-round
Scraping URL: https://www.theguardian.com/sport/2019/aug/24/astana-vuelta-migeul-a

Scraping URL: https://www.theguardian.com/food/2019/aug/25/hetty-mckinnons-red-lentil-dahl-with-beetroot-raita-and-carrot-salad-recipe
Scraping URL: https://www.theguardian.com/crosswords/series/weekend-crossword
Scraping URL: https://www.theguardian.com/index/contributors/k
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/15/ray-mears-burglar-adrian-chiles-doppelgangers
Scraping URL: https://www.theguardian.com/lifeandstyle/2019/aug/13/lawyer-carrie-goldberg-online-harassment-revenge-porn
Scraping URL: https://www.theguardian.com/lifeandstyle/2019/aug/25/dear-mariella-frostrup-all-my-friends-have-babies
Scraping URL: https://www.theguardian.com/commentisfree/2019/aug/25/the-guardian-view-on-easier-gcses-at-private-schools-insult-added-to-injury
Scraping URL: https://www.theguardian.com/sport/tennis/2019/aug/26/all
Scraping URL: https://www.theguardian.com/sport/2019/aug/26/johanna-konta-wobble-us-open-tennis
Scraping URL: https://www.theguardian.com/tv-and-radio/2019/a


KeyboardInterrupt

