In [77]:
##general

import os


# Each website is a separate project (folder)
def create_project_dir(directory):
    if not os.path.exists(directory):
        print('Creating directory ' + directory)
        os.makedirs(directory)


# Create queue and crawled files (if not created)
def create_data_files(project_name, base_url):
    queue = os.path.join(project_name , 'queue.txt')
    crawled = os.path.join(project_name,"crawled.txt")
    if not os.path.isfile(queue):
        write_file(queue, base_url)
    if not os.path.isfile(crawled):
        write_file(crawled, '')


# Create a new file
def write_file(path, data):
    with open(path, 'w') as f:
        f.write(data)


# Add data onto an existing file
def append_to_file(path, data):
    with open(path, 'a') as file:
        file.write(data + '\n')


# Delete the contents of a file
def delete_file_contents(path):
    open(path, 'w').close()


# Read a file and convert each line to set items
def file_to_set(file_name):
    results = set()
    with open(file_name, 'rt') as f:
        for line in f:
            results.add(line.replace('\n', ''))
    return results


# Iterate through a set, each item will be a line in a file
def set_to_file(links, file_name):
    with open(file_name,"w") as f:
        for l in sorted(links):
            f.write(l+"\n")

In [78]:
##link finder

from html.parser import HTMLParser
from urllib import parse

class LinkFinder(HTMLParser):
    def __init__(self,base_url,page_url):
        super().__init__()
        self.base_url=base_url
        self.page_url=page_url
        self.links=set()
        
    def handle_starttag(self,tag,attrs):
        if tag=='a':
            for (attribute,value) in attrs:
                if attribute == 'href':
                    url=parse.urljoin(self.base_url,value)
                    self.links.add(url)
    
    def page_links(self):
        return self.links
    
    def error(self,message):
        pass

In [82]:
##spider

from urllib.request import urlopen

class Spider:

    project_name = ''
    base_url = ''
    domain_name = ''
    queue_file = ''
    crawled_file = ''
    queue = set()
    crawled = set()

    def __init__(self, project_name, base_url, domain_name):
        Spider.project_name = project_name
        Spider.base_url = base_url
        Spider.domain_name = domain_name
        Spider.queue_file = Spider.project_name + '/queue.txt'
        Spider.crawled_file = Spider.project_name + '/crawled.txt'
        self.boot()
        self.crawl_page('First spider', Spider.base_url)

    # Creates directory and files for project on first run and starts the spider
    @staticmethod
    def boot():
        create_project_dir(Spider.project_name)
        create_data_files(Spider.project_name, Spider.base_url)
        Spider.queue = file_to_set(Spider.queue_file)
        Spider.crawled = file_to_set(Spider.crawled_file)

    # Updates user display, fills queue and updates files
    @staticmethod
    def crawl_page(thread_name, page_url):
        if page_url not in Spider.crawled:
            print(thread_name + ' now crawling ' + page_url)
            print('Queue ' + str(len(Spider.queue)) + ' | Crawled  ' + str(len(Spider.crawled)))
            Spider.add_links_to_queue(Spider.gather_links(page_url))
            Spider.queue.remove(page_url)
            Spider.crawled.add(page_url)
            Spider.update_files()

    # Converts raw response data into readable information and checks for proper html formatting
    @staticmethod
    def gather_links(page_url):
        html_string = ''
        try:
            response = urlopen(page_url)
            if 'text/html' in response.getheader('Content-Type'):
                html_bytes = response.read()
                html_string = html_bytes.decode("utf-8")
            finder = LinkFinder(Spider.base_url, page_url)
            finder.feed(html_string)
        except Exception as e:
            print(str(e))
            return set()
        return finder.page_links()

    # Saves queue data to project files
    @staticmethod
    def add_links_to_queue(links):
        for url in links:
            if (url in Spider.queue) or (url in Spider.crawled):
                continue
            if Spider.domain_name != get_domain_name(url):
                continue
            Spider.queue.add(url)

    @staticmethod
    def update_files():
        set_to_file(Spider.queue, Spider.queue_file)
        set_to_file(Spider.crawled, Spider.crawled_file)
    
    
    
    

In [83]:
##domain

from urllib.parse import urlparse


# Get domain name (example.com)
def get_domain_name(url):
    try:
        results = get_sub_domain_name(url).split('.')
        return results[-2] + '.' + results[-1]
    except:
        return ''


# Get sub domain name (name.example.com)
def get_sub_domain_name(url):
    try:
        return urlparse(url).netloc
    except:
        return ''

In [96]:
##main
import threading
from queue import Queue

PROJECT_NAME = 'blackcoffer'
base_url = 'https://www.blackcoffer.com/'
DOMAIN_NAME = get_domain_name(base_url)
QUEUE_FILE = PROJECT_NAME + '/queue.txt'
CRAWLED_FILE = PROJECT_NAME + '/crawled.txt'
NUMBER_OF_THREADS = 8
queue = Queue()
Spider(PROJECT_NAME, base_url, DOMAIN_NAME)

# Create worker threads (will die when main exits)
def create_workers():
    for _ in range(NUMBER_OF_THREADS):
        t = threading.Thread(target=work)
        t.daemon = True
        t.start()


# Do the next job in the queue
def work():
    while True:
        url = queue.get()
        Spider.crawl_page(threading.current_thread().name, url)
        queue.task_done()


# Each queued link is a new job
def create_jobs():
    for link in file_to_set(QUEUE_FILE):
        queue.put(link)
    queue.join()
    crawl()


# Check if there are items in the queue, if so crawl them
def crawl():
    queued_links = file_to_set(QUEUE_FILE)
    if len(queued_links) > 0:
        print(str(len(queued_links)) + ' links in the queue')
        create_jobs()


create_workers()
crawl()

Creating directory blackcoffer
First spider now crawling https://www.blackcoffer.com/
Queue 1 | Crawled  0
27 links in the queue
Thread-14 now crawling https://www.blackcoffer.com/staff.htmlThread-21 now crawling https://www.blackcoffer.com/about.html
Thread-16 now crawling https://www.blackcoffer.com/industries.htmlQueue 27 | Crawled  1Thread-17 now crawling https://www.blackcoffer.com/index.html

Thread-20 now crawling https://qna.blackcoffer.com/Queue 27 | Crawled  1
Queue 27 | Crawled  1
Thread-18 now crawling https://www.blackcoffer.com/information.html
Queue 27 | Crawled  1
Thread-15 now crawling https://www.blackcoffer.com/research.html
Queue 27 | Crawled  1
Queue 27 | Crawled  1


Thread-19 now crawling https://www.blackcoffer.com/healthcare.html

Queue 27 | Crawled  1Queue 27 | Crawled  1

HTTP Error 403: Forbidden
Thread-20 now crawling https://insights.blackcoffer.com/
Queue 26 | Crawled  3
Thread-21 now crawling https://www.blackcoffer.com/bigdata.html
Queue 26 | Crawled  3

Thread-15 now crawling https://freelance.blackcoffer.com/job/5-mailchimp-template/
Queue 288 | Crawled  74
Thread-19 now crawling https://jobs.blackcoffer.com/term-and-conditions/
Queue 287 | Crawled  75
Thread-14 now crawling https://jobs.blackcoffer.com/job/?keyword=data+science
Thread-16 now crawling https://blackcoffer.com/request_demo.php
Queue 383 | Crawled  77Queue 383 | Crawled  77

Thread-15 now crawling https://jobs.blackcoffer.com/contact-us/
Queue 383 | Crawled  78
Thread-21 now crawling https://jobs.blackcoffer.com/job/uxui-designer/
Queue 387 | Crawled  79
Thread-16 now crawling https://freelance.blackcoffer.com/job/5-full-stack-development-react-js-node-js/
Queue 386 | Crawled  80
Thread-19 now crawling https://jobs.blackcoffer.com/lost-password/
Queue 385 | Crawled  81
Thread-17 now crawling https://jobs.blackcoffer.com/job/?keyword=linux
Queue 405 | Crawled  82
Thread-15 now crawling https://jobs.blackcoffer.com/job/front-end-engineer/
Queue 404 | Crawled  83
Thread-16

Thread-14 now crawling https://freelance.blackcoffer.com/Terms%20of%20Use
Queue 470 | Crawled  148
Thread-16 now crawling https://freelance.blackcoffer.com/employers-dashboard/
Queue 469 | Crawled  149
Thread-18 now crawling https://freelance.blackcoffer.com/how-it-works/
Queue 469 | Crawled  150
Thread-20 now crawling https://www.blackcoffer.com/CaseStudies/Finance/Blackcoffer Fraud Analytics.pdf
Queue 470 | Crawled  152
Thread-19 now crawling https://freelance.blackcoffer.com/job/5-sql-coding/
Queue 470 | Crawled  152
HTTP Error 400: Bad Request
Thread-20 now crawling https://freelance.blackcoffer.com/contact/
Queue 469 | Crawled  153
Thread-16 now crawling https://blackcoffer.com/products.html
Queue 468 | Crawled  154
Thread-19 now crawling https://freelance.blackcoffer.com/browse-project-categories/
Queue 467 | Crawled  155
Thread-17 now crawling https://jobs.blackcoffer.com/job/?keyword=developerThread-18 now crawling https://freelance.blackcoffer.com/Thread-20 now crawling https:

KeyboardInterrupt: 