In [1]:
import os, re, csv
from urllib.parse import urldefrag, urljoin, urlparse

import pandas as pd
from bs4 import BeautifulSoup
from requests import get

from acquire import get_blog_articles

In [9]:
def _extract_links(html: str, page_url: str) -> list:
    '''Return the absolute http(s) links found in the anchors of `html`, in page order.

    html: markup of the page
    page_url: URL the markup was fetched from; relative hrefs are resolved against it
    '''
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    links = []
    for anchor in soup.find_all('a', href=True):
        try:
            # urljoin resolves relative hrefs against the page they were found on;
            # urldefrag drops '#fragment' so in-page anchors do not look like new pages.
            link = urldefrag(urljoin(page_url, anchor['href'].strip())).url
            scheme = urlparse(link).scheme
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal): nothing to crawl.
            continue
        # Skips mailto:, tel:, javascript: and similar non-page links.
        if scheme in ('http', 'https'):
            links.append(link)
    return links


def _probe_page(url: str, headers: dict, timeout: float) -> tuple:
    '''Request `url` once and return (is_valid, links_on_page).

    A URL is valid only when the request succeeds with status 200; links are
    extracted only from valid pages, otherwise an empty list is returned.
    '''
    try:
        response = get(url, headers=headers, timeout=timeout)
    except Exception:
        # Best-effort crawl: any failure (connection error, timeout, bad URL)
        # marks the URL as invalid instead of aborting the crawl.
        return False, []

    if response.status_code != 200:
        return False, []
    return True, _extract_links(response.text, url)


def fetch_all_urls(target: str, site_filter: str = 'codeup', timeout: float = 10) -> tuple:
    '''Crawl outward from `target` and collect every reachable URL that passes the filter.

    target: absolute http(s) URL to start from; it is always the first entry of
        the returned valid list
    site_filter: substring a discovered URL must contain to be followed
        (defaults to 'codeup'); pass '' to follow every link
    timeout: seconds to wait for each request before treating the URL as invalid

    Returns a tuple (valid_urls, invalid_urls):
        valid_urls: `target` followed by every discovered URL that answered 200,
            in discovery order
        invalid_urls: discovered URLs whose request failed or answered with a
            status other than 200

    Raises ValueError when `target` is not an absolute http(s) URL.

    The crawl finishes once every valid URL has had its links examined. Each URL
    is requested at most once.
    '''
    target_parts = urlparse(target)
    if target_parts.scheme not in ('http', 'https') or not target_parts.netloc:
        raise ValueError(f'target must be an absolute http(s) URL, got {target!r}')

    headers = {'User-Agent': 'Codeup Data Science'}

    valid_urls = [target]
    invalid_urls = []
    # Every URL already requested (or queued), so nothing is tested twice.
    seen = {target}
    # Links already extracted while testing a page, keyed by page URL, so a
    # valid page does not have to be downloaded a second time to be parsed.
    links_by_page = {}

    # valid_urls doubles as the work queue: it grows while we walk it, and the
    # crawl is complete when the cursor reaches the end.
    position = 0
    while position < len(valid_urls):
        page_url = valid_urls[position]
        position += 1
        print(f'Trying {page_url}')
        print(f'Parsing {page_url}')

        links = links_by_page.pop(page_url, None)
        if links is None:
            # Only the starting target reaches here: it has not been requested yet.
            _, links = _probe_page(page_url, headers, timeout)

        for link in links:
            if link in seen or site_filter not in link:
                continue
            seen.add(link)

            print('Testing', link)
            is_valid, found_links = _probe_page(link, headers, timeout)
            if is_valid:
                valid_urls.append(link)
                links_by_page[link] = found_links
            else:
                invalid_urls.append(link)

    return valid_urls, invalid_urls

https://codeup.com
Trying https://codeup.com/blog/
Parsing https://codeup.com/blog/
Testing https://codeup.com/
Testing https://codeup.com/program/systems-engineering/
Testing https://codeup.com/program/cyber-cloud/
Testing https://codeup.com/program/full-stack-web-development/
Testing https://codeup.com/program/data-science/
Testing https://codeup.com/financial-aid/
Testing https://codeup.com/events/
Testing https://codeup.com/veterans/
Testing https://codeup.com/hire-tech-talent/
Testing https://codeup.com/resources/
Testing https://codeup.com/my-story/
Testing https://codeup.com/frequently-asked-questions/
Testing https://codeup.com/podcast/
Testing https://codeup.com/apply-now/
Testing https://codeup.com/about-codeup/
Testing https://codeup.com/category/behind-the-billboards/
Testing https://codeup.com/careers/
Testing https://codeup.com/index.php/
Testing https://codeup.com/programs/
Testing https://codeup.com/san-antonio/
Testing https://codeup.com/dallas/
Testing https://codeup.

In [8]:
# Display the list of URLs collected by the crawl.
# NOTE(review): no visible cell assigns a module-level `valid_urls`; presumably it
# comes from an unshown `valid_urls, invalid_urls = fetch_all_urls(...)` call — confirm.
# This cell's execution count (8) is lower than the function cell's (9), so
# re-run the notebook top to bottom before trusting this output.
valid_urls

['https://codeup.com/blog/',
 'https://codeup.com/',
 'https://codeup.com/program/systems-engineering/',
 'https://codeup.com/program/cyber-cloud/',
 'https://codeup.com/program/full-stack-web-development/',
 'https://codeup.com/program/data-science/',
 'https://codeup.com/financial-aid/',
 'https://codeup.com/events/',
 'https://codeup.com/veterans/',
 'https://codeup.com/hire-tech-talent/',
 'https://codeup.com/resources/',
 'https://codeup.com/my-story/',
 'https://codeup.com/frequently-asked-questions/',
 'https://codeup.com/podcast/',
 'https://codeup.com/apply-now/',
 'https://codeup.com/about-codeup/',
 'https://codeup.com/category/behind-the-billboards/',
 'https://codeup.com/careers/',
 'https://codeup.com/index.php/',
 'https://codeup.com/programs/',
 'https://codeup.com/san-antonio/',
 'https://codeup.com/dallas/',
 'https://codeup.com/houston/',
 'https://codeup.com/codeup-news/is-codeup-the-best-bootcamp-in-san-antonio-or-the-world/',
 'https://codeup.com/codeup-news/codeu