# Data Acquisition

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import os, re, csv
from acquire import *

## Codeup Blog Articles

1. Scrape the article text from the following pages:

> - https://codeup.com/codeups-data-science-career-accelerator-is-here/
> - https://codeup.com/data-science-myths/
> - https://codeup.com/data-science-vs-data-analytics-whats-the-difference/
> - https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/
> - https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/

Encapsulate your work in a function named `get_blog_articles` that will return a list of dictionaries, with each dictionary representing one article. The shape of each dictionary should look like this:

``` python
{
    'title': 'the title of the article',
    'content': 'the full text content of the article'
}
```

In [2]:
# The five blog posts listed in the exercise statement above.
urls = [
    'https://codeup.com/codeups-data-science-career-accelerator-is-here/',
    'https://codeup.com/data-science-myths/',
    'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/',
    'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
    'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/',
]

# Kept at notebook scope; note that it is not passed to get_blog_article in this cell.
headers = {'User-Agent': 'Codeup Data Science'}

# One scraped result per URL, in the same order as `urls`.
articles = [get_blog_article(article_url) for article_url in urls]

articles

[{'url': 'https://codeup.com/codeups-data-science-career-accelerator-is-here/',
  'title': 'Page not found - Codeup',
  'content': 'Oops!(210) 802–7289Why Codeup? Financial Aid Options Student Stories Employers Refund PolicyFAQsCareersMedia KitFull Stack Web Development Data Science Cyber Cloud Systems Engineering Free Networking Course© 2013-2021 Copyright. Privacy Policy  |  Complaint Policy  |  Inclusion | Accessibility | Sitemap',
  'phone_numbers': ['2108027289'],
  'date': [],
  'author': [],
  'copyright': ['2013-2021']},
 {'url': 'https://codeup.com/data-science-myths/',
  'title': 'Page not found - Codeup',
  'content': 'Oops!(210) 802–7289Why Codeup? Financial Aid Options Student Stories Employers Refund PolicyFAQsCareersMedia KitFull Stack Web Development Data Science Cyber Cloud Systems Engineering Free Networking Course© 2013-2021 Copyright. Privacy Policy  |  Complaint Policy  |  Inclusion | Accessibility | Sitemap',
  'phone_numbers': ['2108027289'],
  'date': [],
  'aut

In [3]:
def fetch_all_urls(target: str, site_keyword: str = 'codeup') -> pd.DataFrame:
    '''Crawl outward from ``target`` and collect every reachable URL on the site.

    Starting at ``target``, each page is downloaded, its anchor tags are read,
    and every not-yet-seen link containing ``site_keyword`` is requested once.
    Links answering with HTTP 200 are kept and crawled in turn. The crawl ends
    when every kept URL has been parsed.

    The result is cached in the working directory as
    ``<scheme+host, alphanumerics only>_valid_urls.csv``; when that file
    already exists it is loaded and returned without any network access.

    Parameters
    ----------
    target : str
        Absolute URL (scheme and host required) where the crawl starts. It is
        always included in the result and is not status-checked itself.
    site_keyword : str, default 'codeup'
        Substring a link must contain to be followed.

    Returns
    -------
    pandas.DataFrame
        One row per valid URL in a single column named ``'http'``.

    Raises
    ------
    ValueError
        If ``target`` has no scheme or host.
    Exception
        Whatever ``get`` raises while downloading a page that is being parsed
        (failures while merely *testing* a link are swallowed and the link is
        skipped).
    '''
    # Function-scope imports keep this cell self-contained in the notebook.
    from collections import deque
    from urllib.parse import urldefrag, urljoin, urlparse

    headers = {'User-Agent': 'Codeup Data Science'}

    def extract_links(page_url):
        '''Return the absolute, fragment-free http(s) links found on page_url.'''
        print(f'Parsing {page_url}')
        response = get(page_url, headers=headers)
        # Name the parser explicitly so results do not depend on what is installed.
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                # Resolve relative hrefs against the page they appear on and
                # drop '#fragment' so one page is not recorded under many names.
                link = urldefrag(urljoin(page_url, anchor['href'].strip())).url
                scheme = urlparse(link).scheme
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal): nothing to follow.
                continue
            # mailto:, tel:, javascript: etc. can never be fetched.
            if scheme in ('http', 'https'):
                links.append(link)
        return links

    def is_reachable(url):
        '''Return True when a GET on url answers with status code 200.'''
        print('Testing', url)
        try:
            return get(url, headers=headers).status_code == 200
        except Exception:
            # Best effort: connection errors simply mark the link as unusable.
            return False

    # Derive scheme://host with a real URL parser; a greedy regex would also
    # swallow path segments such as '/competitor-...' (contains 'com').
    parsed_target = urlparse(target)
    if not parsed_target.scheme or not parsed_target.netloc:
        raise ValueError(f'target must be an absolute URL, got {target!r}')
    base = f'{parsed_target.scheme}://{parsed_target.netloc}'

    # Keep only the alphanumerics so the base is safe to use as a file name.
    filename = re.sub(r'[^a-zA-Z0-9]', '', base) + '_valid_urls.csv'
    if os.path.exists(filename):
        return pd.read_csv(filename, index_col=[0])

    valid_urls = [target]
    # Every URL ever considered (valid or not), so nothing is requested twice.
    seen = {target}
    # Valid pages whose links still have to be read.
    pending = deque([target])

    while pending:
        page_url = pending.popleft()
        print(f'Trying {page_url}')
        for link in extract_links(page_url):
            if link in seen or site_keyword not in link:
                continue
            seen.add(link)
            if is_reachable(link):
                valid_urls.append(link)
                pending.append(link)

    result = pd.DataFrame([{'http': url} for url in valid_urls])
    result.to_csv(filename)
    return result

## Bonus: Scrape All Codeup Blog Articles

In [11]:
# URL handed to get_all_blog_articles (presumably provided by the star import
# from acquire; its behavior is not visible in this notebook).
blog_index_url = 'https://codeup.com/blog/'
codeup_blog_df = get_all_blog_articles(blog_index_url)
codeup_blog_df.head()

array(["['2108027289']", '[]'], dtype=object)