### Part 1: AWS S3 & Sourcing Datasets
Republish this open dataset in Amazon S3 and share with us a link.
You may run into 403 Forbidden errors as you test accessing this data. There is a way to comply with the BLS data access policies and re-gain access to fetch this data programatically - we have included some hints as to how to do this at the bottom of this README in the Q/A section.

# Required sub-steps to accomplisht this:
1. Set up S3 bucket environmnet
2. Be able to read and publish into S3 env. 
3. troubleshoot error

- Key notes: This is DataLake design. Create a landing-zone where data will be uploaded and parsed under upload date. The goal is just to have a starting point to being bringing in data into s3.

In [13]:
import boto3
import os
from dotenv import load_dotenv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import pandas as pd
import io

In [14]:

def generate_urls(base_url):
    # Set the user agent header
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Send a GET request to the URL with the user agent header
    response = requests.get(base_url, headers=headers)

    if response.status_code == 200:
        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all the <a> tags in the HTML
        links = soup.find_all('a')

        # Extract the href attribute from each <a> tag
        urls = [link.get('href') for link in links if link.get('href')]

        # Prepend the base URL to each URL
        urls = [urljoin(base_url, url) for url in urls]

        return urls
    else:
        print(f"Failed to fetch HTML from {base_url}")
        return []

In [15]:
def fetch_data(urls):
    data = {}
    headers = {'User-Agent': 'Mozilla/5.0'}
    
    for url in urls:
        # Send a GET request to the URL with the user agent header
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            # Get the response content as text
            page_source = response.text
            data[url] = page_source
        else:
            print(f"Failed to fetch data from {url}")
        
        # Add a delay between requests
        time.sleep(1)

    return data

In [16]:
# Load environment variables
load_dotenv()

# Initialize a session using your AWS credentials
session = boto3.Session(
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name="us-east-1"  # or your preferred region
)

# Create an S3 client
s3 = session.client('s3')

def fetch_and_upload_data(urls):
    for url in urls:
        data = fetch_data([url])
        dataframe = pd.DataFrame(data[url].split('\n'))
        dataframe = dataframe[0].str.split('\t', expand=True)

        # Create an in-memory file
        csv_buffer = io.StringIO()

        # Save the DataFrame to the in-memory file
        dataframe.to_csv(csv_buffer, index=False)

        # Define the key for the S3 object
        s3_key = 'landing-zone/' + url.split('/')[-1].replace('.txt', '.csv')

        # Upload the in-memory file to your S3 bucket
        s3.put_object(Body=csv_buffer.getvalue(), Bucket="rearc-datalake-bucket", Key=s3_key)

        # Print a log message
        print(f"Uploaded dataset from {url} to rearc-datalake-bucket/{s3_key}")

In [18]:
# Run the functions
base_url = 'https://download.bls.gov/pub/time.series/pr/'
urls_list = generate_urls(base_url)
# Remove the first one as that is just a repeat of the base URL
urls_list.pop(0)
for i in urls_list:
    print(i)

https://download.bls.gov/pub/time.series/pr/pr.class
https://download.bls.gov/pub/time.series/pr/pr.contacts
https://download.bls.gov/pub/time.series/pr/pr.data.0.Current
https://download.bls.gov/pub/time.series/pr/pr.data.1.AllData
https://download.bls.gov/pub/time.series/pr/pr.duration
https://download.bls.gov/pub/time.series/pr/pr.footnote
https://download.bls.gov/pub/time.series/pr/pr.measure
https://download.bls.gov/pub/time.series/pr/pr.period
https://download.bls.gov/pub/time.series/pr/pr.seasonal
https://download.bls.gov/pub/time.series/pr/pr.sector
https://download.bls.gov/pub/time.series/pr/pr.series
https://download.bls.gov/pub/time.series/pr/pr.txt


In [33]:
from typing import Optional, List
import logging

In [34]:
import boto3
import os

def fetch_and_upload_data(bucket_name: str, urls_list: List[str]) -> None:
    # Initialize the S3 client directly
    s3 = boto3.client(
        's3',
        aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
        aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
        region_name=os.getenv('AWS_REGION')
    )

    for url in urls_list:
        data = fetch_data([url])
        if data:  # Check if data is not empty
            for url, content in data.items():
                dataframe = pd.DataFrame(content.split('\n'))
                dataframe = dataframe[0].str.split('\t', expand=True)

                # Create an in-memory file
                csv_buffer = io.StringIO()

                # Save the DataFrame to the in-memory file
                dataframe.to_csv(csv_buffer, index=False)

                # Define the key for the S3 object
                s3_key = 'landing-zone/' + url.split('/')[-1].replace('.txt', '.csv')

                # Upload the in-memory file to your S3 bucket
                s3.put_object(Body=csv_buffer.getvalue(), Bucket=bucket_name, Key=s3_key)

                # Log the upload
                logging.info(f"Uploaded dataset from {url} to {bucket_name}/{s3_key}")
        else:
            logging.error(f"Failed to fetch data from {url}")


In [37]:
def run(base_url: str, bucket_name: str):
    # Initialize boto3 session
    load_dotenv()  # load environment variables
    aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    region_name = "us-east-1"

    # Debug: Print out the AWS configurations (remove sensitive info)
    print("AWS Access Key:", aws_access_key_id)
    print("AWS Secret Access Key:", aws_secret_access_key)
    print("AWS Region:", region_name)

    session = boto3.Session(
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
        region_name=region_name
    )

    # Generate URLs
    urls_list = generate_urls(base_url)

    # Fetch and upload data
    if urls_list:
        fetch_and_upload_data(session, bucket_name, urls_list)
    else:
        logging.error("No URLs generated to fetch data.")


In [38]:
base_url = 'https://download.bls.gov/pub/time.series/pr/'
bucket_name = 'rearc-datalake-bucket'
run(base_url, bucket_name)

AWS Access Key: None
AWS Secret Access Key: PEHWJQpJcnXlN4U62L8XFblS8s4FwvYxv+xGO08+
AWS Region: us-east-1


TypeError: sequence item 0: expected str instance, NoneType found