### Part 1: AWS S3 & Sourcing Datasets
Republish this open dataset in Amazon S3 and share with us a link.
You may run into 403 Forbidden errors as you test accessing this data. There is a way to comply with the BLS data access policies and re-gain access to fetch this data programatically - we have included some hints as to how to do this at the bottom of this README in the Q/A section.

# Required sub-steps to accomplisht this:
1. Set up S3 bucket environmnet
2. Be able to read and publish into S3 env. 
3. troubleshoot error

- Key notes: This is DataLake design. Create a landing-zone where data will be uploaded and parsed under upload date. The goal is just to have a starting point to being bringing in data into s3.

In [1]:
import boto3
import os
from dotenv import load_dotenv

In [2]:
load_dotenv('.env')

True

In [7]:
access_key = os.getenv('AWS_ACCESS_KEY')
secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')

In [8]:
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

In [9]:
response = s3_client.list_buckets()

if 'Buckets' in response:
    buckets = response['Buckets']
    for bucket in buckets:
        print(bucket['Name'])

2021-04-02-ep-website
aws-emr-resources-386175835981-us-east-2
aws-logs-386175835981-us-east-2
bigbatbucket
canvas-bucket-iris-4123
dbdatalocation
ed-exp-cost-and-usage
eddysfistbuck
edwardplatagschoolcap
eplatacapstonedata
eplatacapstoneipynb
qep-sports-betting-bucket
rearc-datalake-bucket
redditdatacollectionwemeta
sagemaker-soln-ddf-js-2ruwg4-386175835981-us-east-1
sagemaker-soln-ddf-js-2seloa-386175835981-us-east-1
sagemaker-soln-ddf-js-2sf3s6-386175835981-us-east-1
sagemaker-soln-ddf-js-44xdya-386175835981-us-east-1
sagemaker-soln-documents-js-4htc2a-us-east-1-386175835981
sagemaker-studio-386175835981-l4ayz3cscdq
sagemaker-studio-386175835981-l9tzph12na
sagemaker-studio-386175835981-zzqac2052o
sagemaker-us-east-2-386175835981
someonesbucket
ss-discord-group-minecraft-bucket


We have access to the s3 bucekt

#### Part 1: AWS S3 & Sourcing Datasets
1. Republish [this open dataset](https://download.bls.gov/pub/time.series/pr/) in Amazon S3 and share with us a link.
    - You may run into 403 Forbidden errors as you test accessing this data. There is a way to comply with the BLS data access policies and re-gain access to fetch this data programatically - we have included some hints as to how to do this at the bottom of this README in the Q/A section.
2. Script this process so the files in the S3 bucket are kept in sync with the source when data on the website is updated, added, or deleted.
    - Don't rely on hard coded names - the script should be able to handle added or removed files.
    - Ensure the script doesn't upload the same file more than once.

In [12]:
import os
import requests
import boto3

# AWS credentials
access_key = os.getenv('AWS_ACCESS_KEY')
secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# S3 bucket and landing zone details
bucket_name = 'rearc-datalake-bucket'
landing_zone_prefix = 'landing-zone/'

# Create an S3 client
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# Check if the landing zone exists, create it if it doesn't
response = s3_client.list_objects_v2(
    Bucket=bucket_name,
    Prefix=landing_zone_prefix
)

if 'Contents' not in response:
    s3_client.put_object(
        Bucket=bucket_name,
        Key=landing_zone_prefix
    )
    print(f"Landing zone '{landing_zone_prefix}' created in bucket '{bucket_name}'.")

# Fetch data from the provided link and upload to S3 landing zone
url = 'https://download.bls.gov/pub/time.series/pr/'
response = requests.get(url)

if response.status_code == 200:
    files = response.text.split('\n')
    for file in files:
        if file.endswith('.txt'):
            file_name = file.split('/')[-1]
            s3_client.put_object(
                Bucket=bucket_name,
                Key=f"{landing_zone_prefix}{file_name}",
                Body=requests.get(f"{url}{file}").content
            )
            print(f"Uploaded '{file_name}' to landing zone.")


In [1]:
import requests

url = 'https://download.bls.gov/pub/time.series/pr/'
response = requests.get(url)

if response.status_code == 200:
    files = response.text.split('\n')
    for file in files:
        print(file)


In [3]:
print(response)

<Response [403]>


In [6]:
import requests
from bs4 import BeautifulSoup
import re
import os

def fetch_front_page_data(url):
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Fetch the content from the URL
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from {url} with status code {response.status_code}")

    # Parse the HTML to extract file details
    soup = BeautifulSoup(response.content, 'html.parser')
    file_data = []

    # Regular expression to match the file details
    file_info_pattern = re.compile(r'(\d{1,2}/\d{1,2}/\d{4})\s+(\d{1,2}:\d{2} [APM]{2})\s+([\d,]+) (.+)')

    for line in soup.get_text().split('\n'):
        match = file_info_pattern.search(line)
        if match:
            data = {
                'update_date': match.group(1),
                'update_time': match.group(2),
                'size': match.group(3),
                'data': match.group(4),
                'data_link': os.path.join(url, match.group(4))
            }
            file_data.append(data)

    return file_data

# Example usage
front_page_data = fetch_front_page_data("https://download.bls.gov/pub/time.series/pr/")
front_page_data


[{'update_date': '12/6/2023',
  'update_time': '8:30 AM',
  'size': '102',
  'data': 'pr.class 9/13/2022  3:52 PM          562 pr.contacts 12/6/2023  8:30 AM      1477682 pr.data.0.Current 12/6/2023  8:30 AM      3101899 pr.data.1.AllData 12/6/2023  8:30 AM          176 pr.duration 12/6/2023  8:30 AM           40 pr.footnote 12/6/2023  8:30 AM          745 pr.measure  1/7/1994  2:53 PM          146 pr.period11/18/2011  3:05 PM           79 pr.seasonal 12/6/2023  8:30 AM          263 pr.sector 12/6/2023  8:30 AM        15657 pr.series11/17/2011  4:11 PM        18343 pr.txt',
  'data_link': 'https://download.bls.gov/pub/time.series/pr/pr.class 9/13/2022  3:52 PM          562 pr.contacts 12/6/2023  8:30 AM      1477682 pr.data.0.Current 12/6/2023  8:30 AM      3101899 pr.data.1.AllData 12/6/2023  8:30 AM          176 pr.duration 12/6/2023  8:30 AM           40 pr.footnote 12/6/2023  8:30 AM          745 pr.measure  1/7/1994  2:53 PM          146 pr.period11/18/2011  3:05 PM           79 p

In [10]:
import pandas as pd

df = pd.DataFrame(front_page_data)
df.head()


Unnamed: 0,update_date,update_time,size,data,data_link
0,12/6/2023,8:30 AM,102,pr.class 9/13/2022 3:52 PM 562 pr.co...,https://download.bls.gov/pub/time.series/pr/pr...


In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_front_page_data_df(url):
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Fetch the content from the URL
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from {url} with status code {response.status_code}")

    # Parse the HTML to extract file details
    soup = BeautifulSoup(response.content, 'html.parser')
    file_data = []

    # Extract each file entry
    for line in soup.find_all('a'):
        line_text = line.find_previous('br').next_sibling.strip()
        if line_text:
            parts = line_text.split()
            if len(parts) >= 3:
                update_date = parts[0]
                update_time = parts[1] + ' ' + parts[2]
                size = parts[3]
                data = line.text
                data_link = url + line['href']

                file_info = {
                    'update_date': update_date,
                    'update_time': update_time,
                    'size': size,
                    'data': data,
                    'data_link': data_link
                }
                file_data.append(file_info)

    # Convert to DataFrame
    return pd.DataFrame(file_data)

# Fetching and displaying data in DataFrame format
front_page_data_df = fetch_front_page_data_df("https://download.bls.gov/pub/time.series/pr/")
front_page_data_df.head()  # Displaying first few rows for brevity


Exception: Failed to fetch data from https://download.bls.gov/pub/time.series/pr/ with status code 503

In [12]:
import os
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd

def fetch_front_page_data_df(url):
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch data from {url} with status code {response.status_code}")

    # Save the HTML content
    today = datetime.today().strftime('%Y_%m_%d')
    filename = f'downloaded_data/front_page_{today}.html'
    if not os.path.exists(filename):
        with open(filename, 'w') as f:
            f.write(response.text)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    data = {}  # replace this with your parsing logic

    return pd.DataFrame(data)

In [13]:
df = fetch_front_page_data_df(url)

Exception: Failed to fetch data from https://download.bls.gov/pub/time.series/pr/ with status code 503

In [None]:
#Lets move to local