# RSS Article Batch Download Examples

This notebook demonstrates how to batch download RSS articles from S3.

In [1]:
import os
import pandas as pd
from time import time


In [2]:
# Root imports

from src.search.batch import S3BatchDownloader
from dotenv import load_dotenv

# Load variables from a .env file into the process environment. With
# override=True, values from the file replace variables that are already set.
load_dotenv(override=True)

# Shared downloader instance used by the download cells below.
# NOTE(review): constructed with no arguments — presumably it reads its S3
# settings from the environment loaded above; confirm in src.search.batch.
downloader = S3BatchDownloader()


## Verify S3 List Access

In [3]:
import boto3
from botocore.exceptions import ClientError

s3 = boto3.client('s3')

# Probe ListObjectsV2 on the configured bucket so that a credentials or
# configuration problem surfaces here rather than partway through a long download.
bucket_name = os.getenv("S3_BUCKET_NAME")
if not bucket_name:
    # Passing Bucket=None makes botocore raise ParamValidationError, which is
    # not a ClientError and would escape the handler below as a raw traceback.
    print("List permission: Unknown (S3_BUCKET_NAME is not set)")
else:
    try:
        response = s3.list_objects_v2(Bucket=bucket_name)
    except ClientError as e:
        # ClientError covers every service-side failure (missing bucket,
        # expired token, ...), not only authorization errors, so report the
        # actual error code instead of assuming "Denied".
        error = e.response.get("Error", {})
        error_code = error.get("Code", "Unknown")
        error_message = error.get("Message", "")
        if error_code == "AccessDenied":
            print(f"List permission: Denied ({error_message})")
        else:
            print(f"List permission: Not verified ({error_code}: {error_message})")
    else:
        print("List permission: Allowed")


List permission: Allowed


In [4]:
# Batch 1 -> November-1.csv
# FIXME: Fix the error where data can't be collected before the date you started collecting.
start_date = "2024-11-17"
end_date = "2024-11-22"

# Alternative target: "consolidated_data.json" together with file_format="json".
output_path = "November-1.csv"

# Time only the download itself and report it once the file is written.
start = time()
downloader.download_to_file(
    output_path=output_path,
    file_format="csv",
    start_date=start_date,
    end_date=end_date,
)
elapsed = time() - start

print(f"Downloaded data to {output_path} in {elapsed:.2f} seconds")

Found 59592 objects to process


 53%|█████▎    | 31707/59592 [04:33<6:08:33,  1.26object/s]IOStream.flush timed out
100%|██████████| 59592/59592 [08:38<00:00, 114.83object/s]  


Downloaded data to November-1.csv in 605.81 seconds


In [5]:
# Batch 2 -> November-2.csv
# FIXME: Fix the error where data can't be collected before the date you started collecting.
start_date = "2024-11-23"
end_date = "2024-11-27"

# Alternative target: "consolidated_data.json" together with file_format="json".
output_path = "November-2.csv"

# Time only the download itself and report it once the file is written.
start = time()
downloader.download_to_file(
    output_path=output_path,
    file_format="csv",
    start_date=start_date,
    end_date=end_date,
)
elapsed = time() - start

print(f"Downloaded data to {output_path} in {elapsed:.2f} seconds")

Found 50037 objects to process


 66%|██████▌   | 33067/50037 [04:07<11:49, 23.92object/s]  IOStream.flush timed out
100%|██████████| 50037/50037 [07:11<00:00, 116.06object/s] 


Downloaded data to November-2.csv in 539.74 seconds


In [None]:
start = time()
output_path = "November-3.csv"  # or "consolidated_data.json"

# Define date range
# NOTE(review): start_date was "2024-11-8". That value is not zero-padded like
# the other batches and, read as Nov 8, would re-download the days already
# saved in November-1.csv and November-2.csv (2024-11-17 .. 2024-11-27),
# duplicating rows when the files are aggregated. It is changed here to the day
# after the previous batch's end_date — confirm this is the intended range.
start_date = "2024-11-28" # FIXME: Fix the error where data can't be collected before the date you started collecting.
end_date = "2024-11-30"

# Start downloading
downloader.download_to_file(
    output_path=output_path,
    file_format="csv",  # or "json"
    start_date=start_date,
    end_date=end_date
)

print(f"Downloaded data to {output_path} in {time() - start:.2f} seconds")

# Aggregating