In [1]:
import os

# Set the environment variable within the notebook
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "top-suprstate-438620-h9-7a2effbc371a.json"

# Now proceed with Google Cloud Storage client initialization
from google.cloud import storage

# Instantiate a Google Cloud Storage client
client = storage.Client()

# List the buckets in your project
buckets = list(client.list_buckets())
print(buckets)


[<Bucket: gcf-v2-sources-892122457228-us-central1>, <Bucket: gcf-v2-uploads-892122457228-us-central1>, <Bucket: on-prem-ingestion-data>]


In [8]:
import requests
import pandas as pd
from google.cloud import storage
import time
from datetime import datetime

# Set up GCS bucket and folder details
BUCKET_NAME = 'on-prem-ingestion-data'
GCS_FOLDER = '2022_data_ingestion/'  # Folder for 2022 data chunks

# Set up GCS client
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

# Function to fetch data from the API
def fetch_data(page, page_size, api_key):
    url = f"http://127.0.0.1:5000/data?page={page}&page_size={page_size}&api_key={api_key}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()  # Return data in JSON format
    else:
        raise Exception(f"Failed to fetch data from API: {response.status_code}")

# Function to check which chunks have been uploaded to GCS
def get_uploaded_chunks():
    blobs = bucket.list_blobs(prefix=GCS_FOLDER)
    uploaded_pages = set()
    for blob in blobs:
        # Extract the page number from the file name: "data_page_X_YYYYMMDD_HHMMSS.csv"
        filename = blob.name.split("/")[-1]
        if "data_page_" in filename:
            # Extract the page number
            page_num = filename.split("_")[2]  # Get the page number
            uploaded_pages.add(int(page_num))
    return uploaded_pages

# Function to upload data to GCS with a timestamp in the file name
def upload_to_gcs(data, page):
    df = pd.DataFrame(data)
    
    # Create timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Create local file path with timestamp
    file_name_local = f"/tmp/data_page_{page}_{timestamp}.csv"
    
    # Save the DataFrame to a CSV file locally
    df.to_csv(file_name_local, index=False)
    
    # Create the file name with timestamp for GCS
    file_name_gcs = f"{GCS_FOLDER}data_page_{page}_{timestamp}.csv"
    blob = bucket.blob(file_name_gcs)
    blob.upload_from_filename(file_name_local)
    
    print(f"Uploaded {file_name_gcs} to GCS.")

# Ingest data in chunks from the API to GCS
def ingest_data(api_key, total_pages, page_size):
    # Get a set of already uploaded chunks from GCS
    uploaded_pages = get_uploaded_chunks()
    print(f"Uploaded chunks: {uploaded_pages}")

    for page in range(1, total_pages + 1):
        # If this chunk has already been uploaded, skip it
        if page in uploaded_pages:
            print(f"Chunk {page} already exists in GCS. Skipping...")
            continue

        print(f"Fetching page {page}...")
        try:
            data = fetch_data(page, page_size, api_key)
            if data:
                print(f"Uploading page {page} to GCS...")
                upload_to_gcs(data, page)
                
                # Sleep for 3 minutes to simulate real-time ingestion
                print("Waiting for 3 minutes before processing the next chunk...")
                time.sleep(10)
            else:
                print(f"No data found for page {page}. Skipping...")
        except Exception as e:
            print(f"Error processing page {page}: {e}")
            break  # Stop if there is any issue with the current page

# Define parameters
API_KEY = 'Accessgranted@4463'
TOTAL_PAGES = 40  # You can adjust based on the number of records / page_size
PAGE_SIZE = 10000   # Number of records per chunk

# Run the ingestion process
ingest_data(API_KEY, TOTAL_PAGES, PAGE_SIZE)


Uploaded chunks: {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}
Chunk 1 already exists in GCS. Skipping...
Chunk 2 already exists in GCS. Skipping...
Chunk 3 already exists in GCS. Skipping...
Chunk 4 already exists in GCS. Skipping...
Chunk 5 already exists in GCS. Skipping...
Chunk 6 already exists in GCS. Skipping...
Chunk 7 already exists in GCS. Skipping...
Chunk 8 already exists in GCS. Skipping...
Chunk 9 already exists in GCS. Skipping...
Chunk 10 already exists in GCS. Skipping...
Chunk 11 already exists in GCS. Skipping...
Chunk 12 already exists in GCS. Skipping...
Chunk 13 already exists in GCS. Skipping...
Fetching page 14...
Uploading page 14 to GCS...
Uploaded 2022_data_ingestion/data_page_14_20241019_160028.csv to GCS.
Waiting for 3 minutes before processing the next chunk...
Fetching page 15...
Uploading page 15 to GCS...
Uploaded 2022_data_ingestion/data_page_15_20241019_160052.csv to GCS.
Waiting for 3 minutes before processing the next chunk...
Fetching page 16...
Uplo