In [15]:
import os

# Point the Google client libraries at a credentials file via the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# working directory; presumably this is a service-account key file — keep it
# out of version control and confirm it is not shared alongside the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "top-suprstate-438620-h9-7a2effbc371a.json"

# Import the Cloud Storage client library (the variable above is already set here)
from google.cloud import storage

# Create a Cloud Storage client with no explicit project or credentials arguments
client = storage.Client()

# Materialize the bucket iterator into a list and print it as a connectivity check
buckets = list(client.list_buckets())
print(buckets)


[<Bucket: gcf-v2-sources-892122457228-us-central1>, <Bucket: gcf-v2-uploads-892122457228-us-central1>, <Bucket: on-prem-ingestion-data>]


In [22]:
import os
import pandas as pd
import time
import random
from google.cloud import storage
from datetime import datetime

# Function to check if a file already exists in GCS
def check_if_blob_exists(bucket_name, file_name):
    """Report whether an object named ``file_name`` is present in a GCS bucket.

    A fresh ``storage.Client`` is created on every call.

    Args:
        bucket_name: Name of the bucket to look in.
        file_name: Object name (including any "folder" prefix) to test.

    Returns:
        Whatever ``Blob.exists()`` returns for that object.
    """
    return storage.Client().bucket(bucket_name).blob(file_name).exists()

# Function to upload chunk of data to GCS
def upload_chunk_to_gcs(local_file, bucket_name, file_name_in_gcs):
    """Upload one local file to a GCS bucket and print a confirmation line.

    A fresh ``storage.Client`` is created on every call.

    Args:
        local_file: Path of the file on local disk to send.
        bucket_name: Name of the destination bucket.
        file_name_in_gcs: Object name to store the file under in the bucket.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(file_name_in_gcs)

    # Send the file contents, then report which object was written
    target_blob.upload_from_filename(local_file)
    print(f'{file_name_in_gcs} uploaded to GCS successfully!')

# Main function for chunking and uploading data
def chunk_and_upload(file_path, bucket_name, chunk_size_min=30000, chunk_size_max=50000, wait_time=15):
    """Split a CSV into randomly sized row chunks and upload each chunk to GCS.

    The whole CSV is read into memory and sliced into consecutive chunks whose
    row counts are drawn with ``random.randint(chunk_size_min, chunk_size_max)``
    (the last chunk holds whatever rows remain). Each chunk is written to a
    temporary CSV in the current working directory, uploaded under the
    ``2019_ingestion/`` prefix through ``upload_chunk_to_gcs``, and the local
    file is removed again even if the upload fails. A chunk whose object name
    already exists in the bucket is skipped without writing anything locally.

    Args:
        file_path: Path of the source CSV file.
        bucket_name: Name of the destination GCS bucket.
        chunk_size_min: Smallest number of rows per chunk; must be >= 1.
        chunk_size_max: Largest number of rows per chunk; must be >= chunk_size_min.
        wait_time: Seconds to pause between consecutive chunks (no pause is
            made after the final chunk).

    Raises:
        ValueError: If the bounds do not satisfy 1 <= chunk_size_min <= chunk_size_max.
    """
    # A chunk size of zero (or less) would never advance `start`, so reject it up front.
    if chunk_size_min < 1:
        raise ValueError(f"chunk_size_min must be >= 1, got {chunk_size_min}")
    if chunk_size_max < chunk_size_min:
        raise ValueError(
            f"chunk_size_max ({chunk_size_max}) must be >= chunk_size_min ({chunk_size_min})"
        )

    # Load the CSV data
    data = pd.read_csv(file_path)

    total_rows = len(data)
    start = 0
    chunk_num = 1

    # Object-name prefix; GCS shows this as a "folder"
    folder_name_in_gcs = '2019_ingestion/'

    while start < total_rows:
        # Pick this chunk's row count from the configured range
        chunk_size = random.randint(chunk_size_min, chunk_size_max)
        end = min(start + chunk_size, total_rows)

        # Explicit positional slice, independent of the frame's index labels
        data_chunk = data.iloc[start:end]

        # Timestamp (second resolution) plus chunk number keeps names unique
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_name_local = f"2019_data_chunk_{chunk_num}_{timestamp}.csv"  # Save locally
        file_name_in_gcs = f"{folder_name_in_gcs}2019_data_chunk_{chunk_num}_{timestamp}.csv"  # Path in GCS

        # Check first so a skipped chunk costs no local disk I/O
        if check_if_blob_exists(bucket_name, file_name_in_gcs):
            print(f'{file_name_in_gcs} already exists in GCS. Skipping upload.')
        else:
            try:
                data_chunk.to_csv(file_name_local, index=False)
                upload_chunk_to_gcs(file_name_local, bucket_name, file_name_in_gcs)
            finally:
                # Always clean up the temporary file, including when the upload raises
                if os.path.exists(file_name_local):
                    os.remove(file_name_local)

        # Advance to the next chunk
        start = end
        chunk_num += 1

        # Throttle only between chunks; waiting after the last one is pointless
        if start < total_rows:
            time.sleep(wait_time)

    print("All chunks uploaded successfully!")

# Input CSV (a relative path, resolved against the kernel's working directory)
# and the destination GCS bucket
csv_file_path = '2019.csv'  # Update with the actual path
bucket_name = 'on-prem-ingestion-data'  # Replace with your GCS bucket name

# Start chunking and uploading, using the function's default chunk-size range and wait time
chunk_and_upload(csv_file_path, bucket_name)


2019_ingestion/2019_data_chunk_1_20241018_201402.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_2_20241018_201426.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_3_20241018_201452.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_4_20241018_201516.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_5_20241018_201544.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_6_20241018_201616.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_7_20241018_201642.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_8_20241018_201709.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_9_20241018_201738.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_10_20241018_201803.csv uploaded to GCS successfully!
2019_ingestion/2019_data_chunk_11_20241018_201826.csv uploaded to GCS successfully!
All chunks uploaded successfully!


In [24]:
# NOTE(review): looks like a leftover scratch/debug cell — consider removing it before sharing.
print("h")

h
