In [0]:
%pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [0]:
from google.oauth2 import service_account
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
import io
import os
import pandas as pd

In [0]:
def get_file_id(service, folder_id, filename):
    """Look up the Drive file ID of ``filename`` inside folder ``folder_id``.

    Args:
        service: Drive API client; only ``service.files().list(...).execute()``
            is called on it.
        folder_id: ID of the parent folder to search.
        filename: Exact file name to match.

    Returns:
        The ``id`` of the first file in the listing that matches. If several
        files share the name, the first one returned by the API is used.

    Raises:
        FileNotFoundError: If the listing contains no matching file.
    """
    def _escape(value):
        # Query values are wrapped in single quotes, so a backslash or single
        # quote inside the value must be backslash-escaped; otherwise a name
        # such as "bob's.csv" yields a malformed query.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    query = (
        f"'{_escape(folder_id)}' in parents "
        f"and name='{_escape(filename)}' and trashed=false"
    )
    results = service.files().list(q=query, fields="files(id, name)").execute()
    files = results.get('files', [])
    if not files:
        raise FileNotFoundError(f"{filename} not found in folder {folder_id}")
    return files[0]['id']


In [0]:
# Path to the Google service-account key file. It can be overridden with the
# GDRIVE_SERVICE_ACCOUNT_FILE environment variable so the notebook is not tied
# to one user's workspace folder; the default is the original location.
service_account_file = os.environ.get(
    "GDRIVE_SERVICE_ACCOUNT_FILE",
    "/Workspace/Users/kushagraverma@live.in/Mechanism X/rapid-compound-463822-c8-74c43978101f.json",
)

In [0]:
# Step 1: Set up the Google Drive connection (the folder is already shared
# with the service account).
# The read-only scope is requested because this notebook only lists and
# downloads files. Switch back to ".../auth/drive" if a later cell needs to
# write to Drive through this client.
creds = service_account.Credentials.from_service_account_file(
    service_account_file,
    scopes=["https://www.googleapis.com/auth/drive.readonly"],
)
service = build('drive', 'v3', credentials=creds)

# Step 2: Define the source folder and the names of the files to fetch
folder_id = "1qryhdlgNsmecWRy2haI8S3uC63wKk5X-"
transactions_file = "transactions.csv"
customer_file = "CustomerImportance.csv"
# Step 3: Download both files
def read_csv_from_gdrive(service, file_id):
    """Download a CSV file from Drive and return it as a Spark DataFrame.

    The file is streamed chunk by chunk into an in-memory buffer, parsed with
    pandas, and then converted through the notebook's ``spark`` session.

    Args:
        service: Drive API client; ``service.files().get_media(...)`` is used.
        file_id: ID of the file to download.

    Returns:
        The result of ``spark.createDataFrame`` on the parsed pandas frame.
    """
    media_request = service.files().get_media(fileId=file_id)
    buffer = io.BytesIO()
    chunked_download = MediaIoBaseDownload(buffer, media_request)

    # next_chunk() returns (status, done); keep pulling until done is truthy.
    while True:
        _status, finished = chunked_download.next_chunk()
        if finished:
            break

    # Rewind so pandas reads from the first byte.
    buffer.seek(0)
    return spark.createDataFrame(pd.read_csv(buffer))

# Resolve each file name to its Drive file ID (transactions first, then customers)
transactions_id, customer_id = (
    get_file_id(service, folder_id, name)
    for name in (transactions_file, customer_file)
)

# Download each file and load it as a PySpark DataFrame, in the same order
transactions_df, customer_df = (
    read_csv_from_gdrive(service, file_id)
    for file_id in (transactions_id, customer_id)
)

In [0]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
from datetime import datetime
import time

# === Config ===
# Root of the destination storage container; every output path hangs off it.
base_path = "abfss://gdrive-ingest@devdolphinstorage.dfs.core.windows.net"
# Folder that receives the flat transaction chunk files.
transactions_path = base_path + "/transactions"
# Folder that receives the customer-importance reference data.
reference_path = base_path + "/reference_data/customer_importance"

# === Function to rename part file directly in transactions folder ===
def rename_and_flatten_chunk(temp_dir: str, final_path: str) -> None:
    """Move the single ``part-*.csv`` file in ``temp_dir`` to ``final_path``.

    After a successful move, ``temp_dir`` and everything left in it is
    deleted recursively.

    Args:
        temp_dir: Directory to scan (listed with ``dbutils.fs.ls``).
        final_path: Destination path for the part file.

    Raises:
        FileNotFoundError: ``temp_dir`` contains no ``part-*.csv`` file.
        RuntimeError: ``temp_dir`` contains more than one ``part-*.csv`` file.

    In both error cases nothing is moved or deleted, so the written data can
    still be inspected or recovered.
    """
    part_files = [
        entry
        for entry in dbutils.fs.ls(temp_dir)
        if entry.name.startswith("part-") and entry.name.endswith(".csv")
    ]

    # Validate before touching anything: deleting temp_dir without having
    # moved a part file would silently lose the chunk.
    if not part_files:
        raise FileNotFoundError(f"No part-*.csv file found in {temp_dir}")
    if len(part_files) > 1:
        # Several part files would all be moved onto the same destination.
        raise RuntimeError(
            f"Expected exactly one part-*.csv file in {temp_dir}, found {len(part_files)}"
        )

    dbutils.fs.mv(part_files[0].path, final_path)
    # The recursive delete also removes every other entry, so those entries
    # need no individual removal.
    dbutils.fs.rm(temp_dir, recurse=True)

# === Step 1: Write Reference Data ===
# Overwrite the reference folder with the customer data, header row
# included, after coalescing to a single partition.
(
    customer_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    .csv(reference_path)
)
print(f"Uploaded customer importance data to: {reference_path}")

# === Step 2: Write transactions as flat CSVs directly in /transactions ===
chunk_size = 10000
total_rows = transactions_df.count()
num_chunks = -(-total_rows // chunk_size)  # ceiling division

# Attach a 1-based row number so rows can be sliced into fixed-size chunks.
# NOTE(review): the window has no partitionBy, so the numbering is global;
# confirm this is acceptable for the data size.
window_spec = Window.orderBy(monotonically_increasing_id())
transactions_df = transactions_df.withColumn("row_num", row_number().over(window_spec))

# One iteration per chunk; `start` takes the values 0, chunk_size, 2*chunk_size, ...
for i, start in enumerate(range(0, total_rows, chunk_size)):
    start_time = time.time()
    end = start + chunk_size

    # row_num starts at 1, so chunk i holds the rows with start < row_num <= end.
    in_chunk = (transactions_df["row_num"] > start) & (transactions_df["row_num"] <= end)
    chunk_df = transactions_df.filter(in_chunk).drop("row_num")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    temp_chunk_dir = f"{transactions_path}/tmp_chunk_{i+1}_{timestamp}"
    final_file_path = f"{transactions_path}/chunk_{i+1}_{timestamp}.csv"

    # Write the chunk into a temporary directory first
    chunk_writer = chunk_df.coalesce(1).write.mode("overwrite").option("header", "true")
    chunk_writer.csv(temp_chunk_dir)

    # Then move its part file to the flat final location and drop the temp directory
    rename_and_flatten_chunk(temp_chunk_dir, final_file_path)

    print(f"Uploaded chunk {i+1}/{num_chunks} as {final_file_path}")

    # Pace the loop: sleep only for whatever is left of one second since the
    # iteration started (no extra delay if the upload already took longer).
    elapsed = time.time() - start_time
    sleep_time = 1.0 - elapsed if elapsed < 1.0 else 0
    time.sleep(sleep_time)