In [None]:
! pip install google-analytics-data azure-storage-file-datalake pandas python-dotenv

In [None]:
# -----------------------------------------------
# 📥 GA4 ➝ ADLS Gen2 Ingestion Script (VS Code)
# -----------------------------------------------

from google.oauth2 import service_account
from google.analytics.data_v1beta import BetaAnalyticsDataClient
from google.analytics.data_v1beta.types import RunReportRequest, DateRange, Dimension, Metric
from azure.storage.filedatalake import DataLakeServiceClient
from io import StringIO
import pandas as pd
import os
from dotenv import load_dotenv

load_dotenv(".env")  # Load environment variables from .env file

# -----------------------------------------------
# 1. Setup: Credentials and Config
# -----------------------------------------------

# 👇 Replace with your actual file and values
GA4_PROPERTY_ID = "489632671"  # GA4 Property ID
CREDENTIALS_FILE = "credentials.json"  # Path to your service account key
ADLS_ACCOUNT_NAME = os.getenv("ADLS_ACCOUNT_NAME")  # ADLS Gen2 account name
ADLS_ACCOUNT_KEY = os.getenv("ADLS_ACCOUNT_KEY")  # ADLS Gen2 account key
ADLS_FILE_SYSTEM = "ga4-data"  # Name passed to get_file_system_client()
ADLS_FILE_PATH = "ga4/ga4_ingestion_output.csv"  # Destination path inside the file system

# Fail fast: os.getenv() returns None for unset variables, which would
# otherwise only surface after the GA4 query has already run, as an obscure
# error against "https://None.dfs.core.windows.net".
_missing_settings = [
    name
    for name, value in (
        ("ADLS_ACCOUNT_NAME", ADLS_ACCOUNT_NAME),
        ("ADLS_ACCOUNT_KEY", ADLS_ACCOUNT_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in .env or in the process environment."
    )

# -----------------------------------------------
# 2. Authenticate Google Service Account
# -----------------------------------------------

# Read-only Analytics scope is requested for the service account.
scopes = ["https://www.googleapis.com/auth/analytics.readonly"]

# Build the credentials object from the key file configured above.
credentials = service_account.Credentials.from_service_account_file(
    CREDENTIALS_FILE,
    scopes=scopes,
)

# -----------------------------------------------
# 3. Build and Run GA4 Report
# -----------------------------------------------

client = BetaAnalyticsDataClient(credentials=credentials)

# Without an explicit `limit` the Data API returns only its default page of
# rows, silently truncating larger reports. Request explicit pages and advance
# `offset` until every row has been fetched.
GA4_PAGE_SIZE = 100_000

request = RunReportRequest(
    property=f"properties/{GA4_PROPERTY_ID}",
    dimensions=[
        Dimension(name="date"),
        Dimension(name="country"),
        Dimension(name="deviceCategory"),
        Dimension(name="browser"),
        Dimension(name="eventName"),
    ],
    metrics=[
        Metric(name="activeUsers"),
        Metric(name="engagedSessions"),
        Metric(name="engagementRate"),
        Metric(name="averageSessionDuration"),
        Metric(name="userEngagementDuration"),
        Metric(name="bounceRate"),
        Metric(name="eventsPerSession"),
        Metric(name="screenPageViews"),
        Metric(name="screenPageViewsPerSession"),
        Metric(name="newUsers"),
    ],
    date_ranges=[DateRange(start_date="7daysAgo", end_date="today")],
    limit=GA4_PAGE_SIZE,
    offset=0,
)

# One record per report row: dimension values followed by metric values,
# in the same order as they are declared in the request.
rows = []
while True:
    response = client.run_report(request)
    rows.extend(
        [dim.value for dim in row.dimension_values]
        + [metric.value for metric in row.metric_values]
        for row in response.rows
    )
    # Stop when the report is exhausted. The empty-page check also guards
    # against looping forever if the reported total is never reached.
    if not response.rows or len(rows) >= response.row_count:
        break
    request.offset = len(rows)

# -----------------------------------------------
# 4. Convert API Response to Pandas DataFrame
# -----------------------------------------------

# Column names mirror the request: dimensions first, then metrics.
columns = [dim.name for dim in request.dimensions] + \
          [metric.name for metric in request.metrics]

df = pd.DataFrame(rows, columns=columns)

# -----------------------------------------------
# 5. Upload DataFrame to ADLS Gen2 as CSV
# -----------------------------------------------

# Serialise the report to CSV text in memory (no index column).
csv_text = df.to_csv(index=False)

# Connect to ADLS Gen2 using the account name and key from the config section
adls_account_url = f"https://{ADLS_ACCOUNT_NAME}.dfs.core.windows.net"
service_client = DataLakeServiceClient(
    account_url=adls_account_url,
    credential=ADLS_ACCOUNT_KEY,
)

# Resolve file system -> target file in one chain.
file_client = (
    service_client
    .get_file_system_client(ADLS_FILE_SYSTEM)
    .get_file_client(ADLS_FILE_PATH)
)

# Upload the CSV; overwrite=True is passed so the destination file is replaced
file_client.upload_data(csv_text, overwrite=True)

print("✅ GA4 data uploaded directly to ADLS Gen2.")


✅ GA4 data uploaded directly to ADLS Gen2.
