In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
import json
import time
import json
import zipfile
import pandas as pd
import os
from minio import Minio
from minio.error import S3Error
import shutil
from minio.commonconfig import CopySource

from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient, ContainerClient

In [None]:
# --- Workspace layout (fixed locations inside the Airflow container) ---
BASE_DIR = "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project"
download_dir = os.path.join(BASE_DIR, "latest")          # passed to Chrome as its download directory
VERSIONS_FILE = os.path.join(BASE_DIR, "versions.json")  # JSON file holding the stored feed versions

# Create the project folder first, then the download folder nested inside it.
for required_dir in (BASE_DIR, download_dir):
    os.makedirs(required_dir, exist_ok=True)

# Seed versions.json with an empty JSON object when it is not there yet.
if not os.path.exists(VERSIONS_FILE):
    with open(VERSIONS_FILE, "w") as versions_fh:
        versions_fh.write("{}")

# Chrome command-line switches, applied in the order listed.
chrome_options = Options()
for chrome_flag in (
    "--headless=new",  # 'new' headless mode for better file downloads
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--start-maximized",
    "--window-size=1920,1080",
    "--disable-blink-features=AutomationControlled",
    # One single switch: the three literals below are concatenated.
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/115.0.0.0 Safari/537.36",
    "--enable-features=NetworkService,NetworkServiceInProcess",
):
    chrome_options.add_argument(chrome_flag)

# Browser preferences so that files are saved into download_dir without a prompt.
download_prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True,
    "profile.default_content_settings.popups": 0,
    "profile.content_settings.exceptions.automatic_downloads.*.setting": 1,
    "profile.default_content_setting_values.automatic_downloads": 1
}
chrome_options.add_experimental_option("prefs", download_prefs)

# Start Chrome through the chromedriver binary at its fixed path.
chromedriver_service = Service("/usr/bin/chromedriver")
driver = webdriver.Chrome(service=chromedriver_service, options=chrome_options)

# Credentials are read from the environment so that no secret is stored in the notebook.
transitland_email = os.environ.get("TRANSITLAND_EMAIL")
transitland_password = os.environ.get("TRANSITLAND_PASSWORD")
if not transitland_email or not transitland_password:
    driver.quit()  # do not leave the already-started browser running
    raise RuntimeError(
        "Set the TRANSITLAND_EMAIL and TRANSITLAND_PASSWORD environment variables "
        "before running this notebook."
    )

# Step 1: Go to the TransitLand feed page
driver.get("https://www.transit.land/feeds/f-dr5r-mtanyctbusmanhattan")
WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, "nav")))

# Step 2: Click the "Sign in" button in the navbar
sign_in_button = WebDriverWait(driver, 30).until(
    EC.element_to_be_clickable((By.XPATH, "//span[text()='Sign in']"))
)
sign_in_button.click()

# Step 3: Wait for email input on the Interline login page
email_input = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, "//input[@name='username']"))
)
email_input.send_keys(transitland_email)

# Step 4: Enter password
password_input = driver.find_element(By.XPATH, "//input[@name='password']")
password_input.send_keys(transitland_password)

# Step 5: Click the Continue button.
# The login page URL is captured BEFORE the click: reading driver.current_url
# afterwards can already return the post-login URL, in which case the
# url_changes wait below would never be satisfied.
login_page_url = driver.current_url
continue_button = driver.find_element(By.XPATH, "//button[@type='submit']")
continue_button.click()

# Wait until the browser has navigated away from the login page
WebDriverWait(driver, 30).until(
    EC.url_changes(login_page_url)
)

print("Login successful!")

Login successful!


In [None]:
# Storage file
VERSIONS_FILE = "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/versions.json"

# Read previous versions from JSON file
def load_versions():
    """Return the stored versions mapping read from VERSIONS_FILE.

    Returns:
        dict: the parsed JSON object, or an empty dict when the file is
        missing, empty, not valid JSON, or when its top-level JSON value is
        not an object (callers use ``.get`` and item assignment on the result).
    """
    try:
        with open(VERSIONS_FILE, "r", encoding="utf-8") as f:
            versions = json.load(f)
    except FileNotFoundError:
        return {}  # Nothing stored yet
    except (json.JSONDecodeError, UnicodeDecodeError):
        return {}  # File is empty or contains invalid data

    # A list / string / null at the top level cannot be used as a mapping.
    return versions if isinstance(versions, dict) else {}

# Persist the versions mapping to the JSON file
def save_versions(versions):
    """Overwrite VERSIONS_FILE with `versions` serialized as JSON (4-space indent)."""
    encoder = json.JSONEncoder(indent=4)
    with open(VERSIONS_FILE, "w") as versions_fh:
        # Stream the encoder's chunks straight into the file.
        versions_fh.writelines(encoder.iterencode(versions))

# List of URLs
urls = [
    "https://www.transit.land/feeds/f-dr5r-mtanyctbusstatenisland",
    "https://www.transit.land/feeds/f-dr72-mtanyctbusbronx",
    "https://www.transit.land/feeds/f-dr5r-mtanyctbusmanhattan",
    "https://www.transit.land/feeds/f-dr5r-mtanyctbusbrooklyn",
    "https://www.transit.land/feeds/f-dr5x-mtanyctbusqueens"
]

# Upper bound (seconds) on how long a single feed archive may take to download.
DOWNLOAD_TIMEOUT_SECONDS = 300


def wait_for_download(directory, names_before, timeout, poll_interval=1.0):
    """Poll `directory` until a new .zip file has been fully downloaded.

    Args:
        directory: folder Chrome downloads into.
        names_before: names present in `directory` before the download started.
        timeout: maximum number of seconds to wait.
        poll_interval: seconds to sleep between two directory listings.

    Returns:
        The name of a .zip file that was not in `names_before`, once no
        ``.crdownload`` (Chrome's in-progress download suffix) file is left,
        or None when that does not happen within `timeout` seconds.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        names_now = os.listdir(directory)
        still_downloading = any(name.endswith(".crdownload") for name in names_now)
        new_archives = [
            name for name in names_now
            if name.lower().endswith(".zip") and name not in names_before
        ]
        if new_archives and not still_downloading:
            return new_archives[0]
        time.sleep(poll_interval)
    return None


wait = WebDriverWait(driver, 50)

# Load stored versions
stored_versions = load_versions()

for url in urls:
    print(f"\n🔍 Visiting: {url}")
    driver.get(url)

    # Wait for the first row in the table
    first_row = wait.until(EC.presence_of_element_located(
        (By.XPATH, "//table[@class='table is-striped is-fullwidth']/tbody/tr[1]")
    ))

    # Get the latest version date: the first 10 characters of the first cell
    latest_version = first_row.find_element(By.XPATH, ".//td[1]").text
    latest_version_date = latest_version[:10]
    print(f"📅 Latest version on website: {latest_version_date}")

    # Check if there is a new version or first time downloading
    if stored_versions.get(url) == latest_version_date:
        print("⚠️ No update found, skipping download.")
        continue

    print("⬇️ New version found or first run, downloading...")

    # Snapshot of the download folder, used to recognise the new archive
    names_before_download = set(os.listdir(download_dir))

    # Click on the download icon
    download_icon = first_row.find_element(By.XPATH, ".//td[last()]/a")
    download_icon.click()

    # Wait for modal to appear
    download_modal_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(@class,'button') and contains(@class,'is-primary')]"))
    )
    download_modal_button.click()

    # Wait for the archive to actually land on disk instead of sleeping a
    # fixed time; a version is only recorded once its download has finished.
    downloaded_archive = wait_for_download(
        download_dir, names_before_download, DOWNLOAD_TIMEOUT_SECONDS
    )
    if downloaded_archive is None:
        print(f"❌ Download did not finish within {DOWNLOAD_TIMEOUT_SECONDS}s, version not recorded: {url}")
        continue
    print(f"✅ Downloaded: {downloaded_archive}")

    # Update stored version and persist immediately, so that a failure on a
    # later feed does not lose the record of this download.
    stored_versions[url] = latest_version_date
    save_versions(stored_versions)

# Save updates to JSON file
save_versions(stored_versions)

print("\n✅ انتهى الفحص.")

In [None]:
!pwd

In [None]:
# End the WebDriver session that was created with webdriver.Chrome(...) earlier in the notebook.
driver.quit()

In [None]:

# Folder holding the downloaded archives, and the versions registry file
project_path = r"/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/latest"
json_path = "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/versions.json"

# Parse the versions registry
with open(json_path, "r", encoding="utf-8") as registry_fh:
    versions_data = json.load(registry_fh)

# Extract every .zip archive into its own "<short name>_<version>" folder
for filename in os.listdir(project_path):
    if not filename.lower().endswith(".zip"):
        continue

    archive_path = os.path.join(project_path, filename)

    # "f-dr5r-mtanyctbusbrooklyn-latest.zip" -> "f-dr5r-mtanyctbusbrooklyn"
    feed_id = filename.partition("-latest.zip")[0]

    # First registry key that contains the feed id, if any
    matched_key = None
    for candidate_key in versions_data:
        if feed_id in candidate_key:
            matched_key = candidate_key
            break

    # Version label: first whitespace-separated token of the stored value
    if not matched_key:
        version_label = "unknown_version"
    else:
        version_label = versions_data[matched_key].split()[0]

    # Short name: the part after the last hyphen (the whole id when there is none)
    short_name = feed_id.rsplit("-", 1)[-1]

    # e.g. <project_path>/mtanyctbusbrooklyn_YYYY-MM-DD
    output_folder = os.path.join(project_path, f"{short_name}_{version_label}")

    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(output_folder)

    print(f"Unzipped: {filename} → {output_folder}")

In [None]:
# Remove every .zip file found directly inside project_path.
zip_names = [name for name in os.listdir(project_path) if name.lower().endswith(".zip")]
for filename in zip_names:
    os.remove(os.path.join(project_path, filename))
    print(f"Deleted file: {filename}")

print("✅ All ZIP files have been deleted.")


In [None]:

# Base directory (your "latest" folder)
base_dir = "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/latest"


def convert_txt_to_csv(directory):
    """Rewrite every comma-delimited ``.txt`` file under `directory` as ``.csv``.

    Each ``name.txt`` found while walking `directory` is read, written next to
    itself as ``name.csv`` and then deleted. A failure on one file is printed
    and does not stop the remaining files from being processed.

    All columns are read as text and nothing is interpreted as a missing
    value, so cell contents are carried over unchanged: identifiers keep
    their leading zeros, empty cells stay empty and integer-looking columns
    are not turned into floats.

    Args:
        directory: root folder to walk.

    Returns:
        list[str]: paths of the CSV files that were written.
    """
    converted = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".txt"):
                continue

            txt_path = os.path.join(root, file)
            csv_path = os.path.join(root, file[:-4] + ".csv")

            try:
                # dtype=str + keep_default_na=False: no type inference, no NaN substitution
                df = pd.read_csv(txt_path, dtype=str, keep_default_na=False)

                # Save as CSV
                df.to_csv(csv_path, index=False)

                # Remove original TXT file (only reached when the CSV was written)
                os.remove(txt_path)

                print(f"✅ Converted and removed: {txt_path} → {csv_path}")
                converted.append(csv_path)

            except Exception as e:
                print(f"❌ Failed to convert {txt_path}: {e}")
    return converted


converted_csv_files = convert_txt_to_csv(base_dir)


In [None]:
# Azure connection settings.
# The service-principal values are taken from the environment when set; the
# defaults stay empty strings, as before.
TENANT_ID = os.getenv("AZURE_TENANT_ID", '')
CLIENT_ID = os.getenv("AZURE_CLIENT_ID", '')
CLIENT_SECRET = os.getenv("AZURE_CLIENT_SECRET", '')
ACCOUNT_NAME = 'gtfsdls'
CONTAINER_NAME = 'transitbatchlatest'

# Local base dir (overridable through the BASE_DIR environment variable)
BASE_DIR = os.getenv(
    "BASE_DIR",
    "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/latest"
)

# Connect to Azure Blob (ADLS Gen2)
account_url = f"https://{ACCOUNT_NAME}.blob.core.windows.net"
credential = ClientSecretCredential(TENANT_ID, CLIENT_ID, CLIENT_SECRET)
blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)

# Ensure container exists.
# Existence is checked explicitly so that other errors (bad credentials,
# network problems) surface here instead of being reported as "already exists".
container_client = blob_service_client.get_container_client(CONTAINER_NAME)
if container_client.exists():
    print(f"ℹ️ Container '{CONTAINER_NAME}' already exists")
else:
    container_client.create_container()
    print(f"✅ Created container: {CONTAINER_NAME}")

# Walk through all files in BASE_DIR and upload every CSV
failed_uploads = []  # local paths whose upload raised
for root, dirs, files in os.walk(BASE_DIR):
    for file in files:
        if not file.endswith(".csv"):
            continue

        file_path = os.path.join(root, file)

        # Generate ADLS object path like: rowdata/latest/subfolder/file.csv
        # (blob names always use "/" regardless of the local path separator)
        relative_path = os.path.relpath(file_path, BASE_DIR).replace(os.sep, "/")
        blob_name = f"rowdata/latest/{relative_path}"

        try:
            blob_client = container_client.get_blob_client(blob_name)
            with open(file_path, "rb") as data:
                blob_client.upload_blob(data, overwrite=True)

            print(f"⬆️ Uploaded: {file_path} → {CONTAINER_NAME}/{blob_name}")
        except Exception as e:
            failed_uploads.append(file_path)
            print(f"❌ Failed to upload {file_path}: {e}")

# NOTE(review): the clear_directory cell later in this notebook deletes the local
# files; check failed_uploads before running it so unsent data is not lost.
if failed_uploads:
    print(f"⚠️ {len(failed_uploads)} file(s) were not uploaded; see 'failed_uploads'.")






# # MinIO connection settings (read from environment variables or use defaults)
# MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "minio:9000")
# MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY", "minio")
# MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY", "minio123")
# BUCKET_NAME = os.getenv("BUCKET_NAME", "transitbatchlatest")
# BASE_DIR = os.getenv(
#     "BASE_DIR",
#     "/opt/airflow/dags/notebooks/gtfs_extract_workspace/Transitland_project/latest"
# )

# # Connect to MinIO
# client = Minio(
#     MINIO_ENDPOINT,
#     access_key=MINIO_ACCESS_KEY,
#     secret_key=MINIO_SECRET_KEY,
#     secure=False  # HTTP, not HTTPS
# )

# # Create the bucket if it doesn't exist
# if not client.bucket_exists(BUCKET_NAME):
#     client.make_bucket(BUCKET_NAME)
#     print(f"✅ Created bucket: {BUCKET_NAME}")
# else:
#     print(f"ℹ️ Bucket '{BUCKET_NAME}' already exists")

# # Walk through all files in BASE_DIR
# for root, dirs, files in os.walk(BASE_DIR):
#     for file in files:
#         if file.endswith(".csv"):
#             file_path = os.path.join(root, file)

#             # Generate MinIO object path like: rowdata/latest/subfolder/file.csv
#             relative_path = os.path.relpath(file_path, BASE_DIR)
#             object_name = f"rowdata/latest/{relative_path}"

#             try:
#                 client.fput_object(BUCKET_NAME, object_name, file_path)
#                 print(f"⬆️ Uploaded: {file_path} → {BUCKET_NAME}/{object_name}")
#             except S3Error as e:
#                 print(f"❌ Failed to upload {file_path}: {e}")

ℹ️ Bucket 'transitbatch' already exists
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/stop_times.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/stop_times.csv
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/stops.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/stops.csv
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/agency.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/agency.csv
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/calendar_dates.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/calendar_dates.csv
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/trips.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/trips.csv
⬆️ Uploaded: ./Transitland_project/latest/mtanyctbusbrooklyn_2025-07-03/calendar.csv → transitbatch/rowdata/latest/mtanyctbusbrooklyn_2025-07-03/calendar.csv
⬆️ Uploaded:

In [None]:

def clear_directory(path):
    """Delete everything directly inside `path`, keeping `path` itself.

    Files and symbolic links are unlinked; real directories are removed
    recursively. One line is printed per deleted entry. An error on one entry
    is printed and the remaining entries are still processed. When `path`
    does not exist, a message is printed and nothing else happens.
    """
    if not os.path.exists(path):
        print(f"Path does not exist: {path}")
        return

    for item_path in (os.path.join(path, entry) for entry in os.listdir(path)):
        try:
            # A symlink that points at a directory is unlinked, not descended into.
            if os.path.isdir(item_path) and not os.path.islink(item_path):
                shutil.rmtree(item_path)
                print(f"Deleted folder: {item_path}")
            elif os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
                print(f"Deleted file: {item_path}")
        except Exception as e:
            print(f"Error deleting {item_path}: {e}")

# Remove everything inside base_dir (the local "latest" folder path assigned in an earlier cell).
clear_directory(base_dir)

📋 Copied: rowdata/latest/mtanyctbusbronx_2025-06-30/agency.csv → rowdata/old/mtanyctbusbronx_2025-06-30/agency.csv
🗑️ Removed original: rowdata/latest/mtanyctbusbronx_2025-06-30/agency.csv
📋 Copied: rowdata/latest/mtanyctbusbronx_2025-06-30/calendar.csv → rowdata/old/mtanyctbusbronx_2025-06-30/calendar.csv
🗑️ Removed original: rowdata/latest/mtanyctbusbronx_2025-06-30/calendar.csv
📋 Copied: rowdata/latest/mtanyctbusbronx_2025-06-30/calendar_dates.csv → rowdata/old/mtanyctbusbronx_2025-06-30/calendar_dates.csv
🗑️ Removed original: rowdata/latest/mtanyctbusbronx_2025-06-30/calendar_dates.csv
📋 Copied: rowdata/latest/mtanyctbusbronx_2025-06-30/routes.csv → rowdata/old/mtanyctbusbronx_2025-06-30/routes.csv
🗑️ Removed original: rowdata/latest/mtanyctbusbronx_2025-06-30/routes.csv
📋 Copied: rowdata/latest/mtanyctbusbronx_2025-06-30/shapes.csv → rowdata/old/mtanyctbusbronx_2025-06-30/shapes.csv
🗑️ Removed original: rowdata/latest/mtanyctbusbronx_2025-06-30/shapes.csv
📋 Copied: rowdata/latest/