# Train CSV Downloader & Merger

Download the daily Realtime Trains CSVs and merge them into a single file.

**Setup:**
1. Create a `.env` file in the project root with your credentials:
   ```
   REALTIME_TRAINS_USERNAME=your_username
   REALTIME_TRAINS_PASSWORD=your_password
   ```
2. Add `.env` to `.gitignore` so your credentials are never committed
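3. Run the cells below (if the first cell fails with an `ImportError`, run the package-installation cell first, then re-run it)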

In [None]:
import sys
import os
from pathlib import Path
from datetime import date, timedelta
from dotenv import load_dotenv

# Load credentials from .env; override=False keeps values that are already
# set in the process environment instead of replacing them.
load_dotenv(override=False)
username = os.getenv("REALTIME_TRAINS_USERNAME")
password = os.getenv("REALTIME_TRAINS_PASSWORD")

# os.getenv returns None for unset variables, so report missing credentials
# here rather than letting them surface later as an opaque login failure.
_missing = [
    _name
    for _name, _value in (
        ("REALTIME_TRAINS_USERNAME", username),
        ("REALTIME_TRAINS_PASSWORD", password),
    )
    if not _value
]
if _missing:
    print(f"⚠ Missing credentials: {', '.join(_missing)} (check your .env file)")

# Make the project code importable. `from modules.data_collection import ...`
# needs the *parent* of the `modules` package (src) on sys.path; src/modules
# is kept as well so anything that relied on the original entry still works.
# Paths are resolved against the current working directory, and entries are
# only added once so re-running the cell does not grow sys.path.
for _entry in ("src/modules", "src"):
    _resolved = str(Path(_entry).resolve())
    if _resolved not in sys.path:
        sys.path.insert(0, _resolved)
from modules.data_collection import collect_csvs_with_browser

print("✓ Setup complete")

In [None]:
import importlib.util
import subprocess
import sys

# Ensure required packages are installed.
# NOTE: the setup cell imports `dotenv`; if that cell fails with ImportError,
# run this cell first and then re-run the setup cell.
packages = ["python-dotenv", "requests", "selenium", "webdriver-manager", "pandas"]

# pip distribution names whose importable module name is NOT simply the
# distribution name with "-" replaced by "_" (python-dotenv installs `dotenv`).
import_names = {"python-dotenv": "dotenv"}

for pkg in packages:
    module_name = import_names.get(pkg, pkg.replace("-", "_"))
    # find_spec locates the module without importing it; None means "not installed".
    if importlib.util.find_spec(module_name) is not None:
        print(f"✓ {pkg} already installed")
        continue
    print(f"Installing {pkg}...")
    # Use the running interpreter's pip so the package lands in this kernel's environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    print(f"✓ {pkg} installed")

# Let the import system notice packages installed during this session.
importlib.invalidate_caches()

print("\n✓ All packages ready")

## Configuration

Edit these values to change the date range, the source URL template, and the save location:

In [None]:
# Evaluate "today" once so both ends of the range are measured from the same
# day even if the cell happens to run across midnight.
_today = date.today()
start_date = _today - timedelta(days=366)  # 366 days ago, i.e. 365 days before end_date
end_date = _today - timedelta(days=1)  # yesterday

# {yyyy}, {mm} and {dd} are placeholders for the date of each daily page.
url_template = "https://www.realtimetrains.co.uk/search/detailed/gb-nr:RDG/{yyyy}-{mm}-{dd}/0000-2359?stp=WVC&show=all&order=wtt"

# Destination of the merged CSV, relative to the current working directory.
output_file = "../resources/merged_realtime_trains.csv"

print(f"Date range: {start_date} to {end_date}")
print(f"Output: {output_file}")

## Download & Merge

**Before running:** Customize the Playwright selectors in `data_collection.py` for the login form and the download button (see the `[CUSTOMIZE]` comments).

In [None]:
# Debug aid: fetch one fixed day on its own so that any failure is reported
# with its exception type, message and full traceback.
import traceback
from datetime import date
from modules.data_collection import download_csv_with_browser

test_date = date(2024, 11, 8)
try:
    result = download_csv_with_browser(
        url_template,
        test_date,
        "../resources",  # dest_dir
        username=username,
        password=password,
    )
    print(f"✓ Download successful: {result}")
except Exception as exc:
    # Deliberately broad: this cell exists to show whatever goes wrong.
    print(f"✗ Error: {type(exc).__name__}: {exc}")
    traceback.print_exc()

In [None]:
# Run the full collection using the values from the configuration cell and
# the credentials loaded in the setup cell.
collect_csvs_with_browser(
    start_date,
    end_date,
    url_template,
    output_file,
    username=username,
    password=password,
)

## View Results

In [None]:
import pandas as pd

# Read back the file at output_file and summarise its dimensions and columns.
df = pd.read_csv(output_file)
frame_shape = df.shape
column_names = list(df.columns)
print(f"Shape: {frame_shape}")
print(f"Columns: {column_names}")

# Kept as the final expression so the notebook renders the first rows.
df.head()