# Train CSV Downloader & Merger

Download daily Realtime Trains CSVs and merge into one file.

**Setup:**
1. Create a `.env` file in the project root with your credentials:
   ```
   REALTIME_TRAINS_USERNAME=your_username
   REALTIME_TRAINS_PASSWORD=your_password
   ```
2. Add `.env` to `.gitignore`
3. Run the cells below

In [1]:
import sys
import os
from pathlib import Path
from datetime import date, timedelta
from dotenv import load_dotenv

# Load credentials from .env
load_dotenv(override=False)
username = os.getenv("REALTIME_TRAINS_USERNAME")
password = os.getenv("REALTIME_TRAINS_PASSWORD")

# Add src/modules to path to import data_collection
sys.path.insert(0, str(Path("src/modules").resolve()))
from modules.data_collection import collect_csvs_with_browser

print("✓ Setup complete")

✓ Setup complete


In [None]:
import subprocess
import sys

# Ensure required packages are installed
packages = ["python-dotenv", "requests", "playwright", "pandas"]
for pkg in packages:
    try:
        __import__(pkg.replace("-", "_"))
        print(f"✓ {pkg} already installed")
    except ImportError:
        print(f"Installing {pkg}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
        print(f"✓ {pkg} installed")

# Install playwright browsers (required for Playwright to work)
print("\nInstalling Playwright browsers...")
subprocess.check_call([sys.executable, "-m", "playwright", "install"])
print("✓ Playwright browsers installed")

print("\n✓ All packages ready")

Installing python-dotenv...
✓ python-dotenv installed
✓ requests already installed
✓ playwright already installed
✓ python-dotenv installed
✓ requests already installed
✓ playwright already installed
✓ pandas already installed
✓ nest-asyncio already installed

Installing Playwright browsers...
✓ pandas already installed
✓ nest-asyncio already installed

Installing Playwright browsers...
✓ Playwright browsers installed

✓ All packages ready
✓ Playwright browsers installed

✓ All packages ready


## Configuration

Edit these values to change the date range and save location:

In [3]:
start_date = date.today() - timedelta(days=366)  # one year ago
end_date = date.today() - timedelta(days=1)  # yesterday
url_template = "https://www.realtimetrains.co.uk/search/detailed/gb-nr:RDG/{yyyy}-{mm}-{dd}/0000-2359?stp=WVC&show=all&order=wtt"
output_file = "../resources/merged_realtime_trains.csv"

print(f"Date range: {start_date} to {end_date}")
print(f"Output: {output_file}")

Date range: 2024-12-03 to 2025-12-03
Output: ../resources/merged_realtime_trains.csv


## Download & Merge

**Before running:** Customize the Playwright selectors in `data_collection.py` for login and download button (see `[CUSTOMIZE]` comments)

In [4]:
# Debug: Test single day download to see what error occurs
from datetime import date
from modules.data_collection import download_csv_with_browser

test_date = date(2024, 11, 8)
try:
    result = download_csv_with_browser(
        url_template, 
        test_date, 
        "../resources",  # dest_dir
        username=username, 
        password=password
    )
    print(f"✓ Download successful: {result}")
except Exception as e:
    print(f"✗ Error: {type(e).__name__}: {e}")
    import traceback
    traceback.print_exc()

Task exception was never retrieved
future: <Task finished name='Task-11' coro=<Connection.run() done, defined at c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\playwright\_impl\_connection.py:305> exception=NotImplementedError()>
Traceback (most recent call last):
  File "C:\Users\fcpen\AppData\Local\Programs\Python\Python313\Lib\asyncio\tasks.py", line 304, in __step_run_and_handle_result
    result = coro.send(None)
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\playwright\_impl\_connection.py", line 312, in run
    await self._transport.connect()
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\playwright\_impl\_transport.py", line 133, in connect
    raise exc
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\playwright\_impl\_transport.py", line 120, in connect
    self._proc = await asyncio.create_subprocess_exec(
             

✗ Error: NotImplementedError: 


Traceback (most recent call last):
  File "C:\Users\fcpen\AppData\Local\Temp\ipykernel_31232\4012682927.py", line 7, in <module>
    result = download_csv_with_browser(
        url_template,
    ...<3 lines>...
        password=password
    )
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\src\modules\data_collection.py", line 105, in download_csv_with_browser
    return asyncio.run(_download_csv_with_browser_async(url_template, d, dest_dir, username, password))
           ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
           ~~~~~~~~~~~~~~~~~~~~~~~^^^^^^
  File "c:\Users\fcpen\Documents\GitHub\Train_delays_and_services\venv\Lib\site-packages\nest_asyncio.py", line 98, in run_until_complete
    return f.result()
           ~~~~~~~~^^
  File "C:\Users\fcpen\A

In [8]:
collect_csvs_with_browser(start_date, end_date, url_template, output_file,username=username, password=password)

Skipping 2024-12-03: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-04: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-05: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-06: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-07: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-08: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-09: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-10: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-11: Playwright is not installed. Run: pip install playwright && playwright install
Skipping 2024-12-12: Playwright is not installed. Run: pip install playwright && playwright install


RuntimeError: No files were downloaded; cannot create merged CSV

## View Results

In [None]:
import pandas as pd

df = pd.read_csv(output_file)
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
df.head()