In [40]:
import asyncio
from crawl4ai import AsyncWebCrawler
import json

async def main():
    async with AsyncWebCrawler() as crawler:
        # Replace with any RailYatri train URL
        url = "https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express"

        # Crawl the page
        result = await crawler.arun(url=url)

        # Extract tables (Crawl4AI auto-detects tables)
        tables = result.tables

        if tables:
            # Save the first table as JSON
            with open("train_schedule.json", "w", encoding="utf-8") as f:
                json.dump(tables[0], f, indent=2, ensure_ascii=False)
            print(" Train schedule saved to train_schedule.json")
        else:
            print(" No tables found on the page.")

        # Optional: print table
        print(tables[0] if tables else "No table found.")

# Run the crawler
await main()

 Train schedule saved to train_schedule.json
{'headers': ['Train Number', 'Train Name', 'Starts', 'Ends'], 'rows': [['58719', 'Abhanpur Rajim Pasenger', 'Abhanpur Junction', 'Rajim'], ['58721', 'Abhanpur Rajim Pasenger', 'Abhanpur Junction', 'Rajim'], ['18242', 'Abkp Durg Exp', 'Ambikapur', 'Durg'], ['18756', 'Abkp Sdl Exp', 'Ambikapur', 'Shahdol'], ['14722', 'Abohar Jodhpur Expres', 'Abohar', 'Jodhpur Jn'], ['79438', 'Abu Road Mahesana Demu', 'Abu Road', 'Mahesana Jn'], ['22214', 'Ac Duronto Expres', 'Patna Jn', 'Kolkata Shalimar'], ['55338', 'Achnera Kasganj Fast Pasenger', 'Achnera Jn', 'Kasganj'], ['05348', 'Achnera Kasganj Pasenger Special', 'Achnera Jn', 'Kasganj'], ['07681', 'Adb Mas Rrb Spl', 'Adilabad', 'Mgr Chenai Central'], ['04166', 'Adi Agc Sup Spl', 'Ahmedabad Jn', 'Agra Cant'], ['09089', 'Adi Gkp Spl', 'Ahmedabad Jn', 'Gorakhpur Jn'], ['09410', 'Adi Heritage Spl', 'Ekta Nagar', 'Ahmedabad Jn'], ['09429', 'Adi Mfp Spl', 'Ahmedabad Jn', 'Muzafarpur Jn'], ['57554', 'Adilaba

In [41]:
import asyncio
from crawl4ai import AsyncWebCrawler
import pandas as pd

# List of RailYatri train URLs (replace with the trains you want)
train_urls = [
    "https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express",
    "https://www.railyatri.in/trains/12723-delhi-hyderabad-hamsafar-express"
]

async def main():
    all_trains_data = []

    async with AsyncWebCrawler() as crawler:
        for url in train_urls:
            print(f"üöÄ Crawling {url} ...")
            result = await crawler.arun(url=url)

            # Crawl4AI auto-detects tables
            tables = result.tables

            if tables:
                # Take the first table (usually schedule) which is a dictionary
                table_data = tables[0]
                headers = table_data.get('headers', [])
                rows = table_data.get('rows', [])

                for list_row in rows:
                    # Convert the list row to a dictionary using headers
                    dict_row = dict(zip(headers, list_row))
                    dict_row['train_url'] = url
                    all_trains_data.append(dict_row)
            else:
                print(f" No table found for {url}")

    # Convert all data to DataFrame
    df = pd.DataFrame(all_trains_data)

    # Save to CSV
    df.to_csv("train_schedules.csv", index=False, encoding="utf-8")
    print("All train schedules saved to train_schedules.csv")

# Run crawler
await main()

üöÄ Crawling https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express ...


üöÄ Crawling https://www.railyatri.in/trains/12723-delhi-hyderabad-hamsafar-express ...


All train schedules saved to train_schedules.csv


In [42]:
import pandas as pd

df_schedules = pd.read_csv('train_schedules.csv')
display(df_schedules)

Unnamed: 0,Train Number,Train Name,Starts,Ends,train_url
0,58719,Abhanpur Rajim Pasenger,Abhanpur Junction,Rajim,https://www.railyatri.in/trains/12951-mumbai-l...
1,58721,Abhanpur Rajim Pasenger,Abhanpur Junction,Rajim,https://www.railyatri.in/trains/12951-mumbai-l...
2,18242,Abkp Durg Exp,Ambikapur,Durg,https://www.railyatri.in/trains/12951-mumbai-l...
3,18756,Abkp Sdl Exp,Ambikapur,Shahdol,https://www.railyatri.in/trains/12951-mumbai-l...
4,14722,Abohar Jodhpur Expres,Abohar,Jodhpur Jn,https://www.railyatri.in/trains/12951-mumbai-l...
5,79438,Abu Road Mahesana Demu,Abu Road,Mahesana Jn,https://www.railyatri.in/trains/12951-mumbai-l...
6,22214,Ac Duronto Expres,Patna Jn,Kolkata Shalimar,https://www.railyatri.in/trains/12951-mumbai-l...
7,55338,Achnera Kasganj Fast Pasenger,Achnera Jn,Kasganj,https://www.railyatri.in/trains/12951-mumbai-l...
8,5348,Achnera Kasganj Pasenger Special,Achnera Jn,Kasganj,https://www.railyatri.in/trains/12951-mumbai-l...
9,7681,Adb Mas Rrb Spl,Adilabad,Mgr Chenai Central,https://www.railyatri.in/trains/12951-mumbai-l...


In [43]:
import asyncio
from crawl4ai import AsyncWebCrawler
import pandas as pd

# List of RailYatri train URLs
train_urls = [
    "https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express",
    "https://www.railyatri.in/trains/12723-delhi-hyderabad-hamsafar-express"
]

async def main():
    all_data = []

    async with AsyncWebCrawler() as crawler:
        for url in train_urls:
            print(f" Crawling {url} ...")
            result = await crawler.arun(url=url)

            # Crawl4AI auto-detects tables
            tables = result.tables
            if not tables:
                print(f"No tables found for {url}")
                continue

            # Usually first table = schedule
            # The table is a dictionary with 'headers' and 'rows'
            schedule_table_data = tables[0]
            headers = schedule_table_data.get('headers', [])
            rows = schedule_table_data.get('rows', [])

            # Optional: extract running status / seat info from other tables
            running_status = "N/A"
            seat_availability = "N/A"

            # Assuming table 1 (tables[1]) might contain seat info
            if len(tables) > 1 and tables[1]:
                seat_info_table = tables[1]
                # Check if seat_info_table is also a dictionary with 'headers' and 'rows'
                if isinstance(seat_info_table, dict) and 'rows' in seat_info_table:
                    seat_headers = seat_info_table.get('headers', [])
                    seat_rows = seat_info_table.get('rows', [])

                    seat_availability_parts = []
                    for seat_list_row in seat_rows:
                        seat_dict_row = dict(zip(seat_headers, seat_list_row))
                        # Assuming 'Class' and 'Availability' are keys in seat_dict_row
                        seat_availability_parts.append(f"{seat_dict_row.get('Class','')}:{seat_dict_row.get('Availability','')}")
                    seat_availability = "; ".join(seat_availability_parts)

            # Process each actual data row
            for list_row in rows:
                # Convert the list row to a dictionary using headers
                dict_row = dict(zip(headers, list_row))
                dict_row['train_url'] = url
                dict_row['running_status'] = running_status
                dict_row['seat_availability'] = seat_availability
                all_data.append(dict_row)

    # Convert to DataFrame
    df = pd.DataFrame(all_data)

    # Save to CSV
    df.to_csv("train_full_data.csv", index=False, encoding="utf-8")
    print(" All train data saved to train_full_data.csv")

# Run crawler
await main()

 Crawling https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express ...


 Crawling https://www.railyatri.in/trains/12723-delhi-hyderabad-hamsafar-express ...


 All train data saved to train_full_data.csv


In [44]:
import asyncio
from crawl4ai import AsyncWebCrawler
import json

async def inspect_tables():
    async with AsyncWebCrawler() as crawler:
        # Use one of the URLs from the previous list for inspection
        url = "https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express"
        print(f" Inspecting tables for {url} ...")
        result = await crawler.arun(url=url)
        tables = result.tables

        if tables:
            print(f"Found {len(tables)} tables.")
            for i, table in enumerate(tables):
                print(f"\n--- Table {i} ---")
                print(json.dumps(table, indent=2, ensure_ascii=False))
        else:
            print("No tables found on the page.")

await inspect_tables()

 Inspecting tables for https://www.railyatri.in/trains/12951-mumbai-ltt-hyderabad-safal-express ...


Found 1 tables.

--- Table 0 ---
{
  "headers": [
    "Train Number",
    "Train Name",
    "Starts",
    "Ends"
  ],
  "rows": [
    [
      "58719",
      "Abhanpur Rajim Pasenger",
      "Abhanpur Junction",
      "Rajim"
    ],
    [
      "58721",
      "Abhanpur Rajim Pasenger",
      "Abhanpur Junction",
      "Rajim"
    ],
    [
      "18242",
      "Abkp Durg Exp",
      "Ambikapur",
      "Durg"
    ],
    [
      "18756",
      "Abkp Sdl Exp",
      "Ambikapur",
      "Shahdol"
    ],
    [
      "14722",
      "Abohar Jodhpur Expres",
      "Abohar",
      "Jodhpur Jn"
    ],
    [
      "79438",
      "Abu Road Mahesana Demu",
      "Abu Road",
      "Mahesana Jn"
    ],
    [
      "22214",
      "Ac Duronto Expres",
      "Patna Jn",
      "Kolkata Shalimar"
    ],
    [
      "55338",
      "Achnera Kasganj Fast Pasenger",
      "Achnera Jn",
      "Kasganj"
    ],
    [
      "05348",
      "Achnera Kasganj Pasenger Special",
      "Achnera Jn",
      "Kasganj"
    ],

In [45]:
import pandas as pd

df_full_data = pd.read_csv('train_full_data.csv')

print("Column Information for train_full_data.csv:")
df_full_data.info()

print("\nFirst 5 rows of train_full_data.csv:")
display(df_full_data.head())

Column Information for train_full_data.csv:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Train Number       40 non-null     int64  
 1   Train Name         40 non-null     object 
 2   Starts             40 non-null     object 
 3   Ends               40 non-null     object 
 4   train_url          40 non-null     object 
 5   running_status     0 non-null      float64
 6   seat_availability  0 non-null      float64
dtypes: float64(2), int64(1), object(4)
memory usage: 2.3+ KB

First 5 rows of train_full_data.csv:


Unnamed: 0,Train Number,Train Name,Starts,Ends,train_url,running_status,seat_availability
0,58719,Abhanpur Rajim Pasenger,Abhanpur Junction,Rajim,https://www.railyatri.in/trains/12951-mumbai-l...,,
1,58721,Abhanpur Rajim Pasenger,Abhanpur Junction,Rajim,https://www.railyatri.in/trains/12951-mumbai-l...,,
2,18242,Abkp Durg Exp,Ambikapur,Durg,https://www.railyatri.in/trains/12951-mumbai-l...,,
3,18756,Abkp Sdl Exp,Ambikapur,Shahdol,https://www.railyatri.in/trains/12951-mumbai-l...,,
4,14722,Abohar Jodhpur Expres,Abohar,Jodhpur Jn,https://www.railyatri.in/trains/12951-mumbai-l...,,


In [50]:
from crawl4ai import AsyncWebCrawler, JsonCssExtractionStrategy
import json

async def scrape_train_schedule():
    # Using a different URL from railyatri.in, which previously worked well
    url = "https://www.railyatri.in/trains/12701-mumbai-hyderabad-deccan-express"

    # Note: For RailYatri, Crawl4AI often auto-detects tables without explicit CSS extractors.
    # However, if specific fields are needed, we can define a JsonCssExtractionStrategy.
    # For now, let's try to get tables directly first.

    async with AsyncWebCrawler() as crawler:
        print(f"üöÄ Crawling {url} to get train schedule...")
        result = await crawler.arun(
            url=url,
            # For RailYatri, auto-detection of tables is usually sufficient.
            # We can define an extractor if we need more fine-grained control or other data.
            # extractor=extractor, # Removed for now to rely on auto-detection
            bypass_cache=True,
            goto_options={'timeout': 60000, 'waitUntil': 'domcontentloaded'} # Default or adjusted as needed
        )

        if result.tables:
            print(f"Found {len(result.tables)} tables. Displaying the first one:")
            print(json.dumps(result.tables[0], indent=2, ensure_ascii=False))
        else:
            print("‚ö† No tables found on the page for this URL using auto-detection.")

        # If specific content other than tables is needed, result.extracted_content can be used
        # print("\n--- Extracted Content (if any) ---")
        # print(result.extracted_content)

import asyncio
await scrape_train_schedule()

üöÄ Crawling https://www.railyatri.in/trains/12701-mumbai-hyderabad-deccan-express to get train schedule...


Found 1 tables. Displaying the first one:
{
  "headers": [
    "Train Number",
    "Train Name",
    "Starts",
    "Ends"
  ],
  "rows": [
    [
      "58719",
      "Abhanpur Rajim Pasenger",
      "Abhanpur Junction",
      "Rajim"
    ],
    [
      "58721",
      "Abhanpur Rajim Pasenger",
      "Abhanpur Junction",
      "Rajim"
    ],
    [
      "18242",
      "Abkp Durg Exp",
      "Ambikapur",
      "Durg"
    ],
    [
      "18756",
      "Abkp Sdl Exp",
      "Ambikapur",
      "Shahdol"
    ],
    [
      "14722",
      "Abohar Jodhpur Expres",
      "Abohar",
      "Jodhpur Jn"
    ],
    [
      "79438",
      "Abu Road Mahesana Demu",
      "Abu Road",
      "Mahesana Jn"
    ],
    [
      "22214",
      "Ac Duronto Expres",
      "Patna Jn",
      "Kolkata Shalimar"
    ],
    [
      "55338",
      "Achnera Kasganj Fast Pasenger",
      "Achnera Jn",
      "Kasganj"
    ],
    [
      "05348",
      "Achnera Kasganj Pasenger Special",
      "Achnera Jn",
      "Kasgan