In [3]:
# @title BLOCK 1 - Project Information

"""
ISRO Launch Market Trend Analysis & Risk Prediction
Student: Apurva Upadhyay (IIT Ropar, Minor in AI)
Notebook: 01_Data_Acquisition.ipynb

Purpose:
- Scrape ISRO launch history data from Wikipedia
- Extract PSLV, GSLV, and LVM3 launch records
- Consolidate into single dataset for ML pipeline

Run Instructions:
- FIRST TIME: Run all blocks sequentially (1-14)
- AFTER RUNTIME RESTART: Run blocks 1, 2, 14 only (skip scraping if data exists)

Data Output:
- isro_launch_history_raw.csv (91 missions)
"""

print("ISRO Launch Data Acquisition Pipeline")
print("Student: Apurva Upadhyay")


ISRO Launch Data Acquisition Pipeline
Student: Apurva Upadhyay


In [4]:
# @title BLOCK 2 - Mount Google Drive (RUN EVERY SESSION)

from google.colab import drive
import os

drive.mount('/content/drive')

# Define project paths
BASE_PATH = '/content/drive/My Drive/Course/Minor in AI/Final Project/ISRO Launch Trend Analysis - Apurva Upadhyay'
DATASET_PATH = os.path.join(BASE_PATH, 'Dataset')
CODE_PATH = os.path.join(BASE_PATH, 'Code')
OUTPUT_PATH = os.path.join(BASE_PATH, 'Output')

# Create directories
os.makedirs(DATASET_PATH, exist_ok=True)
os.makedirs(os.path.join(DATASET_PATH, 'individual'), exist_ok=True)
os.makedirs(CODE_PATH, exist_ok=True)
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs('isro_scraper', exist_ok=True)

print("Google Drive mounted successfully")
print(f"Dataset path: {DATASET_PATH}")
print(f"Code path: {CODE_PATH}")
print("Directory structure created")


Mounted at /content/drive
Google Drive mounted successfully
Dataset path: /content/drive/My Drive/Course/Minor in AI/Final Project/ISRO Launch Trend Analysis - Apurva Upadhyay/Dataset
Code path: /content/drive/My Drive/Course/Minor in AI/Final Project/ISRO Launch Trend Analysis - Apurva Upadhyay/Code
Directory structure created


In [5]:
# @title BLOCK 3 - Install Dependencies (RUN ONCE)

!pip install requests beautifulsoup4 pydantic pandas -q

print("Dependencies installed")


Dependencies installed


In [6]:
# @title BLOCK 4 - Create config.py

%%writefile isro_scraper/config.py
"""Configuration module for ISRO Launch Scraper."""

from typing import Dict
from dataclasses import dataclass


@dataclass
class ScraperConfig:
    """Main configuration for the scraper application."""

    REQUEST_TIMEOUT: int = 30
    MAX_RETRIES: int = 3
    RETRY_DELAY: int = 5
    INTER_SCRAPER_DELAY: int = 10

    HEADERS: Dict[str, str] = None

    def __post_init__(self):
        """Initialize headers after dataclass creation."""
        if self.HEADERS is None:
            self.HEADERS = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none',
                'Cache-Control': 'max-age=0'
            }


WIKIPEDIA_URLS = {
    'pslv': 'https://en.wikipedia.org/wiki/List_of_PSLV_launches',
    'gslv': 'https://en.wikipedia.org/wiki/List_of_GSLV_launches',
    'lvm3': 'https://en.wikipedia.org/wiki/List_of_LVM3_launches'
}

EXPECTED_COUNTS = {
    'pslv': 64,
    'gslv': 18,
    'lvm3': 9
}

VALID_OUTCOMES = {
    'Success', 'Failure', 'Partial failure', 'Partial Failure',
    'Scheduled', 'Cancelled', 'Planned'
}

OUTPUT_DIR = 'output'
LOG_DIR = 'logs'


Writing isro_scraper/config.py


In [7]:
# @title BLOCK 5 - Create models.py

%%writefile isro_scraper/models.py
"""Data models for ISRO launch records."""

from typing import Optional
from pydantic import BaseModel, Field, field_validator
import re


class PSLVLaunch(BaseModel):
    """Data model for PSLV launch records."""

    flight_number: str = Field(..., description="Flight number")
    date_time_utc: Optional[str] = Field(None, description="Launch date/time UTC")
    rocket_configuration: Optional[str] = Field(None, description="Rocket variant")
    launch_site: Optional[str] = Field(None, description="Launch site")
    payload: str = Field(..., description="Payload name(s)")
    payload_mass: Optional[str] = Field(None, description="Payload mass")
    orbit: Optional[str] = Field(None, description="Target orbit")
    user: Optional[str] = Field(None, description="Customer/Organization")
    launch_outcome: Optional[str] = Field(None, description="Outcome")
    remarks: Optional[str] = Field(None, description="Mission notes")

    @field_validator('payload_mass')
    @classmethod
    def validate_mass(cls, v: Optional[str]) -> Optional[str]:
        """Validate mass contains numeric values."""
        if v is None or v.strip() == '':
            return None
        if re.search(r'\d', v):
            return v.strip()
        return None


class GSLVLaunch(BaseModel):
    """Data model for GSLV launch records."""

    flight_number: str = Field(..., description="Flight number")
    date_time_utc: Optional[str] = Field(None, description="Launch date/time UTC")
    rocket_configuration: Optional[str] = Field(None, description="Rocket variant")
    launch_site: Optional[str] = Field(None, description="Launch site")
    payload: str = Field(..., description="Payload name(s)")
    payload_mass: Optional[str] = Field(None, description="Payload mass")
    orbit: Optional[str] = Field(None, description="Target orbit")
    user: Optional[str] = Field(None, description="Customer/Organization")
    launch_outcome: Optional[str] = Field(None, description="Outcome")
    remarks: Optional[str] = Field(None, description="Mission notes")

    @field_validator('payload_mass')
    @classmethod
    def validate_mass(cls, v: Optional[str]) -> Optional[str]:
        """Validate mass contains numeric values."""
        if v is None or v.strip() == '':
            return None
        if re.search(r'\d', v):
            return v.strip()
        return None


class LVM3Launch(BaseModel):
    """Data model for LVM3 launch records."""

    flight_number: str = Field(..., description="Flight number")
    date_time_utc: Optional[str] = Field(None, description="Launch date/time UTC")
    launch_site: Optional[str] = Field(None, description="Launch site")
    payload: str = Field(..., description="Payload name(s)")
    payload_mass: Optional[str] = Field(None, description="Payload mass with units")
    regime: Optional[str] = Field(None, description="Orbital regime")
    operator: Optional[str] = Field(None, description="Operator/Organization")
    function: Optional[str] = Field(None, description="Mission function")
    status: Optional[str] = Field(None, description="Launch status")
    remarks: Optional[str] = Field(None, description="Mission notes")

    @field_validator('payload_mass')
    @classmethod
    def validate_mass(cls, v: Optional[str]) -> Optional[str]:
        """Validate mass contains numeric values."""
        if v is None or v.strip() == '':
            return None
        if re.search(r'\d', v):
            return v.strip()
        return None


Writing isro_scraper/models.py


In [8]:
# @title BLOCK 6 - Create parsers.py

%%writefile isro_scraper/parsers.py
"""HTML parsing utilities."""

from typing import Optional
from bs4 import Tag
import re


def clean_text(text: Optional[str]) -> Optional[str]:
    """Clean and normalize text from HTML."""
    if text is None:
        return None
    text = re.sub(r'\[\d+\]', '', text)
    text = ' '.join(text.split())
    text = text.strip()
    return text if text else None


def extract_cell_text(cell: Tag) -> Optional[str]:
    """Extract text from table cell."""
    if cell is None:
        return None
    text = cell.get_text(separator='\n', strip=True)
    return clean_text(text)


def is_remark_row(row: Tag) -> bool:
    """Check if row is a remarks row."""
    cells = row.find_all(['td', 'th'])
    if len(cells) == 1 and cells[0].has_attr('colspan'):
        return True
    return False


Writing isro_scraper/parsers.py


In [9]:
# @title BLOCK 7 - Create exporters.py

%%writefile isro_scraper/exporters.py
"""Export utilities for launch data."""

import csv
import json
from typing import List, Union
from pathlib import Path
import logging

from .models import PSLVLaunch, GSLVLaunch, LVM3Launch


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def export_to_csv(launches: List[Union[PSLVLaunch, GSLVLaunch, LVM3Launch]], filepath: str) -> None:
    """Export launch data to CSV file."""
    if not launches:
        logger.warning("No data to export")
        return

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    with open(filepath, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = list(launches[0].model_dump().keys())
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for launch in launches:
            writer.writerow(launch.model_dump())

    logger.info(f"Exported {len(launches)} records to {filepath}")


def export_to_json(launches: List[Union[PSLVLaunch, GSLVLaunch, LVM3Launch]], filepath: str) -> None:
    """Export launch data to JSON file."""
    if not launches:
        logger.warning("No data to export")
        return

    Path(filepath).parent.mkdir(parents=True, exist_ok=True)

    outcomes = {}
    for launch in launches:
        if hasattr(launch, 'launch_outcome'):
            outcome = launch.launch_outcome or 'Unknown'
        elif hasattr(launch, 'status'):
            outcome = launch.status or 'Unknown'
        else:
            outcome = 'Unknown'

        outcomes[outcome] = outcomes.get(outcome, 0) + 1

    data = {
        'metadata': {
            'total_launches': len(launches),
            'outcomes': outcomes
        },
        'launches': [launch.model_dump() for launch in launches]
    }

    with open(filepath, 'w', encoding='utf-8') as jsonfile:
        json.dump(data, jsonfile, indent=2, ensure_ascii=False)

    logger.info(f"Exported {len(launches)} records to {filepath}")


Writing isro_scraper/exporters.py


In [10]:
# @title BLOCK 8 - Create pslv_scraper.py

%%writefile isro_scraper/pslv_scraper.py
"""PSLV Launch Scraper Module."""

import requests
from bs4 import BeautifulSoup, Tag
from typing import List, Optional, Dict
import logging
from time import sleep

from .config import ScraperConfig, WIKIPEDIA_URLS
from .models import PSLVLaunch
from .parsers import clean_text, extract_cell_text, is_remark_row


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class PSLVScraper:
    """Scraper for PSLV launch data from Wikipedia."""

    def __init__(self, config: Optional[ScraperConfig] = None):
        """Initialize PSLV scraper with configuration."""
        self.config = config or ScraperConfig()
        self.launches: List[PSLVLaunch] = []
        self.url = WIKIPEDIA_URLS['pslv']

    def scrape(self) -> List[PSLVLaunch]:
        """Scrape PSLV launch data from Wikipedia."""
        self.launches = []

        for attempt in range(1, self.config.MAX_RETRIES + 1):
            try:
                logger.info(f"Fetching PSLV data (attempt {attempt}/{self.config.MAX_RETRIES})")
                response = requests.get(
                    self.url,
                    headers=self.config.HEADERS,
                    timeout=self.config.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                logger.info(f"Successfully fetched page ({len(response.text):,} bytes)")
                break
            except requests.exceptions.RequestException as e:
                if attempt == self.config.MAX_RETRIES:
                    logger.error(f"Failed to fetch after {self.config.MAX_RETRIES} attempts: {e}")
                    raise
                logger.warning(f"Attempt {attempt} failed, retrying in {self.config.RETRY_DELAY}s...")
                sleep(self.config.RETRY_DELAY)

        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table', class_='wikitable')
        logger.info(f"Found {len(tables)} launch tables")

        for table_idx, table in enumerate(tables):
            self._parse_table(table, table_idx)

        logger.info(f"Scraped {len(self.launches)} PSLV launches")
        return self.launches

    def _parse_table(self, table: Tag, table_idx: int) -> None:
        """Parse a single PSLV launch table."""
        rows = table.find_all('tr')

        for row_idx, row in enumerate(rows[2:], start=2):
            if is_remark_row(row):
                if self.launches:
                    remark_text = extract_cell_text(row.find('td'))
                    if remark_text:
                        self.launches[-1].remarks = remark_text
                continue

            cells = row.find_all('td')
            if len(cells) < 8:
                continue

            flight_th = row.find('th')
            flight_number = extract_cell_text(flight_th) if flight_th else None

            if not flight_number:
                continue

            try:
                launch = PSLVLaunch(
                    flight_number=flight_number,
                    date_time_utc=extract_cell_text(cells[0]),
                    rocket_configuration=extract_cell_text(cells[1]),
                    launch_site=extract_cell_text(cells[2]),
                    payload=extract_cell_text(cells[3]),
                    payload_mass=extract_cell_text(cells[4]),
                    orbit=extract_cell_text(cells[5]) or None,
                    user=extract_cell_text(cells[6]) or None,
                    launch_outcome=extract_cell_text(cells[7]) if len(cells) > 7 else None,
                    remarks=None
                )
                self.launches.append(launch)
            except (IndexError, ValueError) as e:
                logger.debug(f"Error parsing row {row_idx}: {e}")
                continue

    def get_statistics(self) -> Dict[str, int]:
        """Calculate launch outcome statistics."""
        stats = {
            'total': len(self.launches),
            'success': 0,
            'failure': 0,
            'partial_failure': 0,
            'scheduled': 0,
            'unknown': 0
        }

        for launch in self.launches:
            outcome = launch.launch_outcome
            if not outcome:
                stats['unknown'] += 1
            elif outcome == 'Success':
                stats['success'] += 1
            elif outcome == 'Failure':
                stats['failure'] += 1
            elif 'Partial' in outcome:
                stats['partial_failure'] += 1
            elif outcome == 'Scheduled':
                stats['scheduled'] += 1
            else:
                stats['unknown'] += 1

        return stats


Writing isro_scraper/pslv_scraper.py


In [11]:
# @title BLOCK 9 - Create gslv_scraper.py

%%writefile isro_scraper/gslv_scraper.py
"""GSLV Launch Scraper Module."""

import requests
from bs4 import BeautifulSoup, Tag
from typing import List, Optional, Dict
import logging
from time import sleep

from .config import ScraperConfig, WIKIPEDIA_URLS
from .models import GSLVLaunch
from .parsers import clean_text, extract_cell_text, is_remark_row


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class GSLVScraper:
    """Scraper for GSLV launch data from Wikipedia."""

    def __init__(self, config: Optional[ScraperConfig] = None):
        """Initialize GSLV scraper with configuration."""
        self.config = config or ScraperConfig()
        self.launches: List[GSLVLaunch] = []
        self.url = WIKIPEDIA_URLS['gslv']

    def scrape(self) -> List[GSLVLaunch]:
        """Scrape GSLV launch data from Wikipedia."""
        self.launches = []

        for attempt in range(1, self.config.MAX_RETRIES + 1):
            try:
                logger.info(f"Fetching GSLV data (attempt {attempt}/{self.config.MAX_RETRIES})")
                response = requests.get(
                    self.url,
                    headers=self.config.HEADERS,
                    timeout=self.config.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                logger.info(f"Successfully fetched page ({len(response.text):,} bytes)")
                break
            except requests.exceptions.RequestException as e:
                if attempt == self.config.MAX_RETRIES:
                    logger.error(f"Failed to fetch after {self.config.MAX_RETRIES} attempts: {e}")
                    raise
                logger.warning(f"Attempt {attempt} failed, retrying in {self.config.RETRY_DELAY}s...")
                sleep(self.config.RETRY_DELAY)

        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table', class_='wikitable')
        logger.info(f"Found {len(tables)} launch tables")

        for table_idx, table in enumerate(tables):
            self._parse_table(table, table_idx)

        logger.info(f"Scraped {len(self.launches)} GSLV launches")
        return self.launches

    def _parse_table(self, table: Tag, table_idx: int) -> None:
        """Parse a single GSLV launch table."""
        rows = table.find_all('tr')

        for row_idx, row in enumerate(rows[2:], start=2):
            if is_remark_row(row):
                if self.launches:
                    remark_text = extract_cell_text(row.find('td'))
                    if remark_text:
                        self.launches[-1].remarks = remark_text
                continue

            cells = row.find_all('td')
            if len(cells) < 8:
                continue

            flight_th = row.find('th')
            flight_number = extract_cell_text(flight_th) if flight_th else None

            if not flight_number:
                continue

            try:
                launch = GSLVLaunch(
                    flight_number=flight_number,
                    date_time_utc=extract_cell_text(cells[0]),
                    rocket_configuration=extract_cell_text(cells[1]),
                    launch_site=extract_cell_text(cells[2]),
                    payload=extract_cell_text(cells[3]),
                    payload_mass=extract_cell_text(cells[4]),
                    orbit=extract_cell_text(cells[5]) or None,
                    user=extract_cell_text(cells[6]) or None,
                    launch_outcome=extract_cell_text(cells[7]) if len(cells) > 7 else None,
                    remarks=None
                )
                self.launches.append(launch)
            except (IndexError, ValueError) as e:
                logger.debug(f"Error parsing row {row_idx}: {e}")
                continue

    def get_statistics(self) -> Dict[str, int]:
        """Calculate launch outcome statistics."""
        stats = {
            'total': len(self.launches),
            'success': 0,
            'failure': 0,
            'partial_failure': 0,
            'scheduled': 0,
            'unknown': 0
        }

        for launch in self.launches:
            outcome = launch.launch_outcome
            if not outcome:
                stats['unknown'] += 1
            elif outcome == 'Success':
                stats['success'] += 1
            elif outcome == 'Failure':
                stats['failure'] += 1
            elif 'Partial' in outcome:
                stats['partial_failure'] += 1
            elif outcome == 'Scheduled':
                stats['scheduled'] += 1
            else:
                stats['unknown'] += 1

        return stats


Writing isro_scraper/gslv_scraper.py


In [13]:
# @title BLOCK 10 - Create lvm3_scraper.py

%%writefile isro_scraper/lvm3_scraper.py
"""LVM3 Launch Scraper Module."""

import requests
from bs4 import BeautifulSoup, Tag
from typing import List, Optional, Dict
import logging
from time import sleep

from .config import ScraperConfig, WIKIPEDIA_URLS
from .models import LVM3Launch
from .parsers import clean_text, extract_cell_text


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class LVM3Scraper:
    """Scraper for LVM3 launch data from Wikipedia."""

    def __init__(self, config: Optional[ScraperConfig] = None):
        """Initialize LVM3 scraper with configuration."""
        self.config = config or ScraperConfig()
        self.launches: List[LVM3Launch] = []
        self.url = WIKIPEDIA_URLS['lvm3']

    def scrape(self) -> List[LVM3Launch]:
        """Scrape LVM3 launch data from Wikipedia."""
        self.launches = []

        for attempt in range(1, self.config.MAX_RETRIES + 1):
            try:
                logger.info(f"Fetching LVM3 data (attempt {attempt}/{self.config.MAX_RETRIES})")
                response = requests.get(
                    self.url,
                    headers=self.config.HEADERS,
                    timeout=self.config.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                logger.info(f"Successfully fetched page ({len(response.text):,} bytes)")
                break
            except requests.exceptions.RequestException as e:
                if attempt == self.config.MAX_RETRIES:
                    logger.error(f"Failed to fetch after {self.config.MAX_RETRIES} attempts: {e}")
                    raise
                logger.warning(f"Attempt {attempt} failed, retrying in {self.config.RETRY_DELAY}s...")
                sleep(self.config.RETRY_DELAY)

        soup = BeautifulSoup(response.text, 'html.parser')
        tables = soup.find_all('table', class_='wikitable')
        logger.info(f"Found {len(tables)} tables")

        for table_idx in [1, 2]:
            if table_idx < len(tables):
                self._parse_hierarchical_table(tables[table_idx], table_idx)

        logger.info(f"Scraped {len(self.launches)} LVM3 launches")
        return self.launches

    def _parse_hierarchical_table(self, table: Tag, table_idx: int) -> None:
        """Parse LVM3 hierarchical multi-row table structure."""
        rows = table.find_all('tr')
        current_launch = {}

        for row_idx, row in enumerate(rows[4:], start=4):
            cells = row.find_all('td')

            if len(cells) == 0:
                continue

            if len(cells) == 1 and cells[0].get('colspan'):
                if current_launch and current_launch.get('flight_number'):
                    status = current_launch.get('status', '')
                    if status == 'Success':
                        current_launch['remarks'] = extract_cell_text(cells[0])
                        try:
                            launch = LVM3Launch(**current_launch)
                            self.launches.append(launch)
                        except Exception as e:
                            logger.debug(f"Error creating launch: {e}")
                current_launch = {}
                continue

            if len(cells) == 5:
                if current_launch and current_launch.get('flight_number'):
                    status = current_launch.get('status', '')
                    if status == 'Success':
                        try:
                            launch = LVM3Launch(**current_launch)
                            self.launches.append(launch)
                        except Exception as e:
                            logger.debug(f"Error creating launch: {e}")
                current_launch = {}

                date_text = extract_cell_text(cells[0])
                payload_raw = extract_cell_text(cells[1])

                payload_name = payload_raw
                payload_mass = None

                if payload_raw and '\n' in payload_raw:
                    parts = payload_raw.split('\n')
                    payload_name = parts[0].strip()
                    if len(parts) > 1:
                        payload_mass = parts[1].strip()

                current_launch = {
                    'date_time_utc': date_text,
                    'payload': payload_name,
                    'payload_mass': payload_mass,
                    'launch_site': extract_cell_text(cells[2]),
                    'regime': extract_cell_text(cells[3]),
                    'status': extract_cell_text(cells[4]),
                    'flight_number': None,
                    'operator': None,
                    'function': None,
                    'remarks': None
                }

            elif len(cells) == 3 and current_launch:
                current_launch['flight_number'] = extract_cell_text(cells[0])
                current_launch['operator'] = extract_cell_text(cells[1])
                current_launch['function'] = extract_cell_text(cells[2])

        if current_launch and current_launch.get('flight_number'):
            status = current_launch.get('status', '')
            if status == 'Success':
                try:
                    launch = LVM3Launch(**current_launch)
                    self.launches.append(launch)
                except Exception as e:
                    logger.debug(f"Error creating final launch: {e}")

    def get_statistics(self) -> Dict[str, int]:
        """Calculate launch status statistics."""
        stats = {
            'total': len(self.launches),
            'success': 0,
            'failure': 0,
            'partial_failure': 0,
            'scheduled': 0,
            'unknown': 0
        }

        for launch in self.launches:
            status = launch.status
            if not status:
                stats['unknown'] += 1
            elif status == 'Success':
                stats['success'] += 1
            elif status == 'Failure':
                stats['failure'] += 1
            elif 'Partial' in status:
                stats['partial_failure'] += 1
            elif status in ['Scheduled', 'Planned']:
                stats['scheduled'] += 1
            else:
                stats['unknown'] += 1

        return stats


Overwriting isro_scraper/lvm3_scraper.py


In [14]:
# @title BLOCK 11 - Create __init__.py

%%writefile isro_scraper/__init__.py
"""ISRO Launch Scraper Package."""

from .pslv_scraper import PSLVScraper
from .gslv_scraper import GSLVScraper
from .lvm3_scraper import LVM3Scraper
from .models import PSLVLaunch, GSLVLaunch, LVM3Launch
from .config import ScraperConfig

__all__ = [
    'PSLVScraper', 'GSLVScraper', 'LVM3Scraper',
    'PSLVLaunch', 'GSLVLaunch', 'LVM3Launch',
    'ScraperConfig'
]
__version__ = '1.0.0'


Writing isro_scraper/__init__.py


In [15]:
# @title BLOCK 12 - Scrape PSLV, GSLV, LVM3 (RUN ONCE OR IF DATA MISSING)

# Note: Check if CSVs exist before scraping to avoid Wikipedia rate limits
# Add delays between scrapers to respect robot policy

import sys
import os
from time import sleep
sys.path.insert(0, '/content')

# Force module reload
for module in list(sys.modules.keys()):
    if 'isro_scraper' in module:
        del sys.modules[module]

from isro_scraper import PSLVScraper, GSLVScraper, LVM3Scraper
from isro_scraper.exporters import export_to_csv, export_to_json
import pandas as pd

print("=" * 80)
print("ISRO LAUNCH DATA SCRAPING")
print("=" * 80)

# Create output directory
os.makedirs('output', exist_ok=True)

# Check-then-scrape logic
PSLV_CSV = 'output/pslv_launches.csv'
GSLV_CSV = 'output/gslv_launches.csv'
LVM3_CSV = 'output/lvm3_launches.csv'

results = {}

# Scrape PSLV
if os.path.exists(PSLV_CSV):
    print("\nPSLV data already exists - skipping scrape")
    pslv_df = pd.read_csv(PSLV_CSV)
    results['pslv'] = len(pslv_df)
else:
    print("\nScraping PSLV launches...")
    scraper = PSLVScraper()
    launches = scraper.scrape()
    export_to_csv(launches, PSLV_CSV)
    export_to_json(launches, 'output/pslv_launches.json')
    results['pslv'] = len(launches)
    print(f"PSLV: {len(launches)} launches scraped")
    sleep(10)

# Scrape GSLV
if os.path.exists(GSLV_CSV):
    print("\nGSLV data already exists - skipping scrape")
    gslv_df = pd.read_csv(GSLV_CSV)
    results['gslv'] = len(gslv_df)
else:
    print("\nScraping GSLV launches...")
    scraper = GSLVScraper()
    launches = scraper.scrape()
    export_to_csv(launches, GSLV_CSV)
    export_to_json(launches, 'output/gslv_launches.json')
    results['gslv'] = len(launches)
    print(f"GSLV: {len(launches)} launches scraped")
    sleep(10)

# Scrape LVM3
if os.path.exists(LVM3_CSV):
    print("\nLVM3 data already exists - skipping scrape")
    lvm3_df = pd.read_csv(LVM3_CSV)
    results['lvm3'] = len(lvm3_df)
else:
    print("\nScraping LVM3 launches...")
    scraper = LVM3Scraper()
    launches = scraper.scrape()
    export_to_csv(launches, LVM3_CSV)
    export_to_json(launches, 'output/lvm3_launches.json')
    results['lvm3'] = len(launches)
    print(f"LVM3: {len(launches)} launches scraped")

print("\n" + "=" * 80)
print("SCRAPING COMPLETE")
print("=" * 80)
print(f"PSLV: {results['pslv']} launches")
print(f"GSLV: {results['gslv']} launches")
print(f"LVM3: {results['lvm3']} launches")
print(f"Total: {sum(results.values())} launches")


ISRO LAUNCH DATA SCRAPING

Scraping PSLV launches...
PSLV: 64 launches scraped

Scraping GSLV launches...
GSLV: 18 launches scraped

Scraping LVM3 launches...
LVM3: 9 launches scraped

SCRAPING COMPLETE
PSLV: 64 launches
GSLV: 18 launches
LVM3: 9 launches
Total: 91 launches


In [16]:
# @title BLOCK 13 - Consolidate All Rockets (RUN EVERY SESSION)

# Note: Merge PSLV, GSLV, LVM3 with standardized columns

import pandas as pd

print("=" * 80)
print("DATA CONSOLIDATION")
print("=" * 80)

# Load individual datasets
pslv_df = pd.read_csv('output/pslv_launches.csv')
gslv_df = pd.read_csv('output/gslv_launches.csv')
lvm3_df = pd.read_csv('output/lvm3_launches.csv')

print(f"\nLoaded datasets:")
print(f"  PSLV - {len(pslv_df)} launches")
print(f"  GSLV - {len(gslv_df)} launches")
print(f"  LVM3 - {len(lvm3_df)} launches")

# Add rocket_type column
pslv_df.insert(0, 'rocket_type', 'PSLV')
gslv_df.insert(0, 'rocket_type', 'GSLV')
lvm3_df.insert(0, 'rocket_type', 'LVM3')

# Standardize LVM3 columns to match PSLV/GSLV
lvm3_df = lvm3_df.rename(columns={
    'status': 'launch_outcome',
    'regime': 'orbit',
    'operator': 'user'
})

# Add missing columns
lvm3_df['rocket_configuration'] = None
pslv_df['function'] = None
gslv_df['function'] = None

# Standard column order
standard_columns = [
    'rocket_type',
    'flight_number',
    'date_time_utc',
    'rocket_configuration',
    'launch_site',
    'payload',
    'payload_mass',
    'orbit',
    'user',
    'launch_outcome',
    'function',
    'remarks'
]

# Reorder columns
pslv_df = pslv_df[standard_columns]
gslv_df = gslv_df[standard_columns]
lvm3_df = lvm3_df[standard_columns]

# Concatenate all three
combined_df = pd.concat([pslv_df, gslv_df, lvm3_df], ignore_index=True)

# Sort by date
combined_df['date_parsed'] = pd.to_datetime(combined_df['date_time_utc'], errors='coerce')
combined_df = combined_df.sort_values('date_parsed')
combined_df = combined_df.drop('date_parsed', axis=1)
combined_df = combined_df.reset_index(drop=True)

# Save consolidated dataset
combined_df.to_csv('output/isro_launch_history_raw.csv', index=False)

print(f"\nConsolidated dataset:")
print(f"  Total missions - {len(combined_df)}")
print(f"  Date range - {combined_df['date_time_utc'].iloc[0]} to {combined_df['date_time_utc'].iloc[-1]}")

print(f"\nOutcome distribution:")
print(combined_df['launch_outcome'].value_counts())

print("\nFile saved - output/isro_launch_history_raw.csv")


DATA CONSOLIDATION

Loaded datasets:
  PSLV - 64 launches
  GSLV - 18 launches
  LVM3 - 9 launches

Consolidated dataset:
  Total missions - 91
  Date range - 20 September 1993 05:12 to 30 July 2025 12:10 [ 68 ]

Outcome distribution:
launch_outcome
Success            80
Failure             8
Partial failure     3
Name: count, dtype: int64

File saved - output/isro_launch_history_raw.csv


In [17]:
# @title BLOCK 14 - Save All Files to Google Drive (RUN EVERY SESSION)

# Note: Copy scraper package and datasets to Drive for persistence

import shutil
import os

print("=" * 80)
print("SAVING TO GOOGLE DRIVE")
print("=" * 80)

# Copy scraper package to Drive
drive_code_path = os.path.join(CODE_PATH, 'isro_scraper')
if os.path.exists(drive_code_path):
    shutil.rmtree(drive_code_path)
shutil.copytree('isro_scraper', drive_code_path)
print(f"\nScraper package saved to Drive")

# Copy consolidated dataset
shutil.copy(
    'output/isro_launch_history_raw.csv',
    os.path.join(DATASET_PATH, 'isro_launch_history_raw.csv')
)
print(f"Consolidated dataset saved to Drive")

# Copy individual files
for filename in ['pslv_launches.csv', 'gslv_launches.csv', 'lvm3_launches.csv']:
    src = os.path.join('output', filename)
    dst = os.path.join(DATASET_PATH, 'individual', filename)
    shutil.copy(src, dst)

print(f"Individual datasets saved to Drive")

print("\n" + "=" * 80)
print("ALL FILES SAVED TO GOOGLE DRIVE")
print("=" * 80)
print(f"\nLocations:")
print(f"  Code - {drive_code_path}")
print(f"  Dataset - {DATASET_PATH}")
print("\nData acquisition complete - ready for Notebook 02")


SAVING TO GOOGLE DRIVE

Scraper package saved to Drive
Consolidated dataset saved to Drive
Individual datasets saved to Drive

ALL FILES SAVED TO GOOGLE DRIVE

Locations:
  Code - /content/drive/My Drive/Course/Minor in AI/Final Project/ISRO Launch Trend Analysis - Apurva Upadhyay/Code/isro_scraper
  Dataset - /content/drive/My Drive/Course/Minor in AI/Final Project/ISRO Launch Trend Analysis - Apurva Upadhyay/Dataset

Data acquisition complete - ready for Notebook 02
