<a href="https://colab.research.google.com/github/FNS-Division/GIGA-applied-geospatial-tools/blob/main/get_ookla_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get open data from Ookla Speedtest on mobile and fixed network performance

In [None]:
!pip install s3fs

In [None]:
import pandas as pd
import pyarrow.parquet as pq
import os
from pathlib import Path
import ipywidgets as widgets
import s3fs

In [None]:
# @title Configure Ookla download
# Widgets selecting which Ookla release to fetch: network type, year and quarter.
data_params = dict(
    data_type=widgets.Dropdown(
        options=['fixed', 'mobile'],
        value='fixed',
        description='Data Type:',
        style={'description_width': 'initial'},
    ),
    year=widgets.IntText(
        value=2024,
        description='Year:',
        style={'description_width': 'initial'},
    ),
    quarter=widgets.IntSlider(
        value=2,
        min=1,
        max=4,
        step=1,
        description='Quarter:',
        style={'description_width': 'initial'},
    ),
)

# Bounding-box widgets, generated from (key, label, default value) triples.
# Insertion order (min_x, max_x, min_y, max_y) is also the display order below.
coordinate_bounds = {
    key: widgets.FloatText(
        value=default,
        description=label,
        style={'description_width': 'initial'},
    )
    for key, label, default in (
        ('min_x', 'Min X:', -73.8015),
        ('max_x', 'Max X:', -56.0975),
        ('min_y', 'Min Y:', -9.8180),
        ('max_y', 'Max Y:', 2.2466),
    )
}

# Each group is a bold heading followed by its widgets in dictionary order.
data_layout = widgets.VBox([
    widgets.HTML(value='<b>Data Selection Parameters</b>'),
    *data_params.values(),
])

coordinates_layout = widgets.VBox([
    widgets.HTML(value='<b>Coordinate Bounds</b>'),
    *coordinate_bounds.values(),
])

# Stack both groups, separated by a line break, and render the form.
combined_layout = widgets.VBox([
    data_layout,
    widgets.HTML(value='<br>'),
    coordinates_layout,
])

display(combined_layout)


In [None]:
# @title Helper functions
def create_output_directory(base_path="ookla_data"):
    """
    Make sure the output directory exists and hand back its location.

    Missing parent directories are created as well; an already existing
    directory is left untouched.

    Args:
        base_path (str): Location of the directory to create

    Returns:
        Path: Path object pointing to the directory
    """
    os.makedirs(base_path, exist_ok=True)
    return Path(base_path)


def get_perf_tiles_parquet_url(service: str, year: int, quarter: int) -> str:
    """
    Generate the S3 URL for Ookla performance tiles data.

    Args:
        service (str): Either 'fixed' or 'mobile'
        year (int): Year of the data
        quarter (int): Quarter number (1-4)

    Returns:
        str: ``s3://`` URL of the Parquet file for the requested service,
            year and quarter

    Raises:
        ValueError: If ``service`` is not 'fixed' or 'mobile', or ``quarter``
            is outside 1-4
    """
    # Fail early: an invalid value would otherwise yield a URL that cannot
    # exist (e.g. quarter 5 would produce month "13").
    if service not in ("fixed", "mobile"):
        raise ValueError(f"service must be 'fixed' or 'mobile', got {service!r}")
    if quarter not in (1, 2, 3, 4):
        raise ValueError(f"quarter must be between 1 and 4, got {quarter!r}")

    # The file name is prefixed with the first day of the quarter:
    # Q1 -> 01-01, Q2 -> 04-01, Q3 -> 07-01, Q4 -> 10-01.
    first_month = (quarter - 1) * 3 + 1
    quarter_start = f"{year}-{first_month:02}-01"
    url = f"s3://ookla-open-data/parquet/performance/type={service}/year={year}/quarter={quarter}/{quarter_start}_performance_{service}_tiles.parquet"
    return url

def download_ookla_data(data_type, year, quarter, bounds, output_path):
    """
    Download Ookla network performance tiles inside a bounding box and save them as CSV.

    The bounding box is handed to ``pandas.read_parquet`` as row filters on the
    'tile_x' / 'tile_y' columns, and only the columns needed downstream are
    requested. The result is written to
    ``<output_path>/<data_type>_network_data.csv``.

    Args:
        data_type (str): Either 'fixed' or 'mobile'
        year (int): Year of the data
        quarter (int): Quarter of the data (1-4)
        bounds (dict): Bounding box with the keys 'min_x', 'max_x', 'min_y'
            and 'max_y'
        output_path (Path or str): Directory in which the CSV is written
            (the directory is not created by this function)

    Returns:
        pandas.DataFrame: The filtered rows with the columns 'tile_x',
            'tile_y', 'tests', 'avg_d_kbps' and 'avg_lat_ms'

    Raises:
        KeyError: If one of the expected keys is missing from ``bounds``
        Exception: Any error raised while reading the Parquet file is
            printed and then re-raised unchanged
    """
    # Get the correct URL
    url = get_perf_tiles_parquet_url(data_type, year, quarter)

    # Row filters restricting the read to tiles inside the bounding box
    bbox_filters = [
        ('tile_y', '<=', bounds['max_y']),
        ('tile_y', '>=', bounds['min_y']),
        ('tile_x', '<=', bounds['max_x']),
        ('tile_x', '>=', bounds['min_x'])
    ]

    # Columns to retrieve
    columns = ['tile_x', 'tile_y', 'tests', 'avg_d_kbps', 'avg_lat_ms']

    print(f"Downloading {data_type} data from {url}")
    print(f"Applying bounds filter: {bounds}")

    # Only the remote read is guarded, so the "downloading" message is never
    # printed for a failure that happens while saving the CSV.
    try:
        df = pd.read_parquet(
            url,
            filters=bbox_filters,
            columns=columns,
            storage_options={"s3": {"anon": True}}
        )
    except Exception as e:
        print(f"Error downloading data: {e}")
        raise

    # Path() lets callers pass the directory as a plain string as well
    output_file = Path(output_path) / f"{data_type}_network_data.csv"
    df.to_csv(output_file, index=False)
    print(f"Saved filtered data to {output_file}")
    print(f"Retrieved {len(df)} records")

    return df

In [None]:
# Snapshot the bounding box currently entered in the coordinate widgets
bounds = {
    key: coordinate_bounds[key].value
    for key in ('min_x', 'max_x', 'min_y', 'max_y')
}

# Make sure the destination directory exists
output_dir = create_output_directory()

# Arguments shared by both downloads.
# NOTE(review): the 'Data Type' dropdown value is not read here; mobile and
# fixed data are always both downloaded.
shared_args = dict(
    year=data_params["year"].value,
    quarter=data_params["quarter"].value,
    bounds=bounds,
    output_path=output_dir,
)

# Mobile first, then fixed; the last call is left as a bare expression so the
# notebook still shows the returned DataFrame as the cell output.
download_ookla_data(data_type="mobile", **shared_args)
download_ookla_data(data_type="fixed", **shared_args)