In [1]:
# Simple test cell
print("Hello World!")
print(f"Python version: {3 + 3}")
import sys
print(f"Python path: {sys.executable}")

Hello World!
Python version: 6
Python path: /home/codespace/.python/current/bin/python3


SyntaxError: invalid syntax (2790270973.py, line 7)

In [4]:
# Import required libraries
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [5]:
# Scout Backend API Configuration
BASE_URL = "http://localhost:8080/api"

def make_api_request(endpoint, method="GET", data=None):
    """Helper function to make API requests to Scout backend"""
    url = f"{BASE_URL}/{endpoint}"
    
    try:
        if method == "POST":
            response = requests.post(url, json=data, timeout=30)
        else:
            response = requests.get(url, timeout=30)
        
        response.raise_for_status()
        return response.json()
    
    except requests.exceptions.RequestException as e:
        print(f"❌ API request failed: {e}")
        return None

# Test API connection
health_check = make_api_request("health")
if health_check:
    print("✅ Scout backend API is accessible!")
    print(f"Status: {health_check.get('status', 'Unknown')}")
else:
    print("❌ Cannot connect to Scout backend API")

✅ Scout backend API is accessible!
Status: healthy


## Step 1: Search for Automobile Accident Datasets

Let's search for automobile accident and collision datasets from NYC Open Data.

In [20]:
# Search for automobile accident datasets
search_terms = ["collision", "accident", "motor vehicle", "crash", "traffic"]

search_data = {
    "search_terms": search_terms,
    "limit": 20,
    "include_quality": True
}

print("🔍 Searching for automobile accident datasets...")
datasets = make_api_request("datasets/search", method="POST", data=search_data)

if datasets:
    print(f"✅ Found {len(datasets)} datasets!")
    
    # Convert to DataFrame for easier analysis
    df_datasets = pd.DataFrame(datasets)
    
    # Display key information
    print("\n📊 Dataset Overview:")
    print(f"Total datasets found: {len(df_datasets)}")
    print(f"Categories: {df_datasets['category'].value_counts().to_dict()}")
    
    # Show top datasets by download count
    top_datasets = df_datasets.nlargest(5, 'download_count')[
        ['name', 'download_count', 'updated_at', 'category']
    ]
    
    print("\n🔥 Top 5 Most Downloaded Datasets:")
    for idx, row in top_datasets.iterrows():
        print(f"{idx+1}. {row['name'][:60]}...")
        print(f"   Downloads: {row['download_count']:,} | Updated: {row['updated_at'][:10]} | Category: {row['category']}")
        print()
        
else:
    print("❌ No datasets found")

🔍 Searching for automobile accident datasets...
✅ Found 40 datasets!

📊 Dataset Overview:
Total datasets found: 40
Categories: {'Transportation': 26, 'Public Safety': 8, 'Housing & Development': 2, 'City Government': 2, 'Health': 1, 'Business': 1}

🔥 Top 5 Most Downloaded Datasets:
1. Motor Vehicle Collisions - Crashes...
   Downloads: 263,671 | Updated: 2025-09-16 | Category: Public Safety

11. DOB Complaints Received...
   Downloads: 52,089 | Updated: 2025-09-16 | Category: Housing & Development

3. Motor Vehicle Collisions - Vehicles...
   Downloads: 14,263 | Updated: 2025-09-16 | Category: Public Safety

2. Motor Vehicle Collisions - Person...
   Downloads: 10,899 | Updated: 2025-09-16 | Category: Public Safety

28. DOT Traffic Speeds NBE...
   Downloads: 9,201 | Updated: 2025-09-17 | Category: Transportation



In [7]:
# Get the most recently updated dataset
if datasets and len(df_datasets) > 0:
    # Sort by updated_at to find most recent
    df_datasets['updated_at'] = pd.to_datetime(df_datasets['updated_at'])
    most_recent = df_datasets.loc[df_datasets['updated_at'].idxmax()]
    
    print("🕐 Most Recently Updated Automobile Accident Dataset:")
    print(f"Name: {most_recent['name']}")
    print(f"ID: {most_recent['id']}")
    print(f"Updated: {most_recent['updated_at']}")
    print(f"Downloads: {most_recent['download_count']:,}")
    print(f"Category: {most_recent['category']}")
    print(f"Description: {most_recent['description'][:200]}...")
    
    # Store the dataset ID for further analysis
    target_dataset_id = most_recent['id']
    target_dataset_name = most_recent['name']
    
    print(f"\n✅ Selected dataset: {target_dataset_id}")

🕐 Most Recently Updated Automobile Accident Dataset:
Name: DOT Traffic Speeds NBE
ID: i4gi-tjb9
Updated: 2025-09-17 00:41:45+00:00
Downloads: 9,201
Category: Transportation
Description: No description available...

✅ Selected dataset: i4gi-tjb9


## Step 2: Download and Analyze Dataset Sample

Now let's get a sample of the most recent automobile accident dataset and perform initial analysis.

In [39]:
# Download dataset sample using Scout API
if 'target_dataset_id' in locals():
    print(f"📥 Downloading sample data for: {target_dataset_name}")
    
    # Get dataset sample (1000 records)
    sample_data = make_api_request(f"datasets/{target_dataset_id}/sample?sample_size=1000")
    
    if sample_data and sample_data.get('data'):
        print("✅ Sample data downloaded successfully!")
        
        # Convert to DataFrame
        df = pd.DataFrame(sample_data['data'])
        
        print(f"\n📊 Dataset Shape: {df.shape}")
        print(f"Columns: {len(df.columns)}")
        print(f"Sample Size: {len(df)} records")
        
        # Display basic info
        print("\n📋 Column Information:")
        print(f"Total Columns: {len(df.columns)}")
        print(f"Data Types: {df.dtypes.value_counts().to_dict()}")
        
        # Show first few column names
        print(f"\nFirst 10 Columns:")
        for i, col in enumerate(df.columns[:10]):
            print(f"{i+1:2d}. {col}")
        
        if len(df.columns) > 10:
            print(f"... and {len(df.columns) - 10} more columns")
            
    else:
        print("❌ Failed to download sample data")
else:
    print("❌ No target dataset selected")

📥 Downloading sample data for: DOT Traffic Speeds NBE
✅ Sample data downloaded successfully!

📊 Dataset Shape: (100, 13)
Columns: 13
Sample Size: 100 records

📋 Column Information:
Total Columns: 13
Data Types: {dtype('O'): 13}

First 10 Columns:
 1. id
 2. speed
 3. travel_time
 4. status
 5. data_as_of
 6. link_id
 7. link_points
 8. encoded_poly_line
 9. encoded_poly_line_lvls
10. owner
... and 3 more columns


In [21]:
# Examine the data structure and quality
if 'df' in locals() and not df.empty:
    print("🔍 Data Quality Assessment:")
    
    # Missing values analysis
    missing_data = df.isnull().sum()
    missing_pct = (missing_data / len(df)) * 100
    
    print(f"\n📊 Missing Data Summary:")
    print(f"Columns with missing data: {(missing_data > 0).sum()}")
    print(f"Total missing values: {missing_data.sum():,}")
    print(f"Missing data percentage: {(missing_data.sum() / (len(df) * len(df.columns))) * 100:.2f}%")
    
    # Show columns with highest missing data
    if missing_data.sum() > 0:
        print(f"\n📉 Top 10 Columns with Missing Data:")
        top_missing = missing_pct[missing_pct > 0].sort_values(ascending=False).head(10)
        for col, pct in top_missing.items():
            print(f"  {col}: {pct:.1f}% ({missing_data[col]:,} missing)")
    
    # Display basic statistics for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if len(numeric_cols) > 0:
        print(f"\n📈 Numeric Columns Found: {len(numeric_cols)}")
        print("Sample numeric columns:", list(numeric_cols[:5]))
    
    # Display categorical columns
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        print(f"\n📝 Categorical Columns Found: {len(categorical_cols)}")
        print("Sample categorical columns:", list(categorical_cols[:5]))
    
    # Show data sample
    print(f"\n👀 First 3 Rows Preview:")
    pd.set_option('display.max_columns', 8)
    print(df.head(3))

## Step 3: Quality Assessment using Scout API

Let's use Scout's built-in quality assessment endpoint to get a comprehensive quality score for this dataset.

In [10]:
# Get quality assessment from Scout API
if 'target_dataset_id' in locals():
    print(f"📋 Getting quality assessment for: {target_dataset_name}")
    
    quality_data = make_api_request(f"datasets/{target_dataset_id}/quality")
    
    if quality_data:
        print("✅ Quality assessment completed!")
        
        # Display overall quality metrics
        print(f"\n🎯 Overall Quality Score: {quality_data.get('overall_score', 0):.1f}/100")
        print(f"📊 Quality Grade: {quality_data.get('grade', 'N/A')}")
        
        # Detailed quality metrics
        quality_metrics = {
            'Completeness': quality_data.get('completeness_score', 0),
            'Consistency': quality_data.get('consistency_score', 0),
            'Accuracy': quality_data.get('accuracy_score', 0),
            'Timeliness': quality_data.get('timeliness_score', 0),
            'Usability': quality_data.get('usability_score', 0)
        }
        
        print(f"\n📈 Detailed Quality Metrics:")
        for metric, score in quality_metrics.items():
            print(f"  {metric}: {score:.1f}/100")
        
        # Missing data percentage
        missing_pct = quality_data.get('missing_percentage', 0)
        print(f"\n📉 Missing Data: {missing_pct:.2f}%")
        
        # Key insights
        insights = quality_data.get('insights', [])
        if insights:
            print(f"\n💡 Key Quality Insights:")
            for i, insight in enumerate(insights, 1):
                print(f"  {i}. {insight}")
                
        # Create quality visualization
        if quality_metrics:
            fig, ax = plt.subplots(figsize=(10, 6))
            
            metrics = list(quality_metrics.keys())
            scores = list(quality_metrics.values())
            
            bars = ax.barh(metrics, scores, color=['#ff6b6b', '#4ecdc4', '#45b7d1', '#96ceb4', '#ffd93d'])
            ax.set_xlabel('Quality Score')
            ax.set_title(f'Data Quality Assessment: {target_dataset_name[:50]}...')
            ax.set_xlim(0, 100)
            
            # Add score labels on bars
            for bar, score in zip(bars, scores):
                ax.text(bar.get_width() + 1, bar.get_y() + bar.get_height()/2, 
                       f'{score:.1f}', va='center', fontweight='bold')
            
            plt.tight_layout()
            plt.show()
            
    else:
        print("❌ Failed to get quality assessment")

📋 Getting quality assessment for: DOT Traffic Speeds NBE
❌ API request failed: HTTPConnectionPool(host='localhost', port=8080): Read timed out. (read timeout=30)
❌ Failed to get quality assessment


## Step 4: Exploratory Data Analysis

Now let's perform detailed exploratory data analysis on the automobile accident dataset.

In [17]:
# Detailed exploratory data analysis
if 'df' in locals() and not df.empty:
    print("🔍 Starting Detailed EDA...")
    
    # Look for key accident-related columns
    accident_columns = {
        'date': [],
        'location': [],
        'casualties': [],
        'vehicles': [],
        'causes': []
    }
    
    # Categorize columns based on common patterns
    for col in df.columns:
        col_lower = col.lower()
        if any(word in col_lower for word in ['date', 'time', 'created', 'occurred']):
            accident_columns['date'].append(col)
        elif any(word in col_lower for word in ['borough', 'street', 'avenue', 'location', 'address', 'latitude', 'longitude']):
            accident_columns['location'].append(col)
        elif any(word in col_lower for word in ['injured', 'killed', 'death', 'casualt', 'person']):
            accident_columns['casualties'].append(col)
        elif any(word in col_lower for word in ['vehicle', 'car', 'truck', 'bike', 'motor']):
            accident_columns['vehicles'].append(col)
        elif any(word in col_lower for word in ['cause', 'factor', 'reason', 'type']):
            accident_columns['causes'].append(col)
    
    print("\n📊 Column Categories Identified:")
    for category, columns in accident_columns.items():
        if columns:
            print(f"\n{category.title()} Columns ({len(columns)}):")
            for col in columns[:5]:  # Show first 5
                print(f"  • {col}")
            if len(columns) > 5:
                print(f"  ... and {len(columns) - 5} more")
    
    # Find the most likely date column
    date_col = None
    for col in accident_columns['date']:
        if df[col].dtype == 'object':
            # Try to parse as datetime
            try:
                pd.to_datetime(df[col].head(), errors='raise')
                date_col = col
                break
            except:
                continue
    
    if date_col:
        print(f"\n📅 Using '{date_col}' as primary date column")
        
        # Convert to datetime
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        
        # Basic time analysis
        valid_dates = df[date_col].dropna()
        if len(valid_dates) > 0:
            print(f"Date range: {valid_dates.min().date()} to {valid_dates.max().date()}")
            print(f"Valid dates: {len(valid_dates):,} out of {len(df):,} records")
    
    print(f"\n✅ EDA preparation complete!")

## Step 5: Find Related Datasets

Let's use Scout's relationship analysis to find datasets related to our automobile accident data.

In [None]:
# Find related datasets using Scout's relationship analysis
if 'target_dataset_id' in locals():
    print(f"🔗 Finding datasets related to: {target_dataset_name}")
    
    relationship_data = {
        "dataset_id": target_dataset_id,
        "similarity_threshold": 0.3,
        "max_related": 10
    }
    
    relationships = make_api_request("datasets/relationships", method="POST", data=relationship_data)
    
    if relationships:
        related_datasets = relationships.get('related_datasets', [])
        network_stats = relationships.get('network_stats', {})
        
        print(f"✅ Found {len(related_datasets)} related datasets!")
        
        # Display network statistics
        print(f"\n📊 Network Analysis:")
        print(f"Total datasets analyzed: {network_stats.get('total_datasets', 0)}")
        print(f"Relationships found: {network_stats.get('relationships_found', 0)}")
        print(f"Graph density: {network_stats.get('graph_density', 0):.3f}")
        
        # Show top related datasets
        if related_datasets:
            print(f"\n🎯 Top Related Datasets:")
            
            for i, dataset in enumerate(related_datasets[:5], 1):
                print(f"\n{i}. {dataset.get('name', 'Unknown')[:60]}...")
                print(f"   Similarity Score: {dataset.get('similarity_score', 0):.3f}")
                print(f"   Category: {dataset.get('category', 'Unknown')}")
                print(f"   Relationship Reasons: {', '.join(dataset.get('relationship_reasons', []))}")
                
            # Create similarity visualization
            if len(related_datasets) > 0:
                df_related = pd.DataFrame(related_datasets)
                
                plt.figure(figsize=(12, 6))
                
                # Similarity scores plot
                plt.subplot(1, 2, 1)
                scores = df_related['similarity_score'].head(8)
                names = [name[:30] + '...' if len(name) > 30 else name 
                        for name in df_related['name'].head(8)]
                
                plt.barh(range(len(scores)), scores, color='skyblue')
                plt.yticks(range(len(scores)), names)
                plt.xlabel('Similarity Score')
                plt.title('Dataset Similarity Scores')
                plt.gca().invert_yaxis()
                
                # Category distribution
                plt.subplot(1, 2, 2)
                category_counts = df_related['category'].value_counts()
                plt.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
                plt.title('Related Datasets by Category')
                
                plt.tight_layout()
                plt.show()
                
    else:
        print("❌ Failed to find related datasets")

## Summary

This notebook demonstrates how to use the Scout Data Discovery backend API to:

1. **Search for datasets** using multiple search terms
2. **Identify the most recently updated** automobile accident dataset
3. **Download sample data** for analysis
4. **Get comprehensive quality assessments** with detailed metrics
5. **Perform exploratory data analysis** with automatic column categorization
6. **Find related datasets** using similarity analysis and network metrics

### Key Scout API Endpoints Used:

- `POST /api/datasets/search` - Search datasets with quality scores
- `GET /api/datasets/{id}/sample` - Download dataset samples
- `GET /api/datasets/{id}/quality` - Get quality assessments
- `POST /api/datasets/relationships` - Find related datasets

### Next Steps:

- Run individual cells to see the analysis in action
- Modify search terms to find other types of datasets
- Adjust similarity thresholds for relationship analysis
- Explore the Scout frontend at http://localhost:8501 for interactive analysis