# Scout Data Discovery Platform Exploration

## Overview
This notebook explores Scout (https://scout.tsdataclinic.com/), a next-generation data discovery platform developed by The Data Clinic. Scout provides an innovative approach to browsing open data portals with enhanced discoverability features, dataset recommendations, and automated data curation capabilities.

## What is Scout?
Scout is designed to solve the problem of data discovery in large open data portals. Unlike traditional catalog browsing, Scout offers:

- **Intelligent Dataset Recommendations**: Uses machine learning to suggest related datasets
- **Enhanced Search Capabilities**: Semantic search across dataset descriptions and metadata
- **Data Quality Insights**: Automated assessment of dataset completeness and usability
- **Collection Creation**: Ability to curate and share collections of related datasets
- **Cross-Portal Discovery**: Aggregates data from multiple Socrata-powered open data portals

## Architecture
Scout consists of:
- **Frontend**: React-based web application for interactive data exploration
- **Backend**: NestJS API server with PostgreSQL database
- **Search Engine**: OpenSearch for fast, semantic dataset discovery
- **Data Pipeline**: Automated ingestion from 120+ open data portals worldwide

## 1. Install and Import Required Libraries

In [None]:
# Core libraries for web scraping and API interaction
import requests
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import re
from urllib.parse import urljoin, urlparse, parse_qs
from typing import Dict, List, Optional, Tuple

# For web scraping Scout's interface
from bs4 import BeautifulSoup
import urllib.request

# Data analysis and visualization
import matplotlib.pyplot as plt
import seaborn as sns

# For handling large datasets
import warnings
warnings.filterwarnings('ignore')

# Confirm the imports worked and report the versions of the two libraries
# the rest of the notebook depends on most.
print("Libraries imported successfully!")
print("Pandas version: {}".format(pd.__version__))
print("Requests version: {}".format(requests.__version__))

# Configure display options (attribute form of pd.set_option):
# no cap on displayed columns, at most 20 rows, width left unset (None).
pd.options.display.max_columns = None
pd.options.display.max_rows = 20
pd.options.display.width = None

## 2. Explore Scout API Endpoints

## 3. Extract Dataset Metadata

In [None]:
class ScoutDatasetExtractor:
    """
    Extract dataset metadata using Socrata Discovery API (Scout's underlying data source)
    Since Scout aggregates Socrata portals, we can access the same data through Socrata's API

    Every request goes through one shared ``requests.Session`` (also reused by
    ``ScoutDatasetSearcher``) and is scoped to the ``data.cityofnewyork.us`` domain.
    """

    # Seconds to wait on the API before giving up; without a timeout a stalled
    # connection would hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # Largest page requested per call while paging through the catalog.
    PAGE_SIZE = 100

    def __init__(self):
        self.socrata_discovery_url = "https://api.us.socrata.com/api/catalog/v1"
        self.session = requests.Session()

    def get_nyc_datasets(self, limit: int = 100, offset: int = 0) -> Dict:
        """
        Get NYC datasets using Socrata Discovery API (same data Scout uses)

        Args:
            limit: Value sent as the ``limit`` query parameter (page size).
            offset: Value sent as the ``offset`` query parameter.

        Returns:
            The decoded JSON response on success, or ``{'error': <message>}``
            when the request fails or the body is not valid JSON.  Errors are
            reported through the return value, never raised.
        """
        params = {
            'domains': 'data.cityofnewyork.us',
            'search_context': 'data.cityofnewyork.us',
            'limit': limit,
            'offset': offset
        }

        try:
            response = self.session.get(
                self.socrata_discovery_url,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON error is not a RequestException subclass.
            return {'error': str(e)}

    def extract_dataset_metadata(self, dataset: Dict) -> Dict:
        """
        Extract key metadata from a dataset record

        Args:
            dataset: One catalog record; its ``resource`` and
                ``classification`` sub-dicts are read.

        Returns:
            A flat dict of the selected fields.  Missing sections, and sections
            explicitly set to ``None``, fall back to empty containers, so list
            fields are always lists and ``page_views_total`` defaults to 0.
        """
        # `or {}` / `or []` (rather than a .get default) also covers keys that
        # are present but null, which a plain default would let through.
        resource = dataset.get('resource') or {}
        classification = dataset.get('classification') or {}
        page_views = resource.get('page_views') or {}
        column_names = resource.get('columns_name') or []

        return {
            'id': resource.get('id'),
            'name': resource.get('name'),
            'description': resource.get('description'),
            'attribution': resource.get('attribution'),
            'type': resource.get('type'),
            'updatedAt': resource.get('updatedAt'),
            'createdAt': resource.get('createdAt'),
            'download_count': resource.get('download_count'),
            'page_views_total': page_views.get('page_views_total', 0),
            'columns_count': len(column_names),
            'columns_names': column_names,
            'columns_field_names': resource.get('columns_field_name') or [],
            'columns_datatypes': resource.get('columns_datatype') or [],
            'domain_category': classification.get('domain_category'),
            'domain_tags': classification.get('domain_tags') or [],
            'categories': classification.get('categories') or [],
            'tags': classification.get('tags') or []
        }

    def get_comprehensive_dataset_list(self, max_datasets: int = 1000) -> pd.DataFrame:
        """
        Get a comprehensive list of NYC datasets with metadata

        Pages through the catalog until ``max_datasets`` records are collected,
        the API returns an empty or short page, or a request fails.

        Args:
            max_datasets: Upper bound on the number of rows returned.

        Returns:
            A DataFrame with one row per dataset (at most ``max_datasets``
            rows); empty when nothing could be fetched.
        """
        all_datasets = []
        offset = 0

        print("Fetching NYC datasets from Socrata Discovery API...")

        while len(all_datasets) < max_datasets:
            # Never ask for more than is still needed, so the cap is honoured
            # even when it is not a multiple of the page size.
            limit = min(self.PAGE_SIZE, max_datasets - len(all_datasets))
            print(f"Fetching batch: offset={offset}, current total={len(all_datasets)}")

            data = self.get_nyc_datasets(limit=limit, offset=offset)

            if 'error' in data:
                print(f"Error: {data['error']}")
                break

            results = data.get('results', [])
            if not results:
                print("No more results available")
                break

            all_datasets.extend(self.extract_dataset_metadata(dataset) for dataset in results)
            offset += limit

            # Stop before sleeping when this was the last page we need.
            if len(results) < limit or len(all_datasets) >= max_datasets:
                break

            time.sleep(0.5)  # Rate limiting between consecutive requests

        # Defensive trim in case a page came back larger than requested.
        del all_datasets[max(max_datasets, 0):]

        print(f"Collected {len(all_datasets)} datasets")
        return pd.DataFrame(all_datasets)

# Build the extractor instance that later cells reuse
extractor = ScoutDatasetExtractor()

# Fetch one small page first
print("=== SAMPLE DATASET EXTRACTION ===")
sample_data = extractor.get_nyc_datasets(limit=10)

if 'error' in sample_data:
    print(f"Error retrieving sample data: {sample_data['error']}")
else:
    print(f"Successfully retrieved {len(sample_data.get('results', []))} sample datasets")

    # Flatten only the first five catalog records into metadata dicts
    sample_metadata = [
        extractor.extract_dataset_metadata(dataset)
        for dataset in sample_data.get('results', [])[:5]
    ]

    sample_df = pd.DataFrame(sample_metadata)
    print("\nSample dataset metadata:")
    print(
        sample_df[
            ['name', 'type', 'columns_count', 'download_count', 'page_views_total']
        ].head()
    )

## 4. Search and Filter Datasets

In [None]:
class ScoutDatasetSearcher:
    """
    Advanced search and filtering capabilities for datasets

    Keyword search is sent to the Socrata Discovery API through the extractor's
    HTTP session; every other method works on DataFrames whose rows have the
    shape produced by ``ScoutDatasetExtractor.extract_dataset_metadata``.
    """

    # Seconds to wait on the API before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, extractor: "ScoutDatasetExtractor"):
        # Quoted annotation: resolved lazily, so this class can be defined
        # without the extractor class being in scope yet.
        self.extractor = extractor
        self.socrata_search_url = "https://api.us.socrata.com/api/catalog/v1"

    @staticmethod
    def _as_tag_set(tags) -> set:
        """Return tags as a set; anything that is not a list/tuple/set counts as no tags."""
        return set(tags) if isinstance(tags, (list, tuple, set)) else set()

    def search_datasets_by_keyword(self, keyword: str, limit: int = 50) -> pd.DataFrame:
        """
        Search datasets by keyword using Socrata Discovery API

        Args:
            keyword: Text sent as the ``q`` query parameter.
            limit: Value sent as the ``limit`` query parameter.

        Returns:
            One row per result; an empty DataFrame when there are no results or
            the request fails (the error is printed, not raised).
        """
        params = {
            'domains': 'data.cityofnewyork.us',
            'search_context': 'data.cityofnewyork.us',
            'q': keyword,
            'limit': limit
        }

        try:
            response = self.extractor.session.get(
                self.socrata_search_url,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older requests versions.
            print(f"Search error: {e}")
            return pd.DataFrame()

        return pd.DataFrame(
            [self.extractor.extract_dataset_metadata(dataset) for dataset in data.get('results', [])]
        )

    def filter_by_category(self, datasets_df: pd.DataFrame, category: str) -> pd.DataFrame:
        """
        Filter datasets by domain category

        ``category`` is matched case-insensitively anywhere in
        ``domain_category``.  It is passed to ``Series.str.contains`` with that
        method's default regex handling, so regex metacharacters are
        interpreted.  Rows with a missing category never match.
        """
        return datasets_df[datasets_df['domain_category'].str.contains(category, case=False, na=False)]

    def filter_by_popularity(self, datasets_df: pd.DataFrame, min_downloads: int = 1000) -> pd.DataFrame:
        """
        Filter datasets by download count

        Keeps rows whose ``download_count`` is at least ``min_downloads``.
        Counts are coerced to numbers first, so missing or non-numeric values
        are excluded instead of raising on comparison.
        """
        downloads = pd.to_numeric(datasets_df['download_count'], errors='coerce')
        return datasets_df[downloads >= min_downloads]

    def filter_by_recency(self, datasets_df: pd.DataFrame, days_ago: int = 365) -> pd.DataFrame:
        """
        Filter datasets updated within specified days

        Returns a copy in which ``updatedAt`` has been parsed to UTC
        timestamps; rows whose timestamp cannot be parsed are dropped.  The
        input frame is not modified.  An empty input is returned as-is.
        """
        if datasets_df.empty:
            return datasets_df

        parsed = datasets_df.copy()
        # utc=True makes the column timezone-aware regardless of whether the
        # input strings carry an offset; the cutoff must be aware as well,
        # because pandas refuses to order aware against naive timestamps.
        parsed['updatedAt'] = pd.to_datetime(parsed['updatedAt'], errors='coerce', utc=True)
        cutoff_date = pd.Timestamp.now(tz='UTC') - pd.Timedelta(days=days_ago)

        return parsed[parsed['updatedAt'] >= cutoff_date]

    def find_datasets_with_columns(self, datasets_df: pd.DataFrame, column_keywords: List[str]) -> pd.DataFrame:
        """
        Find datasets containing specific column types

        A dataset matches when any keyword occurs (case-insensitively) in the
        space-joined text of its ``columns_names``.  Rows without a non-empty
        list of column names never match.
        """
        if datasets_df.empty:
            return datasets_df

        # Lower-case the keywords once instead of once per row.
        needles = [keyword.lower() for keyword in column_keywords]

        def has_matching_columns(row):
            names = row['columns_names']
            if not isinstance(names, (list, tuple)) or not names:
                return False

            all_columns = ' '.join(str(name) for name in names).lower()
            return any(needle in all_columns for needle in needles)

        return datasets_df[datasets_df.apply(has_matching_columns, axis=1)]

    def get_dataset_recommendations(self, dataset_id: str, datasets_df: pd.DataFrame, top_n: int = 5) -> pd.DataFrame:
        """
        Simple recommendation based on similar tags and categories

        Scoring: +3 when ``domain_category`` equals the target's (only if the
        target actually has a category), plus one point per shared entry in
        ``domain_tags``.  The target itself always scores 0.

        Returns:
            Up to ``top_n`` rows with a positive score, highest first, with an
            added ``similarity_score`` column.  Returns the input unchanged if
            it is empty, and an empty DataFrame if ``dataset_id`` is not found.
        """
        if datasets_df.empty:
            return datasets_df

        target_dataset = datasets_df[datasets_df['id'] == dataset_id]
        if target_dataset.empty:
            return pd.DataFrame()

        target_row = target_dataset.iloc[0]
        target_tags = self._as_tag_set(target_row['domain_tags'])
        target_category = target_row['domain_category']

        def similarity_score(row):
            if row['id'] == dataset_id:
                return 0

            score = 0
            # Category match; two missing categories are not a match.
            if target_category is not None and row['domain_category'] == target_category:
                score += 3

            # Tag similarity
            score += len(target_tags & self._as_tag_set(row['domain_tags']))

            return score

        scored = datasets_df.copy()
        scored['similarity_score'] = scored.apply(similarity_score, axis=1)

        return scored[scored['similarity_score'] > 0].nlargest(top_n, 'similarity_score')

# Wire the searcher to the extractor created earlier so both share one HTTP session
searcher = ScoutDatasetSearcher(extractor)

print("=== DATASET SEARCH DEMONSTRATIONS ===")

# 1. Keyword search: traffic
print("\n1. Searching for 'traffic' datasets:")
traffic_datasets = searcher.search_datasets_by_keyword("traffic", limit=10)
if traffic_datasets.empty:
    print("No traffic datasets found")
else:
    print(f"Found {len(traffic_datasets)} traffic-related datasets")
    print(traffic_datasets[['name', 'domain_category', 'download_count']].head())

# 2. Keyword search: crime (nothing is printed when the search comes back empty)
print("\n2. Searching for 'crime' datasets:")
crime_datasets = searcher.search_datasets_by_keyword("crime", limit=10)
if not crime_datasets.empty:
    print(f"Found {len(crime_datasets)} crime-related datasets")
    print(crime_datasets[['name', 'download_count', 'page_views_total']].head())

# 3. Keyword search: housing, then narrow to datasets with 500+ downloads
print("\n3. Searching for 'housing' datasets:")
housing_datasets = searcher.search_datasets_by_keyword("housing", limit=15)
if not housing_datasets.empty:
    print(f"Found {len(housing_datasets)} housing-related datasets")
    popular_housing = searcher.filter_by_popularity(housing_datasets, min_downloads=500)
    print(f"Popular housing datasets (500+ downloads): {len(popular_housing)}")
    if not popular_housing.empty:
        print(popular_housing[['name', 'download_count']].head())

## 5. Download Dataset Samples

In [None]:
class ScoutDatasetDownloader:
    """
    Download and sample datasets discovered through Scout/Socrata

    All requests target ``data.cityofnewyork.us`` and go through one
    ``requests.Session``.  Failures are reported through printed messages and
    empty/error return values rather than raised exceptions.
    """

    # Seconds to wait on the portal before giving up.
    REQUEST_TIMEOUT = 30

    # File extensions download_dataset_sample knows how to parse.
    SUPPORTED_FORMATS = ('json', 'csv')

    def __init__(self):
        self.session = requests.Session()
        self.base_url = "https://data.cityofnewyork.us/resource"

    def download_dataset_sample(self, dataset_id: str, limit: int = 1000, format: str = 'json') -> pd.DataFrame:
        """
        Download a sample of the dataset

        Args:
            dataset_id: Dataset identifier used to build the resource URL.
            limit: Row cap, sent as the ``$limit`` query parameter.
            format: ``'json'`` or ``'csv'``; selects both the URL extension
                and the parser.

        Returns:
            The sampled rows, or an empty DataFrame when the format is
            unsupported, the request fails, or the body cannot be parsed
            (a message is printed in each case).
        """
        # Local import: the file's import cell does not bring in io.
        from io import StringIO

        # Reject unknown formats before spending a network round-trip on them.
        if format not in self.SUPPORTED_FORMATS:
            print(f"Error processing {dataset_id}: Unsupported format: {format}")
            return pd.DataFrame()

        url = f"{self.base_url}/{dataset_id}.{format}"
        params = {'$limit': limit}

        try:
            response = self.session.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            if format == 'json':
                return pd.DataFrame(response.json())

            # Parse the body that was already fetched.  Handing the bare URL
            # to read_csv would trigger a second download that bypasses the
            # session and drops the $limit parameter.
            return pd.read_csv(StringIO(response.text), nrows=limit)

        except requests.RequestException as e:
            print(f"Error downloading {dataset_id}: {e}")
            return pd.DataFrame()
        except Exception as e:
            # Best-effort sampling: any parse failure yields an empty frame.
            print(f"Error processing {dataset_id}: {e}")
            return pd.DataFrame()

    def get_dataset_schema(self, dataset_id: str) -> Dict:
        """
        Get detailed schema information for a dataset

        Returns:
            A dict with ``id``, ``name``, ``description``, ``row_count``,
            ``rows_updated_at`` and a ``columns`` list (one dict per column),
            or ``{'error': <message>}`` when the request fails or the body is
            not valid JSON.
        """
        url = f"https://data.cityofnewyork.us/api/views/{dataset_id}.json"

        try:
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on older requests versions.
            return {'error': str(e)}

        rows_updated_at = data.get('rowsUpdatedAt')
        return {
            'id': data.get('id'),
            'name': data.get('name'),
            'description': data.get('description'),
            # NOTE(review): 'row_count' has always been filled from
            # 'rowsUpdatedAt', which by its name looks like a timestamp rather
            # than a row count.  Kept unchanged for existing callers; confirm
            # against the views API and correct or retire this key.
            'row_count': rows_updated_at,
            'rows_updated_at': rows_updated_at,
            'columns': [
                {
                    'id': col.get('id'),
                    'name': col.get('name'),
                    'field_name': col.get('fieldName'),
                    'data_type': col.get('dataTypeName'),
                    'description': col.get('description'),
                    'render_type': col.get('renderTypeName')
                }
                for col in (data.get('columns') or [])
            ]
        }

    def batch_download_samples(self, dataset_ids: List[str], sample_size: int = 100) -> Dict[str, pd.DataFrame]:
        """
        Download samples from multiple datasets

        Args:
            dataset_ids: Identifiers to fetch, in order.
            sample_size: Row cap passed to ``download_dataset_sample``.

        Returns:
            Mapping of dataset id to its sample; ids whose download came back
            empty are omitted.
        """
        results = {}

        for position, dataset_id in enumerate(dataset_ids):
            if position:
                time.sleep(0.5)  # Rate limiting, only between requests

            print(f"Downloading sample from {dataset_id}...")
            sample_data = self.download_dataset_sample(dataset_id, limit=sample_size)

            if not sample_data.empty:
                results[dataset_id] = sample_data
                print(f"  ✓ Downloaded {len(sample_data)} rows, {len(sample_data.columns)} columns")
            else:
                print(f"  ✗ Failed to download {dataset_id}")

        return results

# Create the downloader used for the schema and sample demonstrations
downloader = ScoutDatasetDownloader()

print("=== DATASET DOWNLOAD DEMONSTRATIONS ===")

# Collect candidate IDs from whichever earlier search results exist:
# up to three traffic datasets, then up to two crime datasets.
sample_dataset_ids = []

if 'traffic_datasets' in locals() and not traffic_datasets.empty:
    sample_dataset_ids += traffic_datasets['id'].tolist()[:3]

if 'crime_datasets' in locals() and not crime_datasets.empty:
    sample_dataset_ids += crime_datasets['id'].tolist()[:2]

# Nothing found earlier: fall back to known NYC dataset IDs
if not sample_dataset_ids:
    sample_dataset_ids = [
        'erm2-nwe9',  # 311 Service Requests
        'h9gi-nx95',  # Motor Vehicle Collisions
        'qgea-i56i'   # NYPD Complaint Data
    ]

print(f"Attempting to download samples from {len(sample_dataset_ids)} datasets:")
print(sample_dataset_ids[:5])  # Show first 5 IDs

# Schema example for the first candidate
if sample_dataset_ids:
    print(f"\n=== SCHEMA EXPLORATION FOR {sample_dataset_ids[0]} ===")
    schema = downloader.get_dataset_schema(sample_dataset_ids[0])

    if 'error' in schema:
        print(f"Schema error: {schema['error']}")
    else:
        print(f"Dataset: {schema.get('name')}")
        print(f"Columns: {len(schema.get('columns', []))}")

        # Only the first five columns are listed
        for col in schema.get('columns', [])[:5]:
            print(f"  - {col['name']} ({col['data_type']}): {col['field_name']}")

# Download actual sample data (first two candidates, 50 rows each)
print(f"\n=== DOWNLOADING SAMPLE DATA ===")
sample_downloads = downloader.batch_download_samples(sample_dataset_ids[:2], sample_size=50)

# Summarize every sample that was downloaded
for dataset_id, df in sample_downloads.items():
    print(f"\nDataset {dataset_id}:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)[:5]}...")  # First 5 columns
    if df.empty:
        continue
    print(f"  Sample row:\n{df.iloc[0].head()}")

## 6. Data Quality Assessment

In [None]:
class ScoutDataQualityAssessor:
    """
    Assess data quality of datasets discovered through Scout

    Each assessment is a plain dict combining basic statistics, missing-data
    figures, data-type hints, duplicate counts, numeric outliers and two
    0-100 summary scores (completeness and usability).
    """

    # Regexes matched against the START of sampled text values to flag columns
    # that look like dates.
    DATE_PATTERNS = (r'\d{4}-\d{2}-\d{2}', r'\d{2}/\d{2}/\d{4}', r'\d{2}-\d{2}-\d{4}')

    # Text made only of digits, dots and commas is flagged as possibly numeric.
    NUMERIC_PATTERN = r'^[\d.,]+$'

    # Column order of the summary report; also used when there is nothing to
    # report so the empty frame still has a stable schema.
    REPORT_COLUMNS = [
        'dataset_id', 'row_count', 'column_count', 'missing_percentage',
        'duplicate_percentage', 'completeness_score', 'usability_score',
        'overall_quality'
    ]

    def __init__(self):
        # Initialised empty; no method of this class writes to it.
        self.assessment_results = {}

    def assess_dataset_quality(self, dataset_id: str, df: pd.DataFrame) -> Dict:
        """
        Comprehensive data quality assessment

        Args:
            dataset_id: Identifier echoed back in the result.
            df: Data to assess.

        Returns:
            ``{'error': 'Empty dataset'}`` for an empty frame, otherwise a dict
            with the keys ``dataset_id``, ``basic_stats``, ``missing_data``,
            ``data_types``, ``duplicates``, ``outliers``,
            ``completeness_score`` and ``usability_score``.
        """
        if df.empty:
            return {'error': 'Empty dataset'}

        assessment = {
            'dataset_id': dataset_id,
            'basic_stats': self._get_basic_stats(df),
            'missing_data': self._assess_missing_data(df),
            'data_types': self._assess_data_types(df),
            'duplicates': self._assess_duplicates(df),
            'outliers': self._assess_outliers(df),
            'completeness_score': 0,
            'usability_score': 0
        }

        # The scores are derived from the sections gathered above.
        assessment['completeness_score'] = self._calculate_completeness_score(assessment)
        assessment['usability_score'] = self._calculate_usability_score(assessment)

        return assessment

    def _get_basic_stats(self, df: pd.DataFrame) -> Dict:
        """Basic dataset statistics: row/column counts, deep memory use in MiB, column names."""
        return {
            'row_count': len(df),
            'column_count': len(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / (1024 * 1024),
            'column_names': list(df.columns)
        }

    def _assess_missing_data(self, df: pd.DataFrame) -> Dict:
        """
        Assess missing data patterns

        Percentages are on a 0-100 scale.  A frame with no cells reports 0%
        overall instead of dividing by zero.
        """
        missing_counts = df.isnull().sum()
        row_count = len(df)
        total_cells = row_count * len(df.columns)
        total_missing = int(missing_counts.sum())

        if row_count:
            missing_percentages = (missing_counts / row_count) * 100
        else:
            missing_percentages = missing_counts * 0.0

        return {
            'total_missing_values': total_missing,
            'missing_percentage_overall': (total_missing / total_cells) * 100 if total_cells else 0.0,
            'columns_with_missing': missing_counts[missing_counts > 0].to_dict(),
            'missing_percentages_by_column': missing_percentages[missing_percentages > 0].to_dict(),
            'completely_empty_columns': missing_percentages[missing_percentages == 100].index.tolist()
        }

    def _assess_data_types(self, df: pd.DataFrame) -> Dict:
        """
        Assess data type consistency and issues

        For every object-dtype column, the first 10 non-null values (as text)
        are checked; a single value matching a pattern is enough to flag the
        column as a potential date or number.
        """
        type_summary = df.dtypes.value_counts().to_dict()

        potential_dates = []
        potential_numbers = []

        for col in df.columns:
            # Only text-like (object) columns can be hiding dates or numbers
            if df[col].dtype != 'object':
                continue

            sample_values = df[col].dropna().astype(str).head(10)

            if any(sample_values.str.match(pattern).any() for pattern in self.DATE_PATTERNS):
                potential_dates.append(col)

            if sample_values.str.match(self.NUMERIC_PATTERN).any():
                potential_numbers.append(col)

        return {
            'type_distribution': {str(k): v for k, v in type_summary.items()},
            'potential_date_columns': potential_dates,
            'potential_numeric_columns': potential_numbers
        }

    def _assess_duplicates(self, df: pd.DataFrame) -> Dict:
        """
        Assess duplicate records

        Rows are compared as whole records.  When a column holds unhashable
        values (dicts or lists from nested JSON), rows are compared by the
        string form of their cells instead.
        """
        try:
            duplicate_mask = df.duplicated()
        except TypeError:
            # Nested dict/list cells cannot be hashed by pandas.
            duplicate_mask = df.astype(str).duplicated()

        row_count = len(df)
        total_duplicates = int(duplicate_mask.sum())

        return {
            'total_duplicate_rows': total_duplicates,
            'duplicate_percentage': (total_duplicates / row_count) * 100 if row_count else 0.0,
            # Every row not flagged as a repeat is, by definition, unique.
            'unique_row_count': row_count - total_duplicates
        }

    def _assess_outliers(self, df: pd.DataFrame) -> Dict:
        """
        Assess outliers in numeric columns

        A value is an outlier when it lies more than 1.5 * IQR below the first
        quartile or above the third.  Returns one entry per numeric column.
        """
        outlier_info = {}

        for col in df.select_dtypes(include=[np.number]).columns:
            series = df[col]
            if series.empty:
                continue

            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr

            outlier_count = int(((series < lower_bound) | (series > upper_bound)).sum())

            outlier_info[col] = {
                'outlier_count': outlier_count,
                'outlier_percentage': (outlier_count / len(df)) * 100,
                'min_value': float(series.min()),
                'max_value': float(series.max()),
                'mean': float(series.mean()),
                'std': float(series.std())
            }

        return outlier_info

    def _calculate_completeness_score(self, assessment: Dict) -> float:
        """
        Calculate a completeness score (0-100)

        Starts at 100, subtracts the overall missing percentage and a further
        10 points per completely empty column, then clamps to [0, 100].
        """
        missing_data = assessment['missing_data']

        score = 100
        score -= missing_data['missing_percentage_overall']
        score -= len(missing_data['completely_empty_columns']) * 10

        return max(0, min(100, score))

    def _calculate_usability_score(self, assessment: Dict) -> float:
        """
        Calculate a usability score (0-100)

        Starts at 100, subtracts half the duplicate percentage, adds 2 points
        per distinct dtype (at most 10) when there is at least one column,
        then clamps to [0, 100].  Because of the clamp the dtype bonus can
        only offset a duplicate penalty, never push the score above 100.
        """
        score = 100

        score -= assessment['duplicates']['duplicate_percentage'] * 0.5

        if assessment['basic_stats']['column_count'] > 0:
            type_diversity = len(assessment['data_types']['type_distribution'])
            score += min(10, type_diversity * 2)

        return max(0, min(100, score))

    def generate_quality_report(self, assessments: Dict[str, Dict]) -> pd.DataFrame:
        """
        Generate a summary quality report

        Args:
            assessments: Mapping of dataset id to a result of
                ``assess_dataset_quality``; entries containing ``'error'`` are
                skipped.

        Returns:
            One row per successful assessment with values rounded to two
            decimals; ``overall_quality`` is the mean of the two scores.  The
            columns are always present, even when no rows are.
        """
        reports = []

        for dataset_id, assessment in assessments.items():
            if 'error' in assessment:
                continue

            completeness = assessment['completeness_score']
            usability = assessment['usability_score']

            reports.append({
                'dataset_id': dataset_id,
                'row_count': assessment['basic_stats']['row_count'],
                'column_count': assessment['basic_stats']['column_count'],
                'missing_percentage': round(assessment['missing_data']['missing_percentage_overall'], 2),
                'duplicate_percentage': round(assessment['duplicates']['duplicate_percentage'], 2),
                'completeness_score': round(completeness, 2),
                'usability_score': round(usability, 2),
                'overall_quality': round((completeness + usability) / 2, 2)
            })

        return pd.DataFrame(reports, columns=self.REPORT_COLUMNS)

# Create the assessor and score every sample downloaded earlier
quality_assessor = ScoutDataQualityAssessor()

print("=== DATA QUALITY ASSESSMENT ===")

# Assessment results keyed by dataset id
quality_assessments = {}

if 'sample_downloads' in locals():
    for dataset_id, df in sample_downloads.items():
        print(f"\nAssessing quality of {dataset_id}...")
        assessment = quality_assessor.assess_dataset_quality(dataset_id, df)
        quality_assessments[dataset_id] = assessment

        if 'error' in assessment:
            print(f"  Assessment Error: {assessment['error']}")
            continue

        print(f"  Completeness Score: {assessment['completeness_score']:.1f}/100")
        print(f"  Usability Score: {assessment['usability_score']:.1f}/100")
        print(f"  Missing Data: {assessment['missing_data']['missing_percentage_overall']:.1f}%")
        print(f"  Duplicates: {assessment['duplicates']['duplicate_percentage']:.1f}%")

# Summarize, and chart the scores when there is more than one dataset to compare
if not quality_assessments:
    print("No datasets available for quality assessment")
else:
    print("\n=== QUALITY SUMMARY REPORT ===")
    quality_report = quality_assessor.generate_quality_report(quality_assessments)
    print(quality_report)

    if len(quality_report) > 1:
        plt.figure(figsize=(12, 6))

        # Two side-by-side bar charts: (score column, title, y-axis label)
        for panel_number, (score_column, panel_title, y_label) in enumerate(
            [
                ('completeness_score', 'Dataset Completeness Scores', 'Completeness Score'),
                ('usability_score', 'Dataset Usability Scores', 'Usability Score'),
            ],
            start=1,
        ):
            plt.subplot(1, 2, panel_number)
            plt.bar(range(len(quality_report)), quality_report[score_column])
            plt.title(panel_title)
            plt.xlabel('Dataset Index')
            plt.ylabel(y_label)
            plt.xticks(range(len(quality_report)), quality_report['dataset_id'], rotation=45)

        plt.tight_layout()
        plt.show()

## 7. Automated Data Discovery Pipeline

In [None]:
class ScoutAutomatedPipeline:
    """
    Comprehensive automated data discovery and assessment pipeline

    Chains the extractor, searcher, downloader and quality assessor:
    discover datasets, sample and score the most popular ones, optionally
    compute recommendations, and collect run statistics.
    """

    def __init__(self):
        self.extractor = ScoutDatasetExtractor()
        self.searcher = ScoutDatasetSearcher(self.extractor)
        self.downloader = ScoutDatasetDownloader()
        self.quality_assessor = ScoutDataQualityAssessor()

        # Filled in step by step by run_discovery_pipeline.
        self.pipeline_results = {
            'datasets_metadata': pd.DataFrame(),
            'quality_assessments': {},
            'recommendations': {},
            'search_results': {},
            'pipeline_stats': {}
        }

    def run_discovery_pipeline(self,
                               search_terms: Optional[List[str]] = None,
                               max_datasets_per_term: int = 20,
                               quality_sample_size: int = 100,
                               include_recommendations: bool = True) -> Dict:
        """
        Run complete automated discovery pipeline

        Args:
            search_terms: Keywords to search for; when empty or None a general
                sample of up to 50 datasets is requested instead.
            max_datasets_per_term: Result cap per keyword search.
            quality_sample_size: Rows downloaded per dataset for assessment.
            include_recommendations: Whether to compute recommendations (only
                done when more than one dataset was assessed).

        Returns:
            ``self.pipeline_results``.  When no datasets are found the method
            returns early, before quality assessment and statistics.
        """
        print("🚀 Starting Scout Automated Data Discovery Pipeline")
        start_time = datetime.now()

        # Step 1: Search for datasets based on terms
        all_datasets = pd.DataFrame()

        if search_terms:
            print(f"\n📋 Step 1: Searching for datasets with terms: {search_terms}")
            found_frames = []
            for term in search_terms:
                print(f"  Searching for '{term}'...")
                search_results = self.searcher.search_datasets_by_keyword(term, limit=max_datasets_per_term)

                if not search_results.empty:
                    search_results['search_term'] = term
                    found_frames.append(search_results)
                    self.pipeline_results['search_results'][term] = search_results
                    print(f"    Found {len(search_results)} datasets for '{term}'")
                else:
                    print(f"    No datasets found for '{term}'")

            # Concatenate once, and only non-empty frames, instead of growing
            # a frame inside the loop.
            if found_frames:
                all_datasets = pd.concat(found_frames, ignore_index=True)
        else:
            print(f"\n📋 Step 1: Getting general dataset sample...")
            all_datasets = self.extractor.get_comprehensive_dataset_list(max_datasets=50)

        # Remove duplicates (the same dataset can match several terms)
        if not all_datasets.empty:
            initial_count = len(all_datasets)
            all_datasets = all_datasets.drop_duplicates(subset=['id']).reset_index(drop=True)
            print(f"  Removed {initial_count - len(all_datasets)} duplicate datasets")
            self.pipeline_results['datasets_metadata'] = all_datasets

        if all_datasets.empty:
            print("❌ No datasets found. Pipeline terminated.")
            return self.pipeline_results

        print(f"  Total unique datasets found: {len(all_datasets)}")

        # Step 2: Download samples and assess quality
        print(f"🔍 Step 2: Quality Assessment (sampling {quality_sample_size} rows per dataset)")

        # Select top datasets for quality assessment
        assessment_datasets = self._select_datasets_for_assessment(all_datasets, max_count=10)

        quality_results = {}
        for _, dataset in assessment_datasets.iterrows():
            dataset_id = dataset['id']
            # A missing name must not break the progress message.
            dataset_name = str(dataset['name'] or 'Unknown')

            print(f"  Assessing {dataset_id}: {dataset_name[:50]}...")

            # Download sample
            sample_df = self.downloader.download_dataset_sample(dataset_id, limit=quality_sample_size)

            if not sample_df.empty:
                # Assess quality
                quality_assessment = self.quality_assessor.assess_dataset_quality(dataset_id, sample_df)
                quality_results[dataset_id] = quality_assessment

                if 'error' not in quality_assessment:
                    print(f"    ✓ Quality scores - Completeness: {quality_assessment['completeness_score']:.1f}, Usability: {quality_assessment['usability_score']:.1f}")
                else:
                    print(f"    ✗ Assessment failed: {quality_assessment['error']}")
            else:
                print(f"    ✗ Failed to download sample")

            time.sleep(0.5)  # Rate limiting

        self.pipeline_results['quality_assessments'] = quality_results

        # Step 3: Generate recommendations
        if include_recommendations and len(quality_results) > 1:
            print(f"\n🎯 Step 3: Generating Dataset Recommendations")

            # Only the first three assessed datasets get recommendations
            for dataset_id in list(quality_results.keys())[:3]:
                recommendations = self.searcher.get_dataset_recommendations(
                    dataset_id, all_datasets, top_n=5
                )
                self.pipeline_results['recommendations'][dataset_id] = recommendations
                print(f"  Generated {len(recommendations)} recommendations for {dataset_id}")

        # Step 4: Generate pipeline statistics
        end_time = datetime.now()
        duration = (end_time - start_time).total_seconds()

        completeness_scores = [
            a['completeness_score'] for a in quality_results.values() if 'completeness_score' in a
        ]
        usability_scores = [
            a['usability_score'] for a in quality_results.values() if 'usability_score' in a
        ]

        self.pipeline_results['pipeline_stats'] = {
            'total_datasets_found': len(all_datasets),
            'datasets_assessed': len(quality_results),
            'search_terms_used': search_terms or [],
            'execution_time_seconds': duration,
            # None (not NaN) when nothing was scored; np.mean of an empty
            # list would warn and return NaN.
            'average_completeness_score': float(np.mean(completeness_scores)) if completeness_scores else None,
            'average_usability_score': float(np.mean(usability_scores)) if usability_scores else None,
            'pipeline_timestamp': start_time.isoformat()
        }

        print(f"\n✅ Pipeline completed in {duration:.1f} seconds")
        self._print_pipeline_summary()

        return self.pipeline_results

    def _select_datasets_for_assessment(self, datasets_df: pd.DataFrame, max_count: int = 10) -> pd.DataFrame:
        """
        Select most promising datasets for quality assessment

        Returns the input unchanged when it has at most ``max_count`` rows;
        otherwise the ``max_count`` rows with the highest priority score
        (0.7 * download_count + 0.3 * page_views_total, missing values as 0),
        with an added ``priority_score`` column.
        """
        if len(datasets_df) <= max_count:
            return datasets_df

        # Prioritize by download count and page views; coerce first so
        # missing or non-numeric counts cannot break the arithmetic.
        scored_datasets = datasets_df.copy()
        downloads = pd.to_numeric(scored_datasets['download_count'], errors='coerce').fillna(0)
        page_views = pd.to_numeric(scored_datasets['page_views_total'], errors='coerce').fillna(0)
        scored_datasets['priority_score'] = downloads * 0.7 + page_views * 0.3

        return scored_datasets.nlargest(max_count, 'priority_score')

    def _print_pipeline_summary(self):
        """Print a summary of pipeline results"""
        stats = self.pipeline_results['pipeline_stats']

        print(f"\n📊 PIPELINE SUMMARY")
        print(f"  Datasets Discovered: {stats['total_datasets_found']}")
        print(f"  Datasets Assessed: {stats['datasets_assessed']}")
        print(f"  Execution Time: {stats['execution_time_seconds']:.1f} seconds")

        # `is not None` so a legitimate average of 0 is still reported.
        if stats.get('average_completeness_score') is not None:
            print(f"  Average Completeness: {stats['average_completeness_score']:.1f}/100")
            print(f"  Average Usability: {stats['average_usability_score']:.1f}/100")

        # Show top quality datasets
        quality_assessments = self.pipeline_results['quality_assessments']
        if quality_assessments:
            print(f"\n🏆 TOP QUALITY DATASETS:")

            # Sort by overall quality (failed assessments count as 0)
            sorted_quality = sorted(
                quality_assessments.items(),
                key=lambda x: (x[1].get('completeness_score', 0) + x[1].get('usability_score', 0)) / 2,
                reverse=True
            )

            for i, (dataset_id, assessment) in enumerate(sorted_quality[:3]):
                if 'error' not in assessment:
                    overall_score = (assessment['completeness_score'] + assessment['usability_score']) / 2
                    dataset_name = next(
                        (row['name'] for _, row in self.pipeline_results['datasets_metadata'].iterrows()
                         if row['id'] == dataset_id),
                        'Unknown'
                    )
                    # A missing name must not break the slicing below.
                    dataset_name = str(dataset_name or 'Unknown')
                    print(f"  {i+1}. {dataset_id}: {dataset_name[:40]}... (Score: {overall_score:.1f})")

# Initialize and run the automated pipeline
pipeline = ScoutAutomatedPipeline()

print("=== AUTOMATED SCOUT DATA DISCOVERY PIPELINE ===")

# Define search terms for targeted discovery
search_terms = ['transportation', 'housing', 'health', 'education']

# Run the complete pipeline
results = pipeline.run_discovery_pipeline(
    search_terms=search_terms,
    max_datasets_per_term=15,
    quality_sample_size=50,
    include_recommendations=True
)

# Display additional insights
if results['datasets_metadata'] is not None and not results['datasets_metadata'].empty:
    print(f"\n📈 DATASET INSIGHTS:")

    metadata_df = results['datasets_metadata']

    # Category distribution
    if 'domain_category' in metadata_df.columns:
        category_counts = metadata_df['domain_category'].value_counts().head(5)
        print(f"\n  Top Categories:")
        for category, count in category_counts.items():
            print(f"    {category}: {count} datasets")

    # Most popular datasets
    if 'download_count' in metadata_df.columns:
        popular_datasets = metadata_df.nlargest(3, 'download_count')
        print(f"\n  Most Downloaded:")
        for _, dataset in popular_datasets.iterrows():
            print(f"    {dataset['name'][:50]}... ({dataset['download_count']} downloads)")

## 8. Export Findings and Metadata

In [None]:
class ScoutDataExporter:
    """
    Export Scout discovery results to various formats.

    The exporter reads a pipeline-results dictionary and writes CSV, JSON and
    plain-text files. Keys read from the dictionary (all optional):

    - ``datasets_metadata``: DataFrame of discovered datasets; ``None`` or an
      empty frame means "nothing to export".
    - ``quality_assessments``: dict mapping dataset id -> assessment dict.
      Assessments containing an ``'error'`` key are skipped when scores are
      extracted.
    - ``recommendations``: dict mapping dataset id -> DataFrame.
    - ``search_results``: dict mapping search term -> DataFrame.
    - ``pipeline_stats``: dict of run statistics.

    Each public ``export_*`` method returns the filename it wrote, or ``""``
    when there was nothing to export.
    """

    # Columns written per recommended dataset / per search hit. Columns that a
    # given frame does not have are left out instead of raising KeyError.
    _RECOMMENDATION_COLUMNS = ('id', 'name', 'similarity_score', 'domain_category')
    _SEARCH_RESULT_COLUMNS = ('id', 'name', 'domain_category', 'download_count')

    def __init__(self, pipeline_results: Dict):
        """Store the results dict and a timestamp used in default filenames."""
        self.results = pipeline_results
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def _metadata(self) -> pd.DataFrame:
        """Return the dataset metadata frame, or an empty frame if it is missing/None."""
        metadata = self.results.get('datasets_metadata')
        return metadata if metadata is not None else pd.DataFrame()

    def _valid_assessments(self) -> Dict:
        """Return the quality assessments that do not carry an 'error' key."""
        assessments = self.results.get('quality_assessments') or {}
        return {
            dataset_id: assessment
            for dataset_id, assessment in assessments.items()
            if 'error' not in assessment
        }

    @staticmethod
    def _records(frame: pd.DataFrame, columns) -> List[Dict]:
        """Return ``frame`` as a list of row dicts limited to the requested columns it has."""
        available = [column for column in columns if column in frame.columns]
        return frame[available].to_dict('records')

    def export_dataset_catalog(self, filename: Optional[str] = None) -> str:
        """
        Export comprehensive dataset catalog as CSV.

        The metadata frame is written as-is; when valid quality assessments
        exist for catalog ids, their completeness/usability scores and
        missing/duplicate percentages are joined on ``id`` as extra columns.

        Args:
            filename: Target CSV path; defaults to a timestamped name.

        Returns:
            The filename written, or ``""`` if there is no metadata.
        """
        if filename is None:
            filename = f"scout_dataset_catalog_{self.timestamp}.csv"

        metadata = self._metadata()
        if metadata.empty:
            print("No dataset metadata to export")
            return ""

        # Prepare export data
        export_df = metadata.copy()

        # Add quality scores if available
        valid_assessments = self._valid_assessments()
        if valid_assessments:
            catalog_ids = set(export_df['id'])
            # Build exactly one quality row per assessed id. Deriving the rows
            # from the catalog instead would repeat them for ids that appear
            # more than once, and the merge below would then multiply rows.
            quality_data = [
                {
                    'id': dataset_id,
                    'completeness_score': assessment['completeness_score'],
                    'usability_score': assessment['usability_score'],
                    'missing_percentage': assessment['missing_data']['missing_percentage_overall'],
                    'duplicate_percentage': assessment['duplicates']['duplicate_percentage']
                }
                for dataset_id, assessment in valid_assessments.items()
                if dataset_id in catalog_ids
            ]

            if quality_data:
                quality_df = pd.DataFrame(quality_data)
                export_df = export_df.merge(quality_df, on='id', how='left')

        # Export to CSV
        export_df.to_csv(filename, index=False)
        print(f"✅ Dataset catalog exported to: {filename}")
        print(f"   Records: {len(export_df)}")
        print(f"   Columns: {len(export_df.columns)}")

        return filename

    def export_quality_report(self, filename: Optional[str] = None) -> str:
        """
        Export detailed quality assessment report as JSON.

        The report holds run metadata, the raw assessments, and the summary
        statistics from ``_calculate_quality_summary``.

        Args:
            filename: Target JSON path; defaults to a timestamped name.

        Returns:
            The filename written, or ``""`` if there are no assessments.
        """
        if filename is None:
            filename = f"scout_quality_report_{self.timestamp}.json"

        quality_assessments = self.results.get('quality_assessments')
        if not quality_assessments:
            print("No quality assessments to export")
            return ""

        # Prepare quality report
        quality_report = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_datasets_assessed': len(quality_assessments),
                'pipeline_stats': self.results.get('pipeline_stats', {})
            },
            'assessments': quality_assessments,
            'summary_statistics': self._calculate_quality_summary()
        }

        # Export to JSON; default=str stringifies values json cannot encode
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(quality_report, f, indent=2, default=str)

        print(f"✅ Quality report exported to: {filename}")
        return filename

    def export_recommendations(self, filename: Optional[str] = None) -> str:
        """
        Export dataset recommendations as JSON.

        Source datasets whose recommendation frame is empty are omitted.

        Args:
            filename: Target JSON path; defaults to a timestamped name.

        Returns:
            The filename written, or ``""`` if there are no recommendations.
        """
        if filename is None:
            filename = f"scout_recommendations_{self.timestamp}.json"

        recommendations = self.results.get('recommendations')
        if not recommendations:
            print("No recommendations to export")
            return ""

        # Prepare recommendations export
        recommendations_export = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'total_source_datasets': len(recommendations)
            },
            'recommendations': {
                dataset_id: {
                    'source_dataset': dataset_id,
                    'recommended_datasets': self._records(rec_df, self._RECOMMENDATION_COLUMNS)
                }
                for dataset_id, rec_df in recommendations.items()
                if not rec_df.empty
            }
        }

        # Export to JSON
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(recommendations_export, f, indent=2, default=str)

        print(f"✅ Recommendations exported to: {filename}")
        return filename

    def export_search_results(self, filename: Optional[str] = None) -> str:
        """
        Export search results by term as JSON.

        Terms whose result frame is empty are omitted from ``results_by_term``
        but still listed under ``metadata.search_terms``.

        Args:
            filename: Target JSON path; defaults to a timestamped name.

        Returns:
            The filename written, or ``""`` if there are no search results.
        """
        if filename is None:
            filename = f"scout_search_results_{self.timestamp}.json"

        search_results = self.results.get('search_results')
        if not search_results:
            print("No search results to export")
            return ""

        # Prepare search results export
        search_export = {
            'metadata': {
                'generated_at': datetime.now().isoformat(),
                'search_terms': list(search_results.keys())
            },
            'results_by_term': {
                term: {
                    'term': term,
                    'total_results': len(results_df),
                    'datasets': self._records(results_df, self._SEARCH_RESULT_COLUMNS)
                }
                for term, results_df in search_results.items()
                if not results_df.empty
            }
        }

        # Export to JSON
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(search_export, f, indent=2, default=str)

        print(f"✅ Search results exported to: {filename}")
        return filename

    def export_complete_package(self, base_filename: Optional[str] = None) -> List[str]:
        """
        Export all findings in a complete package.

        Runs every individual export with filenames derived from
        ``base_filename`` and always writes a plain-text summary.

        Args:
            base_filename: Common filename prefix; defaults to a timestamped one.

        Returns:
            The filenames actually written (exports with no data are skipped).
        """
        if base_filename is None:
            base_filename = f"scout_discovery_{self.timestamp}"

        # Each export returns "" when it had nothing to write; drop those.
        candidates = [
            self.export_dataset_catalog(f"{base_filename}_catalog.csv"),
            self.export_quality_report(f"{base_filename}_quality.json"),
            self.export_recommendations(f"{base_filename}_recommendations.json"),
            self.export_search_results(f"{base_filename}_search.json"),
        ]
        exported_files = [path for path in candidates if path]

        # Create summary file
        summary_file = f"{base_filename}_summary.txt"
        self._export_summary_report(summary_file)
        exported_files.append(summary_file)

        print(f"\n📦 Complete package exported: {len(exported_files)} files")
        for file in exported_files:
            print(f"   - {file}")

        return exported_files

    def _calculate_quality_summary(self) -> Dict:
        """
        Calculate summary statistics for quality assessments.

        Returns:
            ``{}`` when no assessment is free of errors; otherwise a dict with
            ``total_assessed`` plus mean/median/min/max of the completeness and
            usability scores and mean/median of the overall missing percentage.
            All statistics are plain Python floats so they serialize to JSON
            as numbers.
        """
        valid_assessments = list(self._valid_assessments().values())

        if not valid_assessments:
            return {}

        completeness_scores = [a['completeness_score'] for a in valid_assessments]
        usability_scores = [a['usability_score'] for a in valid_assessments]
        missing_percentages = [a['missing_data']['missing_percentage_overall'] for a in valid_assessments]

        def spread(values) -> Dict:
            # float() matters: np.min/np.max of integer scores return NumPy
            # integers, which json.dump(default=str) would write as strings.
            return {
                'mean': float(np.mean(values)),
                'median': float(np.median(values)),
                'min': float(np.min(values)),
                'max': float(np.max(values))
            }

        return {
            'total_assessed': len(valid_assessments),
            'completeness': spread(completeness_scores),
            'usability': spread(usability_scores),
            'missing_data': {
                'mean_percentage': float(np.mean(missing_percentages)),
                'median_percentage': float(np.median(missing_percentages))
            }
        }

    def _export_summary_report(self, filename: str):
        """
        Export a human-readable summary report.

        Writes pipeline statistics, the quality summary (when available) and
        the five most-downloaded datasets (when metadata with a
        ``download_count`` column is available) to ``filename``.
        """
        metadata = self._metadata()

        # UTF-8 so dataset names with non-ASCII characters can always be written
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("SCOUT DATA DISCOVERY SUMMARY REPORT\n")
            f.write("=" * 50 + "\n\n")

            # Pipeline stats
            stats = self.results.get('pipeline_stats', {})
            f.write(f"Generated: {stats.get('pipeline_timestamp', 'Unknown')}\n")
            f.write(f"Execution Time: {stats.get('execution_time_seconds', 0):.1f} seconds\n")
            f.write(f"Search Terms: {', '.join(map(str, stats.get('search_terms_used', [])))}\n")
            f.write(f"Datasets Found: {stats.get('total_datasets_found', 0)}\n")
            f.write(f"Datasets Assessed: {stats.get('datasets_assessed', 0)}\n\n")

            # Quality summary
            quality_summary = self._calculate_quality_summary()
            if quality_summary:
                f.write("QUALITY SUMMARY\n")
                f.write("-" * 20 + "\n")
                f.write(f"Average Completeness Score: {quality_summary['completeness']['mean']:.1f}/100\n")
                f.write(f"Average Usability Score: {quality_summary['usability']['mean']:.1f}/100\n")
                f.write(f"Average Missing Data: {quality_summary['missing_data']['mean_percentage']:.1f}%\n\n")

            # Top datasets (skipped when the download counts are not available)
            if not metadata.empty and 'download_count' in metadata.columns:
                f.write("TOP DATASETS BY DOWNLOADS\n")
                f.write("-" * 30 + "\n")
                top_datasets = metadata.nlargest(5, 'download_count')
                for i, (_, dataset) in enumerate(top_datasets.iterrows(), 1):
                    f.write(f"{i}. {dataset['name']} ({dataset['download_count']} downloads)\n")

        print(f"✅ Summary report exported to: {filename}")

# Export every pipeline finding, provided an earlier cell produced results
if not ('results' in locals() and results):
    print("\n⚠️ No results available for export. Run the pipeline first.")
else:
    print("\n📁 EXPORTING SCOUT DISCOVERY FINDINGS")

    # Write the catalog, reports and summary in a single call
    exporter = ScoutDataExporter(results)
    exported_files = exporter.export_complete_package()

    print(f"\n🎉 Scout exploration complete! All findings exported to {len(exported_files)} files.")