In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
!{sys.executable} -m pip install seaborn

import seaborn as sns
from datetime import datetime
import os
import requests
from io import BytesIO
import gzip
from collections import defaultdict
import json


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/anaconda/envs/azureml_py310_sdkv2/bin/python -m pip install --upgrade pip[0m


In [2]:
def load_datasets():
    """
    Load the two required datasets directly
    """
    print("=== LOADING DATASETS ===")
    
    # Load enhanced POI data from pickle file
    import pickle
    with open('complete_analysis_results.pkl', 'rb') as f:
        results = pickle.load(f)
    
    enhanced_poi_df = results['enhanced_poi_df']
    print(f"Enhanced POI data loaded from pickle: {enhanced_poi_df.shape}")
    
    # Load task2 dataset from Zenodo
    print("Downloading task2 dataset...")
    task2_url = "https://zenodo.org/records/10142719/files/yjmob100k-dataset2.csv.gz"
    response = requests.get(task2_url)
    response.raise_for_status()
    
    with gzip.open(BytesIO(response.content), 'rt') as f:
        task2_df = pd.read_csv(f)
    
    print(f"Task2 dataset loaded: {task2_df.shape}")
    
    return enhanced_poi_df, task2_df

In [3]:
# ============================================================================
# DATA EXPLORATION
# ============================================================================

def explore_datasets(enhanced_poi_df, task2_df):
    """
    Quick exploration of both datasets
    """
    print("\n=== DATASET EXPLORATION ===")
    
    print(f"POI Dataset: {enhanced_poi_df.shape}")
    print(f"Columns: {list(enhanced_poi_df.columns)}")
    
    print(f"\nTask2 Dataset: {task2_df.shape}")
    print(f"Columns: {list(task2_df.columns)}")
    
    print(f"\nUsers in task2: {task2_df['uid'].nunique()}")
    print(f"POI coordinates: {len(enhanced_poi_df[['x', 'y']].drop_duplicates())}")
    print(f"Task2 coordinates: {len(task2_df[['x', 'y']].drop_duplicates())}")

def filter_users(task2_df, n_users=10000):
    """
    Filter to first N users
    """
    unique_users = sorted(task2_df['uid'].unique())[:n_users]
    filtered_df = task2_df[task2_df['uid'].isin(unique_users)].copy()
    print(f"\nFiltered to {n_users} users: {filtered_df.shape}")
    return filtered_df


In [4]:
# ============================================================================
# POI PREPARATION
# ============================================================================

def prepare_poi_data(enhanced_poi_df):
    """
    Aggregate POI data per coordinate
    """
    print("\n=== PREPARING POI DATA ===")
    
    # Check for multiple POIs per coordinate
    coord_counts = enhanced_poi_df.groupby(['x', 'y']).size()
    print(f"Coordinates with multiple POIs: {(coord_counts > 1).sum()}")
    
    # Aggregate POI data per coordinate
    agg_dict = {}
    if 'category' in enhanced_poi_df.columns:
        agg_dict['category'] = lambda x: '|'.join(x.dropna().astype(str).unique())
    if 'functional_group' in enhanced_poi_df.columns:
        agg_dict['functional_group'] = lambda x: '|'.join(x.dropna().astype(str).unique())
    if 'POI_count' in enhanced_poi_df.columns:
        agg_dict['POI_count'] = 'sum'
    if 'total_poi_count' in enhanced_poi_df.columns:
        agg_dict['total_poi_count'] = 'sum'
    if 'poi_proportion' in enhanced_poi_df.columns:
        agg_dict['poi_proportion'] = 'mean'
    
    poi_agg = enhanced_poi_df.groupby(['x', 'y']).agg(agg_dict).reset_index()
    
    # Add diversity metrics
    poi_agg['num_poi_types'] = enhanced_poi_df.groupby(['x', 'y'])['category'].nunique().values
    poi_agg['num_functional_groups'] = enhanced_poi_df.groupby(['x', 'y'])['functional_group'].nunique().values
    
    print(f"Aggregated POI data: {poi_agg.shape}")
    return poi_agg

In [5]:
# ============================================================================
# MERGE AND ENHANCE
# ============================================================================

def merge_datasets(task2_df, poi_agg):
    """
    Merge task2 data with POI data
    """
    print("\n=== MERGING DATASETS ===")
    
    # Perform LEFT JOIN to keep all mobility data
    merged_df = pd.merge(task2_df, poi_agg, on=['x', 'y'], how='left')
    
    print(f"Original task2: {task2_df.shape}")
    print(f"Merged dataset: {merged_df.shape}")
    
    # Fill missing POI data
    missing_poi_mask = merged_df['category'].isna()
    merged_df.loc[missing_poi_mask, 'category'] = 'No_POI'
    merged_df.loc[missing_poi_mask, 'functional_group'] = 'none'
    
    # Fill other POI columns
    poi_columns = ['POI_count', 'total_poi_count', 'poi_proportion', 'num_poi_types', 'num_functional_groups']
    for col in poi_columns:
        if col in merged_df.columns:
            merged_df.loc[missing_poi_mask, col] = 0
    
    print(f"Records with POI data: {(~missing_poi_mask).sum()}")
    print(f"Records without POI data: {missing_poi_mask.sum()}")
    
    return merged_df

def add_enhanced_features(merged_df):
    """
    Add comprehensive time and spatial features
    """
    print("\n=== ADDING ENHANCED FEATURES ===")
    
    # Time features
    merged_df['hour'] = (merged_df['t'] * 0.5).astype(int)
    merged_df['minute'] = (merged_df['t'] * 30) % 60
    
    # Detailed time categorization
    def get_detailed_time_period(hour):
        if 5 <= hour < 7:
            return 'Early_Morning'
        elif 7 <= hour < 9:
            return 'Morning_Rush'
        elif 9 <= hour < 12:
            return 'Late_Morning'
        elif 12 <= hour < 14:
            return 'Lunch_Time'
        elif 14 <= hour < 17:
            return 'Afternoon'
        elif 17 <= hour < 19:
            return 'Evening_Rush'
        elif 19 <= hour < 22:
            return 'Evening'
        elif 22 <= hour < 24:
            return 'Night'
        else:
            return 'Deep_Night'
    
    merged_df['time_period_detailed'] = merged_df['hour'].apply(get_detailed_time_period)
    
    # Day features
    merged_df['day_of_week'] = merged_df['d'] % 7
    merged_df['day_name'] = merged_df['day_of_week'].map({
        0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
        4: 'Friday', 5: 'Saturday', 6: 'Sunday'
    })
    merged_df['is_weekend'] = merged_df['day_of_week'] >= 5
    merged_df['is_weekday'] = merged_df['day_of_week'] < 5
    
    # Spatial features
    center_x, center_y = merged_df['x'].mean(), merged_df['y'].mean()
    merged_df['distance_from_center'] = np.sqrt(
        (merged_df['x'] - center_x)**2 + (merged_df['y'] - center_y)**2
    )
    
    # Spatial zones
    merged_df['distance_quartile'] = pd.qcut(merged_df['distance_from_center'], 
                                           q=4, labels=['Central', 'Inner', 'Outer', 'Peripheral'])
    
    # Grid quadrants
    merged_df['grid_quadrant'] = ((merged_df['x'] > merged_df['x'].median()).astype(int) * 2 + 
                                 (merged_df['y'] > merged_df['y'].median()).astype(int))
    merged_df['grid_quadrant_name'] = merged_df['grid_quadrant'].map({
        0: 'SW', 1: 'NW', 2: 'SE', 3: 'NE'
    })
    
    # POI-based features
    if 'total_poi_count' in merged_df.columns:
        merged_df['poi_density_category'] = pd.cut(merged_df['total_poi_count'], 
                                                  bins=[0, 1, 10, 50, float('inf')],
                                                  labels=['None', 'Low', 'Medium', 'High'])
        
        merged_df['poi_diversity_score'] = (merged_df['num_poi_types'] * merged_df['num_functional_groups'])
        merged_df['location_attractiveness'] = (merged_df['total_poi_count'] * 0.7 + 
                                              merged_df['poi_diversity_score'] * 0.3)
    
    # Movement features
    merged_df_sorted = merged_df.sort_values(['uid', 'd', 't'])
    merged_df_sorted['prev_x'] = merged_df_sorted.groupby('uid')['x'].shift(1)
    merged_df_sorted['prev_y'] = merged_df_sorted.groupby('uid')['y'].shift(1)
    
    merged_df_sorted['displacement'] = np.sqrt(
        (merged_df_sorted['x'] - merged_df_sorted['prev_x'])**2 + 
        (merged_df_sorted['y'] - merged_df_sorted['prev_y'])**2
    )
    merged_df_sorted['displacement'] = merged_df_sorted['displacement'].fillna(0)
    
    print(f"Enhanced dataset: {merged_df_sorted.shape}")
    return merged_df_sorted


In [6]:
# ============================================================================
# CREATE COMPREHENSIVE SCIENTIFIC DATASETS
# ============================================================================

def analyze_poi_categories(enhanced_poi_df):
    """
    Analyze POI categories for creating binary features
    """
    print("\n=== ANALYZING POI CATEGORIES ===")
    
    # Get all unique categories
    all_categories = []
    for cat_string in enhanced_poi_df['category'].dropna():
        if isinstance(cat_string, str) and '|' in cat_string:
            all_categories.extend(cat_string.split('|'))
        else:
            all_categories.append(str(cat_string))
    
    unique_categories = pd.Series(all_categories).value_counts()
    
    # Get all unique functional groups
    all_func_groups = []
    for func_string in enhanced_poi_df['functional_group'].dropna():
        if isinstance(func_string, str) and '|' in func_string:
            all_func_groups.extend(func_string.split('|'))
        else:
            all_func_groups.append(str(func_string))
    
    unique_func_groups = pd.Series(all_func_groups).value_counts()
    
    print(f"Total unique categories: {len(unique_categories)}")
    print(f"Total unique functional groups: {len(unique_func_groups)}")
    
    return unique_categories, unique_func_groups

def create_comprehensive_scientific_datasets(base_merged_df, unique_categories, unique_func_groups):
    """
    Create all 5 comprehensive scientific datasets
    """
    print("\n=== CREATING COMPREHENSIVE SCIENTIFIC DATASETS ===")
    
    # DATASET 1: Base mobility data with primary POI information
    print("\n1. Creating BASE MOBILITY dataset...")
    base_df = base_merged_df.copy()
    
    # Extract primary (first) category and functional group
    base_df['primary_category'] = base_df['category'].astype(str).str.split('|').str[0]
    base_df['primary_functional_group'] = base_df['functional_group'].astype(str).str.split('|').str[0]
    
    # Select base columns
    base_columns = ['uid', 'd', 't', 'x', 'y', 
                   'primary_category', 'primary_functional_group',
                   'total_poi_count', 'poi_proportion', 
                   'num_poi_types', 'num_functional_groups',
                   'hour', 'time_period_detailed', 'day_of_week', 'day_name',
                   'is_weekend', 'is_weekday', 'distance_from_center',
                   'distance_quartile', 'grid_quadrant_name', 'displacement']
    
    # Filter to available columns
    available_base_columns = [col for col in base_columns if col in base_df.columns]
    base_mobility_df = base_df[available_base_columns].copy()
    
    # Rename for clarity
    base_mobility_df = base_mobility_df.rename(columns={
        'uid': 'user_id',
        'd': 'day', 
        't': 'time_slot',
        'x': 'grid_x',
        'y': 'grid_y',
        'primary_category': 'location_category',
        'primary_functional_group': 'location_function',
        'total_poi_count': 'poi_density',
        'num_poi_types': 'category_diversity',
        'num_functional_groups': 'functional_diversity'
    })
    
    print(f"   Base mobility dataset shape: {base_mobility_df.shape}")
    
    # DATASET 2: POI presence matrix
    print("\n2. Creating POI PRESENCE MATRIX...")
    
    # Get top categories for binary encoding (limit to manageable number)
    top_categories = unique_categories.head(20).index.tolist()
    
    poi_matrix_df = base_mobility_df.copy()
    
    # Create binary columns for each top category
    for category in top_categories:
        col_name = f"has_{category.replace(' ', '_').replace('|', '_').lower()}"
        poi_matrix_df[col_name] = base_merged_df['category'].astype(str).str.contains(category, na=False).astype(int)
    
    # Create binary columns for functional groups
    for func_group in unique_func_groups.index:
        col_name = f"has_function_{func_group.replace(' ', '_').replace('|', '_').lower()}"
        poi_matrix_df[col_name] = base_merged_df['functional_group'].astype(str).str.contains(func_group, na=False).astype(int)
    
    print(f"   POI presence matrix shape: {poi_matrix_df.shape}")
    print(f"   Added {len(top_categories)} category binary features")
    print(f"   Added {len(unique_func_groups)} functional binary features")
    
    # DATASET 3: Functional group focused dataset
    print("\n3. Creating FUNCTIONAL GROUP dataset...")
    
    functional_df = base_mobility_df.copy()
    
    # Create functional group hierarchy
    def categorize_functional_group(func_group):
        if pd.isna(func_group) or func_group == 'none':
            return 'No_Function'
        func_group_str = str(func_group).lower()
        if 'food' in func_group_str or 'dining' in func_group_str:
            return 'Food_Services'
        elif 'shopping' in func_group_str or 'retail' in func_group_str:
            return 'Retail_Shopping'
        elif 'transport' in func_group_str:
            return 'Transportation'
        elif 'education' in func_group_str:
            return 'Education'
        elif 'health' in func_group_str or 'medical' in func_group_str:
            return 'Healthcare'
        elif 'entertainment' in func_group_str or 'recreation' in func_group_str:
            return 'Entertainment'
        elif 'service' in func_group_str:
            return 'Services'
        elif 'business' in func_group_str:
            return 'Business'
        elif 'religious' in func_group_str:
            return 'Religious'
        else:
            return 'Other'
    
    functional_df['functional_category'] = functional_df['location_function'].apply(categorize_functional_group)
    
    print(f"   Functional group dataset shape: {functional_df.shape}")
    func_dist = functional_df['functional_category'].value_counts()
    print(f"   Functional categories distribution:")
    for func, count in func_dist.items():
        print(f"     {func}: {count:,} ({count/len(functional_df)*100:.1f}%)")
    
    return base_mobility_df, poi_matrix_df, functional_df

def create_user_level_features(base_df):
    """
    Create comprehensive user-level features
    """
    print("\n=== CREATING USER-LEVEL FEATURES ===")
    
    # Calculate user-level statistics
    agg_dict = {
        'day': 'nunique',
        'time_slot': 'count',
        'grid_x': ['std', 'nunique', 'min', 'max'],
        'grid_y': ['std', 'nunique', 'min', 'max'],
        'distance_from_center': ['mean', 'std', 'max'],
        'is_weekend': 'mean'
    }
    
    # Add POI-related columns if available
    if 'poi_density' in base_df.columns:
        agg_dict['poi_density'] = ['mean', 'std', 'max']
    if 'category_diversity' in base_df.columns:
        agg_dict['category_diversity'] = ['mean', 'std']
    if 'functional_diversity' in base_df.columns:
        agg_dict['functional_diversity'] = ['mean', 'std']
    if 'displacement' in base_df.columns:
        agg_dict['displacement'] = ['mean', 'std', 'sum']
    
    user_features = base_df.groupby('user_id').agg(agg_dict).round(3)
    
    # Flatten column names
    user_features.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col 
                           for col in user_features.columns.values]
    
    # Rename for clarity
    column_mapping = {
        'day_nunique': 'active_days',
        'time_slot_count': 'total_records',
        'grid_x_std': 'mobility_x_std',
        'grid_x_nunique': 'unique_x_locations',
        'grid_y_std': 'mobility_y_std', 
        'grid_y_nunique': 'unique_y_locations',
        'distance_from_center_mean': 'avg_distance_from_center',
        'distance_from_center_std': 'spatial_range_variability',
        'distance_from_center_max': 'max_distance_from_center',
        'is_weekend_mean': 'weekend_activity_ratio'
    }
    
    # Add POI-related mappings if columns exist
    if 'poi_density_mean' in user_features.columns:
        column_mapping.update({
            'poi_density_mean': 'avg_poi_density',
            'poi_density_std': 'poi_density_variability'
        })
    
    if 'displacement_mean' in user_features.columns:
        column_mapping.update({
            'displacement_mean': 'avg_displacement',
            'displacement_std': 'displacement_variability',
            'displacement_sum': 'total_displacement'
        })
    
    user_features = user_features.rename(columns=column_mapping)
    
    # Calculate additional metrics
    user_features['mobility_radius'] = np.sqrt(
        user_features['mobility_x_std']**2 + user_features['mobility_y_std']**2
    )
    
    user_features['spatial_coverage'] = (
        user_features['unique_x_locations'] * user_features['unique_y_locations']
    )
    
    user_features['activity_intensity'] = (
        user_features['total_records'] / user_features['active_days']
    )
    
    print(f"User-level features dataset shape: {user_features.shape}")
    
    return user_features

def create_location_level_features(base_df):
    """
    Create comprehensive location-level features
    """
    print("\n=== CREATING LOCATION-LEVEL FEATURES ===")
    
    # Calculate location-level statistics
    agg_dict = {
        'user_id': 'nunique',
        'day': 'nunique', 
        'time_slot': 'count',
        'location_category': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
        'location_function': lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else 'Unknown',
        'distance_from_center': 'first',
        'is_weekend': 'mean'
    }
    
    # Add POI-related columns if available
    if 'poi_density' in base_df.columns:
        agg_dict['poi_density'] = 'first'
    if 'category_diversity' in base_df.columns:
        agg_dict['category_diversity'] = 'first'
    if 'functional_diversity' in base_df.columns:
        agg_dict['functional_diversity'] = 'first'
    if 'hour' in base_df.columns:
        agg_dict['hour'] = ['min', 'max', 'nunique']
    if 'displacement' in base_df.columns:
        agg_dict['displacement'] = 'mean'
    
    location_features = base_df.groupby(['grid_x', 'grid_y']).agg(agg_dict).round(3)
    
    # Flatten column names
    location_features.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col 
                               for col in location_features.columns.values]
    
    # Rename for clarity
    location_features = location_features.rename(columns={
        'user_id_nunique': 'unique_visitors',
        'day_nunique': 'active_days',
        'time_slot_count': 'total_visits',
        'is_weekend_mean': 'weekend_visit_ratio'
    })
    
    # Add hour-related columns if available
    if 'hour_min' in location_features.columns:
        location_features = location_features.rename(columns={
            'hour_min': 'earliest_visit_hour',
            'hour_max': 'latest_visit_hour',
            'hour_nunique': 'active_hours'
        })
        
        location_features['temporal_span'] = (
            location_features['latest_visit_hour'] - location_features['earliest_visit_hour']
        )
    
    # Calculate additional metrics
    location_features['popularity_score'] = (
        location_features['unique_visitors'] * location_features['total_visits']
    )
    
    print(f"Location-level features dataset shape: {location_features.shape}")
    
    return location_features

In [7]:
# ============================================================================
# EXPORT ALL DATASETS
# ============================================================================

def export_all_scientific_datasets(base_mobility_df, poi_matrix_df, functional_df, 
                                  user_features_df, location_features_df):
    """
    Export all 5 comprehensive scientific datasets
    """
    print("\n=== EXPORTING ALL SCIENTIFIC DATASETS ===")
    
    # Export datasets
    datasets = {
        'scientific_mobility_base_dataset.csv': base_mobility_df,
        'scientific_mobility_poi_matrix_dataset.csv': poi_matrix_df,
        'scientific_mobility_functional_dataset.csv': functional_df,
        'scientific_user_profiles_dataset.csv': user_features_df,
        'scientific_location_profiles_dataset.csv': location_features_df
    }
    
    exported_files = []
    for filename, dataset in datasets.items():
        dataset.to_csv(filename, index=False)
        exported_files.append(filename)
        print(f"Exported: {filename} ({dataset.shape[0]:,} rows, {dataset.shape[1]} cols)")
    
    # Create comprehensive documentation
    with open('comprehensive_datasets_documentation.md', 'w') as f:
        f.write("# Comprehensive Scientific Mobility Datasets Documentation\n\n")
        f.write("This package contains 5 comprehensive datasets for scientific mobility analysis.\n\n")
        
        f.write("## Dataset Overview\n\n")
        
        descriptions = {
            'scientific_mobility_base_dataset.csv': 'Core mobility data with primary POI information and temporal/spatial features',
            'scientific_mobility_poi_matrix_dataset.csv': 'Mobility data with binary POI presence indicators for detailed analysis',
            'scientific_mobility_functional_dataset.csv': 'Mobility data with hierarchical functional group categorization',
            'scientific_user_profiles_dataset.csv': 'Aggregated user-level features and mobility patterns',
            'scientific_location_profiles_dataset.csv': 'Aggregated location-level features and visit patterns'
        }
        
        for filename, description in descriptions.items():
            dataset = datasets[filename]
            f.write(f"### {filename}\n")
            f.write(f"- **Description**: {description}\n")
            f.write(f"- **Dimensions**: {dataset.shape[0]:,} rows × {dataset.shape[1]} columns\n")
            if 'user_id' in dataset.columns:
                f.write(f"- **Users**: {dataset['user_id'].nunique()}\n")
            f.write("\n")
        
        f.write("## Research Applications\n\n")
        f.write("These datasets are designed for:\n")
        f.write("- **Anomaly Detection**: Identify unusual mobility patterns\n")
        f.write("- **Temporal Analysis**: Study time-based mobility patterns\n")
        f.write("- **Spatial Analysis**: Analyze location-based behaviors\n")
        f.write("- **User Profiling**: Characterize different mobility types\n")
        f.write("- **Location Profiling**: Understand place characteristics\n")
        f.write("- **Multi-scale Analysis**: From individual to aggregate patterns\n\n")
    
    # Create summary metadata
    metadata = {
        'created_date': datetime.now().isoformat(),
        'datasets': {
            filename: {
                'shape': dataset.shape,
                'columns': list(dataset.columns),
                'users': dataset['user_id'].nunique() if 'user_id' in dataset.columns else 'N/A'
            }
            for filename, dataset in datasets.items()
        }
    }
    
    with open('comprehensive_datasets_metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print("Documentation: comprehensive_datasets_documentation.md")
    print("Metadata: comprehensive_datasets_metadata.json")
    
    return exported_files

In [8]:
# ============================================================================
# MAIN EXECUTION
# ============================================================================

def run_comprehensive_grid_poi_merge():
    """
    Execute the complete comprehensive merge process creating all 5 datasets
    """
    print("COMPREHENSIVE GRID-POI DATASET MERGE FOR IEEE RESEARCH")
    print("=" * 60)
    
    # Load datasets
    enhanced_poi_df, task2_df = load_datasets()
    
    # Explore
    explore_datasets(enhanced_poi_df, task2_df)
    
    # Filter users
    filtered_task2_df = filter_users(task2_df, n_users=10000)
    
    # Prepare POI data
    poi_agg = prepare_poi_data(enhanced_poi_df)
    
    # Merge datasets
    merged_df = merge_datasets(filtered_task2_df, poi_agg)
    
    # Add comprehensive features
    enhanced_df = add_enhanced_features(merged_df)
    
    # Analyze POI categories for binary features
    unique_categories, unique_func_groups = analyze_poi_categories(enhanced_poi_df)
    
    # Create all 5 comprehensive scientific datasets
    base_mobility_df, poi_matrix_df, functional_df = create_comprehensive_scientific_datasets(
        enhanced_df, unique_categories, unique_func_groups)
    
    # Create user and location profiles
    user_features_df = create_user_level_features(base_mobility_df)
    location_features_df = create_location_level_features(base_mobility_df)
    
    # Export all datasets
    exported_files = export_all_scientific_datasets(
        base_mobility_df, poi_matrix_df, functional_df, 
        user_features_df, location_features_df)
    
    print(f"\nCOMPREHENSIVE MERGE COMPLETE - ALL {len(exported_files)} DATASETS READY FOR IEEE RESEARCH")
    print("=" * 60)
    
    return {
        'base_mobility': base_mobility_df,
        'poi_matrix': poi_matrix_df,
        'functional': functional_df,
        'user_profiles': user_features_df,
        'location_profiles': location_features_df,
        'exported_files': exported_files
    }


In [9]:
results = run_comprehensive_grid_poi_merge()

COMPREHENSIVE GRID-POI DATASET MERGE FOR IEEE RESEARCH
=== LOADING DATASETS ===
Enhanced POI data loaded from pickle: (221159, 9)
Downloading task2 dataset...
Task2 dataset loaded: (29389749, 5)

=== DATASET EXPLORATION ===
POI Dataset: (221159, 9)
Columns: ['x', 'y', 'POIcategory', 'POI_count', 'id', 'category', 'functional_group', 'total_poi_count', 'poi_proportion']

Task2 Dataset: (29389749, 5)
Columns: ['uid', 'd', 't', 'x', 'y']

Users in task2: 25000
POI coordinates: 20146
Task2 coordinates: 30277

Filtered to 10000 users: (12247358, 5)

=== PREPARING POI DATA ===
Coordinates with multiple POIs: 17136
Aggregated POI data: (20146, 9)

=== MERGING DATASETS ===
Original task2: (12247358, 5)
Merged dataset: (12247358, 12)
Records with POI data: 11896914
Records without POI data: 350444

=== ADDING ENHANCED FEATURES ===
Enhanced dataset: (12247358, 29)

=== ANALYZING POI CATEGORIES ===
Total unique categories: 84
Total unique functional groups: 10

=== CREATING COMPREHENSIVE SCIENTIF

In [1]:

scientific_mobility_base_dataset.head()



NameError: name 'scientific_mobility_base_dataset' is not defined