In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json
import os

In [2]:
# Replace existing load functions with these Azure-compatible versions

def load_mobility_datasets():
    """Read the base and functional mobility CSVs and left-join them.

    Both files are opened by bare filename, i.e. relative to the current
    working directory. Only the ``functional_category`` column of the
    functional dataset is carried over; rows are matched on
    ``user_id``, ``day`` and ``time_slot``.

    Returns:
        The merged DataFrame, or ``None`` (after printing upload
        instructions) when either CSV cannot be found.
    """
    join_keys = ['user_id', 'day', 'time_slot']

    try:
        base_df = pd.read_csv("scientific_mobility_base_dataset.csv")
        functional_df = pd.read_csv("scientific_mobility_functional_dataset.csv")
    except FileNotFoundError:
        # The message lists the POI matrix as well, although this function
        # itself only reads the base and functional files.
        for message in (
            "ERROR: CSV files not found!",
            "Please ensure these files are uploaded to your Azure workspace:",
            "- scientific_mobility_base_dataset.csv",
            "- scientific_mobility_functional_dataset.csv",
            "- scientific_mobility_poi_matrix_dataset.csv",
        ):
            print(message)
        return None

    # Left join keeps every base record even when it has no functional label.
    functional_subset = functional_df[join_keys + ['functional_category']]
    return base_df.merge(functional_subset, on=join_keys, how='left')

In [3]:
def load_poi_presence_matrix():
    """Read the POI presence matrix CSV and list its ``has_*`` columns.

    The file is opened by bare filename (relative to the working directory).

    Returns:
        A 3-tuple ``(poi_df, poi_columns, poi_categories)``:
        * ``poi_df`` - the DataFrame as read from disk;
        * ``poi_columns`` - every column whose name starts with ``has_``;
        * ``poi_categories`` - for those ``has_*`` columns that either
          contain "category" (case-insensitive) or split into at most three
          underscore-separated parts, the name with ``has_`` removed.
        When the file is missing a warning is printed and
        ``(None, [], [])`` is returned.
    """
    try:
        poi_df = pd.read_csv("scientific_mobility_poi_matrix_dataset.csv")
    except FileNotFoundError:
        print("WARNING: POI matrix file not found. Continuing without POI features.")
        return None, [], []

    poi_columns = []
    poi_categories = []
    for column in poi_df.columns:
        if not column.startswith('has_'):
            continue
        poi_columns.append(column)

        looks_like_category = 'category' in column.lower() or len(column.split('_')) <= 3
        if looks_like_category:
            poi_categories.append(column.replace('has_', ''))

    return poi_df, poi_columns, poi_categories

In [4]:
def get_period_type(day):
    """Classify a day number into a study period.

    Returns ``'normal'`` when ``day <= 60``, ``'emergency'`` when
    ``60 < day <= 75`` and ``'post_emergency'`` for anything else.
    """
    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in ((60, 'normal'), (75, 'emergency')):
        if day <= upper_bound:
            return label
    return 'post_emergency'


In [5]:
def create_user_folders(base_path='UserMobilityTexts'):
    """Create the output folder tree for user mobility texts.

    Ensures ``base_path`` exists together with the sub-folders
    ``normal_period``, ``emergency_period``, ``daily_narratives``,
    ``sequences`` and ``vocabulary``. Calling it again on an existing tree
    is a no-op.

    Args:
        base_path: Root directory of the tree (created if missing).

    Returns:
        ``base_path`` unchanged, so the call can be chained.
    """
    subfolders = ('normal_period', 'emergency_period', 'daily_narratives', 'sequences', 'vocabulary')

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same directory in between.
    os.makedirs(base_path, exist_ok=True)
    for subfolder in subfolders:
        os.makedirs(os.path.join(base_path, subfolder), exist_ok=True)

    return base_path

In [6]:
def _coerce_int(value, default):
    """Return ``value`` as a plain ``int``; fall back to ``default`` for NaN, None or non-numeric input."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return default


# Step 3: Create time-based context descriptions
def generate_time_context(row):
    """Build natural-language time descriptors for one mobility record.

    ``row`` may be a dict or a pandas Series; only key membership (``in``)
    and item access are used. Missing fields are filled in as follows:

    * ``hour``   - from ``time_slot % 24`` when only ``time_slot`` exists, else 12.
    * ``minute`` - from ``(time_slot * 15) % 60`` when ``time_slot`` exists, else 0.
    * ``day_name`` - from ``day % 7`` with index 0 mapped to Monday, else "Monday".
    * ``is_weekend`` - from ``day % 7 >= 5``, else False.

    Numeric fields are coerced to ``int`` before use because rows taken from
    an all-numeric DataFrame via ``iterrows`` arrive as floats, which the
    ``02d`` format code and list indexing both reject. Values that cannot be
    converted (e.g. NaN) fall back to the defaults listed above.

    Returns:
        dict with keys ``time_str`` ("HH:MM"), ``period`` (early morning /
        morning / afternoon / evening / night), ``day_context`` and
        ``full_context``.
    """
    # NOTE(review): the hour fallback treats time_slot as an hour index while
    # the minute fallback treats it as a 15-minute interval index. These two
    # assumptions contradict each other; confirm the real slot width against
    # the dataset before relying on the derived clock time.
    if 'hour' in row:
        hour = _coerce_int(row['hour'], 12)
    elif 'time_slot' in row:
        hour = _coerce_int(row['time_slot'], 12) % 24
    else:
        hour = 12  # Default fallback

    if 'minute' in row:
        minute = _coerce_int(row['minute'], 0)
    elif 'time_slot' in row:
        minute = (_coerce_int(row['time_slot'], 0) * 15) % 60
    else:
        minute = 0  # Default fallback

    # Day number is only needed when name / weekend flag must be inferred.
    day_number = _coerce_int(row['day'], None) if 'day' in row else None

    if 'day_name' in row:
        day_name = row['day_name']
    elif day_number is not None:
        days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        day_name = days[day_number % 7]
    else:
        day_name = "Monday"  # Default fallback

    if 'is_weekend' in row:
        # The flag may arrive as the text 'True' (CSV round-trip) or a real boolean.
        is_weekend = bool(row['is_weekend'] == 'True' or row['is_weekend'] == True)
    elif day_number is not None:
        is_weekend = (day_number % 7) >= 5  # Saturday=5, Sunday=6
    else:
        is_weekend = False  # Default fallback

    time_str = f"{hour:02d}:{minute:02d}"

    if hour < 6:
        period = "early morning"
    elif hour < 12:
        period = "morning"
    elif hour < 17:
        period = "afternoon"
    elif hour < 21:
        period = "evening"
    else:
        period = "night"

    weekend_context = "weekend" if is_weekend else "weekday"

    return {
        'time_str': time_str,
        'period': period,
        'day_context': f"{day_name} {weekend_context}",
        'full_context': f"At {time_str} on {day_name} {period}"
    }


In [7]:

# Step 4: Create location context from POI density and categories
def generate_location_context(row):
    """Describe where a mobility record took place in plain language.

    ``row`` only needs a ``.get(key, default)`` method (dict or pandas
    Series). Absent fields fall back to: category "unknown location",
    function "general purpose", POI density 1000, category diversity 15 and
    distance quartile "moderate".

    Returns:
        dict with ``primary_location``, ``function``, ``density_desc``,
        ``diversity_desc``, ``center_context`` and a ``full_description``
        sentence fragment that combines them.
    """
    category = row.get('location_category', 'unknown location')
    function = row.get('location_function', 'general purpose')
    density = row.get('poi_density', 1000)
    diversity = row.get('category_diversity', 15)
    quartile = row.get('distance_quartile', 'moderate')

    # Thresholds are strict: exactly 1500 / 800 fall into the lower band.
    density_desc = (
        "high-density urban area" if density > 1500
        else "moderate-density area" if density > 800
        else "low-density area"
    )

    diversity_desc = (
        "with diverse amenities" if diversity > 25
        else "with several amenities" if diversity > 15
        else "with limited amenities"
    )

    # Non-text quartiles (numbers, NaN) get a generic phrase instead.
    center_context = (
        f"{quartile.lower()} from city center"
        if isinstance(quartile, str)
        else "moderate distance from city center"
    )

    return {
        'primary_location': category,
        'function': function,
        'density_desc': density_desc,
        'diversity_desc': diversity_desc,
        'center_context': center_context,
        'full_description': f"{category} in {density_desc} {center_context} {diversity_desc}"
    }

In [8]:
# Step 5: Generate activity context based on functional category
def generate_activity_context(functional_category, location_function, time_period):
    """Infer a likely activity from a functional category and time of day.

    Args:
        functional_category: Category label such as ``'food_dining'`` or
            ``'residential'``. Unknown, empty or missing (None / NaN) values
            map to ``'general activities'``.
        location_function: Accepted for interface compatibility; not used
            by the inference.
        time_period: Period label; only ``'morning'`` and ``'evening'``
            trigger a refinement.

    Returns:
        dict with ``inferred_activity``, the original
        ``functional_category`` and a ``context`` phrase
        ("engaged in <activity>").
    """
    activity_mapping = {
        'food_dining': ['dining', 'eating', 'food shopping'],
        'retail_shopping': ['shopping', 'browsing stores', 'purchasing items'],
        'entertainment': ['entertainment', 'leisure activities', 'recreation'],
        'professional': ['work', 'business activities', 'professional meetings'],
        'healthcare': ['medical appointment', 'health services', 'wellness'],
        'education': ['learning', 'educational activities', 'studying'],
        'transportation': ['transit', 'traveling', 'commuting'],
        'residential': ['home activities', 'residential area', 'living']
    }

    # Normalise to a known key (or None) so the comparisons below are safe
    # for NaN and other non-string inputs.
    known_category = functional_category if functional_category in activity_mapping else None
    activities = activity_mapping.get(known_category, ['general activities'])

    # Time-based refinement. The evening rule is keyed on the category: the
    # residential activity list never contains the bare word 'residential',
    # so a membership test on the list could not match.
    if time_period == 'morning' and known_category == 'professional':
        activity = 'commuting to work'
    elif time_period == 'evening' and known_category == 'residential':
        activity = 'returning home'
    else:
        activity = activities[0]

    return {
        'inferred_activity': activity,
        'functional_category': functional_category,
        'context': f"engaged in {activity}"
    }

In [9]:
def create_mobility_sentence(row):
    """Turn one mobility record into sentences plus the context behind them.

    Delegates to ``generate_time_context``, ``generate_location_context``
    and ``generate_activity_context`` and then renders four sentence
    variants (``detailed``, ``medium``, ``simple``, ``coordinate``).

    Args:
        row: A record exposing ``.get(key, default)`` and key membership
            (dict or pandas Series).

    Returns:
        dict with:
        * ``sentences`` - the four variants keyed by style name;
        * ``contexts``  - the ``time`` / ``location`` / ``activity`` dicts;
        * ``metadata``  - grid coordinates, a ``Day_<d>_Slot_<s>`` timestamp,
          POI density and distance from centre (defaults 0, 0, 1000, 5.0).
    """
    time_ctx = generate_time_context(row)
    location_ctx = generate_location_context(row)
    activity_ctx = generate_activity_context(
        row.get('functional_category', None),
        row.get('location_function', 'general purpose'),
        time_ctx['period'],
    )

    grid_x, grid_y = row.get('grid_x', 0), row.get('grid_y', 0)

    # The short variants fall back to the bare word 'location', unlike the
    # location context which uses 'unknown location'.
    place = row.get('location_category', 'location')
    clock = time_ctx['time_str']

    sentences = {
        'detailed': f"{time_ctx['full_context']}, the user visited {location_ctx['full_description']} and {activity_ctx['context']}.",
        'medium': f"At {clock}, user was at {place} in {location_ctx['density_desc']} {activity_ctx['context']}.",
        'simple': f"{time_ctx['period']}: {place} - {activity_ctx['inferred_activity']}",
        'coordinate': f"Grid({grid_x}, {grid_y}) at {clock}: {place}",
    }

    contexts = {
        'time': time_ctx,
        'location': location_ctx,
        'activity': activity_ctx,
    }

    metadata = {
        'grid_coords': (grid_x, grid_y),
        'timestamp': f"Day_{row.get('day', 0)}_Slot_{row.get('time_slot', 0)}",
        'poi_density': row.get('poi_density', 1000),
        'distance_from_center': row.get('distance_from_center', 5.0),
    }

    return {'sentences': sentences, 'contexts': contexts, 'metadata': metadata}

In [10]:
# Step 7: Generate daily mobility narratives
def create_daily_narrative(user_data, narrative_style='medium'):
    """Stitch one user's records for a day into a flowing narrative.

    Records are ordered by ``time_slot`` and each is rendered through
    ``create_mobility_sentence``. The first sentence is introduced with
    "The day began:", the last with "Finally," and the ones in between
    with a connector chosen by position.

    Args:
        user_data: DataFrame holding the records of a single user-day.
        narrative_style: Which sentence variant to use (a key of the
            ``sentences`` dict returned by ``create_mobility_sentence``).

    Returns:
        dict with ``full_narrative`` (single string),
        ``individual_sentences``, ``sentence_contexts`` (the full
        per-record results) and ``total_visits``.
    """
    connectors = ["Then,", "Next,", "Later,", "Subsequently,"]

    ordered_rows = user_data.sort_values('time_slot')
    contexts = [create_mobility_sentence(row) for _, row in ordered_rows.iterrows()]
    daily_sentences = [entry['sentences'][narrative_style] for entry in contexts]

    final_position = len(daily_sentences) - 1
    narrative_parts = []
    for position, sentence in enumerate(daily_sentences):
        if position == 0:
            # A single-visit day takes this branch, never "Finally,".
            part = f"The day began: {sentence}"
        elif position == final_position:
            part = f"Finally, {sentence}"
        else:
            part = f"{connectors[position % len(connectors)]} {sentence}"
        narrative_parts.append(part)

    return {
        'full_narrative': ' '.join(narrative_parts),
        'individual_sentences': daily_sentences,
        'sentence_contexts': contexts,
        'total_visits': len(daily_sentences)
    }

In [11]:
# UPDATED: Generate mobility corpus separated by periods
def generate_mobility_corpus_by_period(df, output_style='medium'):
    """Build the narrative corpus, split into normal and emergency periods.

    Every user found in ``df`` gets an entry (possibly empty) under both
    ``'normal_period'`` and ``'emergency_period'``. Each user-day is
    classified with ``get_period_type``; days classified as anything else
    (i.e. ``'post_emergency'``) are left out of the corpus.

    Args:
        df: DataFrame with at least ``user_id`` and ``day`` columns plus
            whatever ``create_daily_narrative`` needs.
        output_style: Sentence variant forwarded to
            ``create_daily_narrative``.

    Returns:
        ``{period_key: {'user_<id>': {'day_<n>': narrative_dict}}}``.
    """
    mobility_corpus = {
        'normal_period': {},
        'emergency_period': {}
    }

    # groupby partitions the frame once instead of re-scanning it with a
    # boolean mask for every user and every day; sort=False keeps users and
    # days in order of first appearance, as the mask-based loop did.
    for user_id, user_data in df.groupby('user_id', sort=False):
        user_key = f'user_{user_id}'
        mobility_corpus['normal_period'][user_key] = {}
        mobility_corpus['emergency_period'][user_key] = {}

        for day, day_data in user_data.groupby('day', sort=False):
            period_type = get_period_type(day)
            if period_type not in ('normal', 'emergency'):
                continue

            narrative = create_daily_narrative(day_data, output_style)
            mobility_corpus[f'{period_type}_period'][user_key][f'day_{day}'] = narrative

    return mobility_corpus

In [12]:
# NEW FUNCTION: Save individual user files
def save_user_files_by_period(mobility_corpus, base_path):
    """Write one JSON file and one readable text file per user and period.

    For every period key in ``mobility_corpus`` the folder
    ``<base_path>/<period>`` is created if needed. Users whose data for
    that period is empty are skipped. Files are named
    ``<user>_<period>.json`` and ``<user>_<period>_readable.txt`` and are
    written as UTF-8.

    Args:
        mobility_corpus: ``{period: {user_id: {day_id: narrative_dict}}}``
            where each narrative dict provides ``full_narrative``,
            ``individual_sentences`` and ``total_visits``.
        base_path: Root output directory.
    """
    for period_type, period_data in mobility_corpus.items():
        period_folder = os.path.join(base_path, period_type)
        # Do not rely on a prior folder-setup call having used the same base path.
        os.makedirs(period_folder, exist_ok=True)

        for user_id, user_data in period_data.items():
            if not user_data:  # Only save if user has data for this period
                continue

            # Full JSON dump, including all contexts
            user_file = os.path.join(period_folder, f'{user_id}_{period_type}.json')
            with open(user_file, 'w', encoding='utf-8') as f:
                json.dump(user_data, f, indent=2)

            # Human-readable companion file
            text_file = os.path.join(period_folder, f'{user_id}_{period_type}_readable.txt')
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(f"=== {user_id.upper()} - {period_type.upper()} ===\n\n")

                for day_id, day_data in user_data.items():
                    f.write(f"\n--- {day_id.upper()} ---\n")
                    f.write(f"Full Narrative:\n{day_data['full_narrative']}\n\n")
                    f.write("Individual Sentences:\n")
                    for i, sentence in enumerate(day_data['individual_sentences'], 1):
                        f.write(f"{i}. {sentence}\n")
                    f.write(f"\nTotal visits: {day_data['total_visits']}\n")
                    f.write("-" * 80 + "\n")


In [13]:
# NEW FUNCTION: Export training data by period
def export_training_data_by_period(mobility_corpus, base_path):
    """Export each period of the corpus as JSONL and as plain text.

    For every period key two UTF-8 files are written into
    ``<base_path>/<period>`` (created if missing):

    * ``training_data_<period>.jsonl`` - one JSON object per user-day with
      ``user_id``, ``day``, ``period_type``, ``text``, ``sentences`` and a
      small ``metadata`` dict;
    * ``gpt_training_<period>.txt`` - a header line followed by every
      narrative, each separated by a blank line.

    Args:
        mobility_corpus: ``{period: {user_id: {day_id: narrative_dict}}}``.
        base_path: Root output directory.
    """
    for period_type, period_data in mobility_corpus.items():
        period_folder = os.path.join(base_path, period_type)
        # Guarantee the target folder instead of assuming it was pre-created.
        os.makedirs(period_folder, exist_ok=True)

        # JSONL format for language model training
        jsonl_file = os.path.join(period_folder, f'training_data_{period_type}.jsonl')
        with open(jsonl_file, 'w', encoding='utf-8') as f:
            for user_id, user_data in period_data.items():
                for day_id, day_data in user_data.items():
                    sample = {
                        'user_id': user_id,
                        'day': day_id,
                        'period_type': period_type,
                        'text': day_data['full_narrative'],
                        'sentences': day_data['individual_sentences'],
                        'metadata': {
                            'total_visits': day_data['total_visits'],
                            'period': period_type
                        }
                    }
                    f.write(json.dumps(sample) + '\n')

        # Plain text format for GPT training
        txt_file = os.path.join(period_folder, f'gpt_training_{period_type}.txt')
        with open(txt_file, 'w', encoding='utf-8') as f:
            f.write(f"=== MOBILITY TEXTS - {period_type.upper()} ===\n\n")

            for user_data in period_data.values():
                for day_data in user_data.values():
                    f.write(day_data['full_narrative'] + '\n')
                    f.write('\n')
                    f.write('\n')

In [14]:
# NEW FUNCTION: Create user summaries
def create_user_summary_files(mobility_corpus, base_path):
    """Write a per-user text summary covering the normal and emergency periods.

    One file ``<base_path>/user_summaries/<user>_complete_summary.txt`` is
    produced for every user key appearing in any period of the corpus. For
    each of the two periods the summary lists the number of days, the day
    ids, the total visit count and up to three sample narratives. A period
    for which the user has no entry, or only an empty one, is reported as
    "No data available for this period".

    Args:
        mobility_corpus: ``{period: {user_id: {day_id: narrative_dict}}}``.
            Either period key may be absent.
        base_path: Root output directory; the ``user_summaries`` folder is
            created underneath it when missing.
    """
    summary_folder = os.path.join(base_path, 'user_summaries')
    os.makedirs(summary_folder, exist_ok=True)

    # Get all unique users across all periods present in the corpus
    all_users = set()
    for period_data in mobility_corpus.values():
        all_users.update(period_data.keys())

    for user_id in all_users:
        summary_file = os.path.join(summary_folder, f'{user_id}_complete_summary.txt')

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write(f"=== COMPLETE MOBILITY SUMMARY FOR {user_id.upper()} ===\n\n")

            for period_type in ['normal_period', 'emergency_period']:
                f.write(f"\n{'='*50}\n")
                f.write(f"{period_type.upper().replace('_', ' ')}\n")
                f.write(f"{'='*50}\n")

                # Users may be registered with an empty dict for a period;
                # treat that the same as having no entry at all.
                user_data = mobility_corpus.get(period_type, {}).get(user_id)
                if not user_data:
                    f.write("No data available for this period\n")
                    continue

                f.write(f"Total days: {len(user_data)}\n")
                f.write(f"Days covered: {list(user_data.keys())}\n\n")

                total_visits = sum(day_data['total_visits'] for day_data in user_data.values())
                f.write(f"Total visits in period: {total_visits}\n\n")

                # Sample narratives: first 3 days only
                for day_id, day_data in list(user_data.items())[:3]:
                    f.write(f"\nSample - {day_id}:\n")
                    f.write(f"{day_data['full_narrative']}\n")
                    f.write("-" * 40 + "\n")

                if len(user_data) > 3:
                    f.write(f"\n... and {len(user_data) - 3} more days\n")


In [15]:
# UPDATED: Build vocabulary from period-separated mobility text corpus
def build_mobility_vocabulary_updated(mobility_corpus):
    """Collect the distinct terms used across a mobility corpus.

    Accepts either layout:
    * period-separated - both ``'normal_period'`` and ``'emergency_period'``
      keys present, each mapping user -> day -> narrative dict;
    * legacy - the corpus maps user -> day -> narrative dict directly.

    Day entries without ``sentence_contexts`` are ignored.

    Returns:
        dict of sorted lists keyed by ``locations``, ``activities``,
        ``time_periods``, ``density_terms`` and ``spatial_terms`` (the last
        one is never filled by this function and is always empty).
    """
    period_separated = 'normal_period' in mobility_corpus and 'emergency_period' in mobility_corpus
    if period_separated:
        per_user = [
            user_days
            for period_users in mobility_corpus.values()
            for user_days in period_users.values()
        ]
    else:
        per_user = list(mobility_corpus.values())

    locations, activities, time_periods, density_terms = set(), set(), set(), set()

    for user_days in per_user:
        for day_entry in user_days.values():
            if 'sentence_contexts' not in day_entry:
                continue
            for record in day_entry['sentence_contexts']:
                parts = record['contexts']
                locations.add(parts['location']['primary_location'])
                activities.add(parts['activity']['inferred_activity'])
                time_periods.add(parts['time']['period'])
                density_terms.add(parts['location']['density_desc'])

    return {
        'locations': sorted(locations),
        'activities': sorted(activities),
        'time_periods': sorted(time_periods),
        'density_terms': sorted(density_terms),
        'spatial_terms': sorted(set()),
    }

In [16]:

# Step 10: Export functions for different use cases
def export_for_language_model_training(mobility_corpus, output_file='Users/agugire/mobility_training_data.jsonl'):
    """Flatten the corpus into training samples and save them as JSONL.

    Accepts either layout:
    * period-separated - both ``'normal_period'`` and ``'emergency_period'``
      keys present; each sample then carries a ``period`` field;
    * legacy - user -> day -> narrative dict; samples have no ``period``.

    Args:
        mobility_corpus: The corpus in one of the layouts above. Every
            narrative dict must provide ``full_narrative``,
            ``individual_sentences``, ``total_visits`` and
            ``sentence_contexts`` (with ``metadata.grid_coords``).
        output_file: Destination path. Its parent directory is created when
            it does not exist yet.
            NOTE(review): the default points into a user-specific relative
            folder; callers should normally pass an explicit path.

    Returns:
        The list of sample dicts that was written, one per user-day.
    """
    if 'normal_period' in mobility_corpus and 'emergency_period' in mobility_corpus:
        entries = [
            (user_id, day_id, day_data, {'period': period_type})
            for period_type, period_data in mobility_corpus.items()
            for user_id, user_data in period_data.items()
            for day_id, day_data in user_data.items()
        ]
    else:
        # Old structure: no period information available
        entries = [
            (user_id, day_id, day_data, {})
            for user_id, user_data in mobility_corpus.items()
            for day_id, day_data in user_data.items()
        ]

    training_data = []
    for user_id, day_id, day_data, period_field in entries:
        training_data.append({
            'user_id': user_id,
            'day': day_id,
            **period_field,
            'text': day_data['full_narrative'],
            'sentences': day_data['individual_sentences'],
            'metadata': {
                'total_visits': day_data['total_visits'],
                'coordinates': [ctx['metadata']['grid_coords']
                                for ctx in day_data['sentence_contexts']]
            }
        })

    # open() does not create missing folders, so make sure the target exists.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save as JSONL for easy loading
    with open(output_file, 'w', encoding='utf-8') as f:
        for sample in training_data:
            f.write(json.dumps(sample) + '\n')

    return training_data

In [17]:
def export_for_sequence_modeling(mobility_corpus, sequence_length=10):
    """Cut each user's sentences into overlapping fixed-length windows.

    Accepts either the period-separated layout (both ``'normal_period'``
    and ``'emergency_period'`` keys present) or the legacy
    user -> day -> narrative layout. In the period-separated layout a
    user's sentences are windowed separately for each period.

    Windows advance by half the sequence length (at least one sentence),
    and only full-length windows are emitted.

    Args:
        mobility_corpus: The corpus in one of the layouts above.
        sequence_length: Number of sentences per window; must be >= 1.

    Returns:
        List of dicts with ``input_sequence`` (window minus last item),
        ``target_sequence`` (window minus first item) and
        ``full_sequence``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # sequence_length // 2 is 0 for a length of 1, which range() rejects.
    step = max(1, sequence_length // 2)

    if 'normal_period' in mobility_corpus and 'emergency_period' in mobility_corpus:
        per_user = [
            user_data
            for period_data in mobility_corpus.values()
            for user_data in period_data.values()
        ]
    else:
        # Old structure
        per_user = list(mobility_corpus.values())

    sequences = []
    for user_data in per_user:
        # Collect all sentences for user, in day order
        user_sentences = [
            sentence
            for day_data in user_data.values()
            for sentence in day_data['individual_sentences']
        ]

        # Create overlapping sequences; the stop bound guarantees full windows
        for start in range(0, len(user_sentences) - sequence_length + 1, step):
            window = user_sentences[start:start + sequence_length]
            sequences.append({
                'input_sequence': window[:-1],
                'target_sequence': window[1:],
                'full_sequence': window
            })

    return sequences

In [18]:
# UPDATED: Extract mobility patterns from period-separated corpus
def extract_mobility_patterns(corpus):
    """Count the most frequent activities per time-of-day, day type and period.

    Accepts the period-separated layout (both ``'normal_period'`` and
    ``'emergency_period'`` keys present) or the legacy
    user -> day -> narrative layout. With the legacy layout the two
    period-specific lists stay empty. Day entries lacking
    ``sentence_contexts`` are skipped.

    Returns:
        dict mapping ``morning_activities``, ``evening_activities``,
        ``weekend_patterns``, ``weekday_patterns``,
        ``normal_period_patterns`` and ``emergency_period_patterns`` to a
        list of up to ten ``(activity, count)`` pairs, most common first.
    """
    from collections import Counter

    buckets = {
        'morning_activities': [],
        'evening_activities': [],
        'weekend_patterns': [],
        'weekday_patterns': [],
        'normal_period_patterns': [],
        'emergency_period_patterns': []
    }

    if 'normal_period' in corpus and 'emergency_period' in corpus:
        tagged_users = [
            (period_type, user_days)
            for period_type, period_users in corpus.items()
            for user_days in period_users.values()
        ]
    else:
        # Legacy layout carries no period tag.
        tagged_users = [(None, user_days) for user_days in corpus.values()]

    for period_type, user_days in tagged_users:
        for day_entry in user_days.values():
            if 'sentence_contexts' not in day_entry:
                continue

            for record in day_entry['sentence_contexts']:
                time_ctx = record['contexts']['time']
                activity = record['contexts']['activity']['inferred_activity']

                # Time-of-day buckets (only morning and evening are tracked)
                time_of_day = time_ctx['period']
                if time_of_day == 'morning':
                    buckets['morning_activities'].append(activity)
                elif time_of_day == 'evening':
                    buckets['evening_activities'].append(activity)

                # Day-type bucket, decided by the word in the day context
                is_weekend = 'weekend' in time_ctx['day_context']
                buckets['weekend_patterns' if is_weekend else 'weekday_patterns'].append(activity)

                # Period bucket
                if period_type == 'normal_period':
                    buckets['normal_period_patterns'].append(activity)
                elif period_type == 'emergency_period':
                    buckets['emergency_period_patterns'].append(activity)

    # An empty bucket yields an empty list from most_common as well.
    return {
        pattern_type: Counter(observed).most_common(10)
        for pattern_type, observed in buckets.items()
    }

In [19]:
# UPDATED: Analyze user narratives for period-separated data
def analyze_user_narratives_updated(corpus, user_id='user_0'):
    """Print a per-day overview of one user's narratives.

    With the period-separated layout (both ``'normal_period'`` and
    ``'emergency_period'`` keys present) the two periods are reported in
    turn; otherwise ``corpus`` is treated as user -> day -> narrative.
    For each day the visit count and the first 100 characters of the
    narrative are printed. Nothing is returned.
    """
    def _print_days(user_days):
        print(f"Total days: {len(user_days)}")
        for day_id, day_entry in user_days.items():
            print(f"\n{day_id}:")
            print(f"  Visits: {day_entry['total_visits']}")
            print(f"  Narrative: {day_entry['full_narrative'][:100]}...")

    print(f"\nAnalysis for {user_id}:")

    period_separated = 'normal_period' in corpus and 'emergency_period' in corpus
    if not period_separated:
        # Old structure
        if user_id in corpus:
            _print_days(corpus[user_id])
        else:
            print(f"User {user_id} not found")
        return

    for period_type in ('normal_period', 'emergency_period'):
        print(f"\n--- {period_type.upper().replace('_', ' ')} ---")

        period_users = corpus[period_type]
        if user_id in period_users:
            _print_days(period_users[user_id])
        else:
            print(f"No data for {user_id} in {period_type}")

In [20]:
def create_sample_outputs(df, num_users=3):
    """Print demo narratives for the first few users in ``df``.

    For each of the first ``num_users`` distinct ``user_id`` values (in
    order of appearance) the day of that user's first record is picked and
    rendered through ``create_daily_narrative`` in the ``simple``,
    ``medium`` and ``detailed`` styles, followed by the list of
    ``(grid_x, grid_y)`` pairs for that day. Nothing is returned.
    """
    narrative_styles = ('simple', 'medium', 'detailed')

    for user_id in df['user_id'].unique()[:num_users]:
        print(f"\n{'='*50}")
        print(f"SAMPLE OUTPUT FOR USER {user_id}")
        print(f"{'='*50}")

        user_rows = df[df['user_id'] == user_id]
        first_day = user_rows['day'].iloc[0]
        day_rows = user_rows[user_rows['day'] == first_day]

        # Show different narrative styles
        for style in narrative_styles:
            rendered = create_daily_narrative(day_rows, style)
            print(f"\n{style.upper()} STYLE:")
            print(f"{rendered['full_narrative']}")

        print(f"\nCOORDINATE SEQUENCE:")
        # Coordinates are listed in frame order, not sorted by time slot.
        coords = []
        for _, record in day_rows.iterrows():
            coords.append((record['grid_x'], record['grid_y']))
        print(f"Grid coordinates: {coords}")

In [21]:
# UPDATED: Demonstrate text-to-coordinate mapping for period-separated data
def demonstrate_text_to_coordinate_mapping_updated(corpus, user_id='user_0', day='day_0'):
    """Print how each sentence of one user-day maps back to grid coordinates.

    With the period-separated layout (both ``'normal_period'`` and
    ``'emergency_period'`` keys present) the normal period is searched
    first, then the emergency period, and the matching period is named in
    the output. Otherwise ``corpus`` is treated as
    user -> day -> narrative. In both layouts a "Data not found" line is
    printed when the user/day combination does not exist.

    Args:
        corpus: The corpus in one of the layouts above.
        user_id: User key, e.g. ``'user_0'``.
        day: Day key, e.g. ``'day_0'``.
    """
    print(f"\nTEXT-TO-COORDINATE MAPPING:")

    if 'normal_period' in corpus and 'emergency_period' in corpus:
        candidates = [(period_type, corpus[period_type])
                      for period_type in ('normal_period', 'emergency_period')]
    else:
        # Old structure: a single, unnamed user table
        candidates = [(None, corpus)]

    for period_type, users in candidates:
        if user_id not in users or day not in users[user_id]:
            continue

        day_data = users[user_id][day]

        if period_type is not None:
            print(f"Period: {period_type.replace('_', ' ').title()}")
        print(f"Narrative: {day_data['full_narrative']}")
        print(f"\nBreakdown:")

        paired = zip(day_data['individual_sentences'], day_data['sentence_contexts'])
        for position, (sentence, context) in enumerate(paired, 1):
            coords = context['metadata']['grid_coords']
            timestamp = context['metadata']['timestamp']

            print(f"{position}. {sentence}")
            print(f"   → Coordinates: {coords}, Time: {timestamp}")

        # Only the first matching period is shown.
        return

    print(f"Data not found for {user_id} on {day}")


In [22]:
# NEW FUNCTION: Analyze patterns by period
def analyze_patterns_by_period(corpus):
    """Print a normal-vs-emergency comparison of visit volume and activities.

    For each of the two period keys the function reports the number of users,
    days and visits, the average visits per day, and the five most frequent
    inferred activities.

    Args:
        corpus: Period-separated corpus with 'normal_period' and
            'emergency_period' keys, each mapping user -> day -> day record.
            Every day record is read for 'total_visits'; 'sentence_contexts'
            is optional.

    Returns:
        None. Everything is written to stdout.
    """
    from collections import Counter

    period_keys = ['normal_period', 'emergency_period']
    if not all(key in corpus for key in period_keys):
        print("No period separation found in corpus")
        return

    print("\n=== PERIOD COMPARISON ANALYSIS ===")

    for period_key in period_keys:
        print(f"\n--- {period_key.upper().replace('_', ' ')} ---")

        users = corpus[period_key]
        # Flatten user -> day -> record once; every statistic below walks it.
        day_records = [record for days in users.values() for record in days.values()]
        day_count = len(day_records)
        visit_count = sum(record['total_visits'] for record in day_records)

        print(f"Users: {len(users)}")
        print(f"Total days: {day_count}")
        print(f"Total visits: {visit_count}")
        if day_count > 0:
            print(f"Avg visits per day: {visit_count/day_count:.1f}")
        else:
            print("No data")

        # Tally the inferred activity of every visit that carries context data.
        activity_counts = Counter(
            visit['contexts']['activity']['inferred_activity']
            for record in day_records
            if 'sentence_contexts' in record
            for visit in record['sentence_contexts']
        )

        if activity_counts:
            print("Top activities:")
            for name, occurrences in activity_counts.most_common(5):
                print(f"  {name}: {occurrences}")

def prepare_for_bert_training(corpus, mask_probability=0.15):
    """Build sentence-level masked examples for BERT-style training.

    Every sentence of every day is considered once; with probability
    ``mask_probability`` an example is emitted in which that sentence is
    replaced by "[MASK]" inside the day's space-joined sentence list.

    Args:
        corpus: Either a period-separated corpus (has both 'normal_period' and
            'emergency_period' keys, each mapping user -> day -> record) or the
            older flat user -> day -> record layout. Each record is read for
            'individual_sentences', which must be a list.
        mask_probability: Chance, per sentence, of producing a masked example.

    Returns:
        List of dicts with keys 'input', 'target' and 'mask_position'.
    """
    period_separated = 'normal_period' in corpus and 'emergency_period' in corpus
    # A flat corpus is handled as one anonymous period so a single loop serves both layouts.
    user_maps = corpus.values() if period_separated else [corpus]

    examples = []
    for users in user_maps:
        for days in users.values():
            for record in days.values():
                sentences = record['individual_sentences']

                for position, _ in enumerate(sentences):
                    # One random draw per sentence decides whether it gets masked.
                    if np.random.random() < mask_probability:
                        masked = sentences.copy()
                        hidden = masked[position]
                        masked[position] = "[MASK]"
                        examples.append({
                            'input': ' '.join(masked),
                            'target': hidden,
                            'mask_position': position
                        })

    return examples


In [23]:
# UPDATED MAIN FUNCTION: For regular mobility with period separation
def main_with_period_separation():
    """Run the period-separated text-generation pipeline end to end.

    Steps: create the output folder tree, load the mobility data, generate a
    corpus for each narrative style ('simple', 'medium', 'detailed'), save the
    per-user files and training exports for each style, write user summaries
    from a medium-style corpus, and write 'dataset_statistics.txt'.

    Days <= 60 are reported as the normal period and days 61-75 as the
    emergency period.

    Returns:
        Tuple ``(medium_corpus, base_path)``. ``(None, None)`` is returned when
        the mobility datasets could not be loaded.
    """

    print("Creating folder structure...")
    base_path = create_user_folders()

    print("Loading mobility datasets...")
    df = load_mobility_datasets()
    if df is None:
        # The loader returns None (after printing its own message) when the CSVs are missing.
        print("Cannot proceed without mobility datasets")
        return None, None
    print(f"Loaded {len(df)} mobility records for {df['user_id'].nunique()} users")

    # Row masks for the two periods, reused for the console report and the stats file.
    normal_mask = df['day'] <= 60
    emergency_mask = (df['day'] > 60) & (df['day'] <= 75)

    # Check period distribution
    normal_days = df[normal_mask]['day'].nunique()
    emergency_days = df[emergency_mask]['day'].nunique()

    print(f"Normal period: {normal_days} days (0-60)")
    print(f"Emergency period: {emergency_days} days (61-75)")

    print("\nGenerating mobility text corpus by period...")

    # Generate different narrative styles
    styles = ['simple', 'medium', 'detailed']

    for style in styles:
        print(f"\nProcessing {style} narrative style...")

        # Generate corpus separated by period
        corpus_by_period = generate_mobility_corpus_by_period(df, output_style=style)

        # Create the style folder and both period subfolders. exist_ok=True
        # means the subfolders are also created on a re-run where the style
        # folder already exists but one of its period folders does not.
        style_path = os.path.join(base_path, f'{style}_style')
        for period in ['normal_period', 'emergency_period']:
            os.makedirs(os.path.join(style_path, period), exist_ok=True)

        # Save individual user files
        save_user_files_by_period(corpus_by_period, style_path)

        # Export training data by period
        export_training_data_by_period(corpus_by_period, style_path)

        print(f"  Normal period: {len(corpus_by_period['normal_period'])} users")
        print(f"  Emergency period: {len(corpus_by_period['emergency_period'])} users")

    # Create user summaries (using medium style)
    print("\nCreating user summary files...")
    medium_corpus = generate_mobility_corpus_by_period(df, output_style='medium')
    create_user_summary_files(medium_corpus, base_path)

    # Create overall statistics
    stats_file = os.path.join(base_path, 'dataset_statistics.txt')
    with open(stats_file, 'w') as f:
        f.write("=== MOBILITY DATASET STATISTICS ===\n\n")
        f.write(f"Total users: {df['user_id'].nunique()}\n")
        f.write(f"Total records: {len(df)}\n")
        f.write(f"Total days: {df['day'].nunique()}\n")
        f.write(f"Normal period days: 0-60 ({normal_days} days)\n")
        f.write(f"Emergency period days: 61-75 ({emergency_days} days)\n\n")

        # Period-specific stats
        for period_name, period_filter in [('Normal', normal_mask),
                                           ('Emergency', emergency_mask)]:
            period_data = df[period_filter]
            period_users = period_data['user_id'].nunique()
            # A period with no records has no users; report 0.0 instead of dividing by zero.
            avg_records = len(period_data) / period_users if period_users else 0.0
            f.write(f"{period_name} Period:\n")
            f.write(f"  Users: {period_users}\n")
            f.write(f"  Records: {len(period_data)}\n")
            f.write(f"  Avg records per user: {avg_records:.1f}\n\n")

    print(f"\n Period-separated mobility text generation completed!")
    print(f" All files saved in: {base_path}")

    return medium_corpus, base_path


In [24]:
# UPDATED ANALYSIS FUNCTION
def run_analysis_with_period_separation(corpus):
    """Run the pattern, period, vocabulary and per-user reports on a corpus.

    Calls, in order: extract_mobility_patterns, analyze_patterns_by_period,
    build_mobility_vocabulary_updated, analyze_user_narratives_updated and
    demonstrate_text_to_coordinate_mapping_updated. The last two are run for
    the fixed sample keys 'user_0' and 'day_5'.

    Args:
        corpus: Corpus object handed unchanged to each analysis function.

    Returns:
        Tuple ``(patterns, vocabulary)`` as produced by the extraction and
        vocabulary functions.
    """
    rule = '=' * 60

    print(f"\n{rule}")
    print("MOBILITY PATTERNS ANALYSIS")
    print(rule)

    # Top five entries of every non-empty pattern group.
    patterns = extract_mobility_patterns(corpus)
    for pattern_type, activities in patterns.items():
        if not activities:
            continue
        print(f"\n{pattern_type.upper()}:")
        for activity, count in activities[:5]:
            print(f"  {activity}: {count}")

    # Normal vs emergency comparison.
    analyze_patterns_by_period(corpus)

    # Vocabulary sizes per vocabulary type.
    vocabulary = build_mobility_vocabulary_updated(corpus)
    print(f"\n{rule}")
    print("VOCABULARY ANALYSIS")
    print(rule)
    for vocab_type, words in vocabulary.items():
        print(f"{vocab_type}: {len(words)} unique terms")

    # Sample user / sample day walkthroughs.
    sample_user = 'user_0'
    analyze_user_narratives_updated(corpus, sample_user)
    demonstrate_text_to_coordinate_mapping_updated(corpus, sample_user, 'day_5')

    return patterns, vocabulary

In [25]:
# POI Binary Matrix Integration Functions

def load_poi_presence_matrix(path="scientific_mobility_poi_matrix_dataset.csv"):
    """Load the POI presence matrix and identify its feature columns.

    An earlier notebook cell defines a function with the same name; once this
    cell has run, this definition is the one in effect. It therefore keeps the
    earlier version's missing-file behaviour (warn and return an empty result)
    so callers that check for ``poi_df is None`` keep working.

    Args:
        path: CSV file to read. Defaults to the POI matrix file name used
            throughout the notebook.

    Returns:
        Tuple ``(poi_df, poi_binary_cols, function_binary_cols)`` where
        ``poi_binary_cols`` are the columns whose name starts with 'has_' and
        ``function_binary_cols`` are the columns whose name contains
        'function' (case-insensitive). ``(None, [], [])`` when the file does
        not exist.
    """
    try:
        poi_df = pd.read_csv(path)
    except FileNotFoundError:
        print("WARNING: POI matrix file not found. Continuing without POI features.")
        return None, [], []

    # Identify POI binary columns
    poi_binary_cols = [col for col in poi_df.columns if col.startswith('has_')]
    function_binary_cols = [col for col in poi_df.columns if 'function' in col.lower()]

    return poi_df, poi_binary_cols, function_binary_cols

def merge_with_poi_features(mobility_df, poi_df):
    """Attach POI feature columns to the mobility records by row position.

    Columns of ``poi_df`` that already exist in ``mobility_df`` are dropped
    first. The remaining columns are concatenated side by side only when both
    frames have the same number of rows; rows are paired purely by position
    (both indexes are reset), not by any key column.

    Args:
        mobility_df: Mobility records, or None.
        poi_df: POI presence matrix, or None.

    Returns:
        None when ``mobility_df`` is None; ``mobility_df`` itself when
        ``poi_df`` is None or the row counts differ; otherwise a new frame
        holding the mobility columns followed by the new POI columns.
    """
    if mobility_df is None:
        print("ERROR: mobility_df is None")
        return None
    if poi_df is None:
        print("ERROR: poi_df is None")
        return mobility_df

    print(f"Mobility DF shape: {mobility_df.shape}")
    print(f"POI DF shape: {poi_df.shape}")

    # Keep only the POI columns the mobility frame does not already have.
    known_columns = mobility_df.columns
    new_columns = [name for name in poi_df.columns if name not in known_columns]
    poi_only = poi_df[new_columns]

    print(f"POI columns to add: {len(new_columns)}")

    if len(mobility_df) != len(poi_only):
        print("Different lengths - using mobility data only")
        combined = mobility_df
    else:
        print("Same length - merging by index")
        aligned = [frame.reset_index(drop=True) for frame in (mobility_df, poi_only)]
        combined = pd.concat(aligned, axis=1)

    print(f"Merged DataFrame shape: {combined.shape}")
    return combined

In [26]:
def extract_active_pois(row, poi_binary_cols):
    """List the readable names of the flag columns that are set for a record.

    Args:
        row: Mapping-like record (supports ``in`` and ``[]``), e.g. a pandas
            Series or a dict.
        poi_binary_cols: Column names to inspect, in the order the result
            should follow.

    Returns:
        For every listed column that is present in ``row`` with a value equal
        to 1, the column name with 'has_' removed and underscores turned into
        spaces.
    """
    return [
        flag.replace('has_', '').replace('_', ' ')
        for flag in poi_binary_cols
        if flag in row and row[flag] == 1
    ]

In [27]:
def generate_poi_enriched_context(row, poi_binary_cols, function_binary_cols):
    """Describe the POIs and functions flagged on a record, as data and text.

    Args:
        row: Record passed through to extract_active_pois.
        poi_binary_cols: POI flag columns to inspect.
        function_binary_cols: Function flag columns to inspect.

    Returns:
        Dict with:
          'nearby_pois'          first five active POI names
          'available_functions'  first three active function names
          'poi_richness'         number of active POIs
          'functional_diversity' number of active functions
          'poi_description'      phrase naming the POIs (wording depends on count)
          'function_description' phrase naming all active functions
          'full_poi_context'     the two phrases joined by a space
    """
    pois = extract_active_pois(row, poi_binary_cols)
    functions = extract_active_pois(row, function_binary_cols)
    poi_count = len(pois)

    # Wording scales with how many POIs are active: none, one, a short list, or a summary.
    if not pois:
        poi_desc = "in area with basic amenities"
    elif poi_count == 1:
        poi_desc = f"near {pois[0]}"
    elif poi_count <= 3:
        leading = ', '.join(pois[:-1])
        poi_desc = f"near {leading} and {pois[-1]}"
    else:
        poi_desc = f"in area with {pois[0]}, {pois[1]} and {poi_count-2} other amenities"

    function_desc = f"offering {', '.join(functions)}" if functions else "with standard services"

    return {
        'nearby_pois': pois[:5],  # capped for readability
        'available_functions': functions[:3],
        'poi_richness': poi_count,
        'functional_diversity': len(functions),
        'poi_description': poi_desc,
        'function_description': function_desc,
        'full_poi_context': f"{poi_desc} {function_desc}"
    }

In [28]:
def create_poi_enriched_sentence(row, poi_binary_cols, function_binary_cols):
    """Create mobility sentences for one record, enriched with POI information.

    Args:
        row: One mobility record (pandas Series or dict). Read for 'hour',
            'minute', 'day_name', 'is_weekend', 'location_category',
            'location_function', 'poi_density', 'category_diversity',
            'distance_from_center', 'distance_quartile', 'grid_x', 'grid_y',
            'day', 'time_slot' and, optionally, 'functional_category'.
        poi_binary_cols: POI flag columns forwarded to the POI context builder.
        function_binary_cols: Function flag columns forwarded likewise.

    Returns:
        Dict with:
          'sentences' four renderings keyed 'poi_detailed', 'poi_medium',
                      'poi_simple' and 'poi_rich'
          'contexts'  the time, location, activity and poi context dicts
          'metadata'  grid coordinates, a 'Day_<d>_Slot_<s>' timestamp and the
                      POI / function counts
    """

    # The weekend flag may arrive as a real boolean (pandas parses True/False
    # CSV columns to bool) or as text. Comparing against the string 'True'
    # alone never matches a boolean, so normalise through str() first.
    is_weekend = str(row['is_weekend']).strip().lower() == 'true'

    # Generate base contexts
    time_ctx = generate_time_context(
        row['hour'], row['minute'],
        row['day_name'], is_weekend
    )

    location_ctx = generate_location_context(
        row['location_category'], row['location_function'],
        row['poi_density'], row['category_diversity'],
        row['distance_from_center'], row['distance_quartile']
    )

    activity_ctx = generate_activity_context(
        row.get('functional_category', None),
        row['location_function'],
        time_ctx['period']
    )

    # Generate POI-enriched context
    poi_ctx = generate_poi_enriched_context(row, poi_binary_cols, function_binary_cols)

    # Create enhanced sentences
    enhanced_sentences = {
        'poi_detailed': f"{time_ctx['full_context']}, the user visited {row['location_category']} {poi_ctx['full_poi_context']} and {activity_ctx['context']}.",

        'poi_medium': f"At {time_ctx['time_str']}, user was at {row['location_category']} {poi_ctx['poi_description']} {activity_ctx['context']}.",

        'poi_simple': f"{time_ctx['period']}: {row['location_category']} {poi_ctx['poi_description']} - {activity_ctx['inferred_activity']}",

        'poi_rich': f"During {time_ctx['day_context']} at {time_ctx['time_str']}, user engaged in {activity_ctx['inferred_activity']} at {row['location_category']} in {location_ctx['density_desc']} {poi_ctx['full_poi_context']}."
    }

    return {
        'sentences': enhanced_sentences,
        'contexts': {
            'time': time_ctx,
            'location': location_ctx,
            'activity': activity_ctx,
            'poi': poi_ctx
        },
        'metadata': {
            'grid_coords': (row['grid_x'], row['grid_y']),
            'timestamp': f"Day_{row['day']}_Slot_{row['time_slot']}",
            'poi_count': poi_ctx['poi_richness'],
            'function_count': poi_ctx['functional_diversity']
        }
    }

In [29]:
def generate_poi_vocabulary(merged_df, poi_binary_cols, function_binary_cols):
    """Collect the POI / function names that occur anywhere in the data.

    Args:
        merged_df: DataFrame whose rows are scanned one by one.
        poi_binary_cols: POI flag columns.
        function_binary_cols: Function flag columns.

    Returns:
        Dict of sorted lists:
          'poi_types'        every POI name active on at least one row
          'function_types'   every function name active on at least one row
          'poi_combinations' for rows with two or more active POIs, the first
                             three names sorted and joined with ' + '
    """
    seen_pois = set()
    seen_functions = set()
    seen_combos = set()

    for _, record in merged_df.iterrows():
        pois = extract_active_pois(record, poi_binary_cols)
        functions = extract_active_pois(record, function_binary_cols)

        seen_pois.update(pois)
        seen_functions.update(functions)

        # Co-occurring POIs are recorded as one order-independent label.
        if len(pois) >= 2:
            seen_combos.add(' + '.join(sorted(pois[:3])))

    return {
        'poi_types': sorted(seen_pois),
        'function_types': sorted(seen_functions),
        'poi_combinations': sorted(seen_combos)
    }


In [30]:
# UPDATED: Create POI-enhanced corpus separated by periods
def create_poi_enhanced_corpus_by_period(df_with_poi, poi_binary_cols, function_binary_cols, style='poi_medium'):
    """Create a POI-enhanced narrative corpus split into normal / emergency periods.

    Args:
        df_with_poi: Mobility records with POI columns merged in, or None.
            Read for 'user_id', 'day' and 'time_slot'; each row is passed to
            create_poi_enriched_sentence.
        poi_binary_cols: POI flag columns forwarded to the sentence builder.
        function_binary_cols: Function flag columns forwarded likewise.
        style: Which sentence rendering to use ('poi_detailed', 'poi_medium',
            'poi_simple' or 'poi_rich').

    Returns:
        ``{'normal_period': {...}, 'emergency_period': {...}}``. Each period
        maps 'user_<id>' -> 'day_<n>' -> a record with 'full_narrative',
        'individual_sentences', 'sentence_contexts', 'total_visits',
        'period_type', 'day_number', 'poi_richness' and 'avg_poi_per_visit'.
        Every user gets an entry in both periods, empty when the user has no
        days there. Days whose period type is neither 'normal' nor 'emergency'
        are skipped. Both periods are empty when the frame is None or has no
        'user_id' column.
    """

    enhanced_corpus = {
        'normal_period': {},
        'emergency_period': {}
    }

    if df_with_poi is None:
        print("ERROR: df_with_poi is None")
        return enhanced_corpus

    if 'user_id' not in df_with_poi.columns:
        print(f"ERROR: 'user_id' column not found")
        return enhanced_corpus

    # A merge can leave two 'user_id' columns, in which case the selection is
    # a DataFrame. Reduce it to its first column once so that BOTH the list of
    # users and the per-user row mask are built from the same 1-D Series. A
    # 2-D mask would not filter rows.
    user_ids = df_with_poi['user_id']
    if isinstance(user_ids, pd.DataFrame):
        user_ids = user_ids.iloc[:, 0]

    connectors = ["Then,", "Next,", "Later,", "Subsequently,"]

    for user_id in user_ids.unique():
        user_data = df_with_poi[user_ids == user_id]
        user_key = f'user_{user_id}'

        # Initialize user data for both periods
        enhanced_corpus['normal_period'][user_key] = {}
        enhanced_corpus['emergency_period'][user_key] = {}

        for day in user_data['day'].unique():
            day_data = user_data[user_data['day'] == day]
            if len(day_data) == 0:
                continue

            # Determine period type; anything other than the two known periods is ignored.
            period_type = get_period_type(day)
            if period_type not in ['normal', 'emergency']:
                continue
            period_key = f'{period_type}_period'

            daily_sentences = []
            contexts = []

            # Visits are narrated in time-slot order.
            for _, row in day_data.sort_values('time_slot').iterrows():
                sentence_data = create_poi_enriched_sentence(
                    row, poi_binary_cols, function_binary_cols
                )
                daily_sentences.append(sentence_data['sentences'][style])
                contexts.append(sentence_data)

            # Create narrative: opener for the first visit, closer for the
            # last, rotating connectors in between.
            narrative_parts = []
            last_index = len(daily_sentences) - 1
            for i, sentence in enumerate(daily_sentences):
                if i == 0:
                    narrative_parts.append(f"The day began: {sentence}")
                elif i == last_index:
                    narrative_parts.append(f"Finally, {sentence}")
                else:
                    connector = connectors[i % len(connectors)]
                    narrative_parts.append(f"{connector} {sentence}")

            # Summed once and reused for both the total and the average.
            poi_total = sum(ctx['metadata']['poi_count'] for ctx in contexts)

            enhanced_corpus[period_key][user_key][f'day_{day}'] = {
                'full_narrative': ' '.join(narrative_parts),
                'individual_sentences': daily_sentences,
                'sentence_contexts': contexts,
                'total_visits': len(daily_sentences),
                'period_type': period_type,
                'day_number': day,
                'poi_richness': poi_total,
                'avg_poi_per_visit': poi_total / len(contexts) if contexts else 0
            }

    return enhanced_corpus

In [31]:
def analyze_poi_patterns(enhanced_corpus):
    """Analyze POI usage patterns from a period-separated enhanced corpus.

    Args:
        enhanced_corpus: Corpus with 'normal_period' and 'emergency_period'
            keys, each mapping user -> day -> record. Records are read for
            'sentence_contexts'; each context supplies
            ``contexts.poi.nearby_pois``, ``contexts.time.period`` and
            ``contexts.activity.inferred_activity``. A corpus without both
            period keys yields empty results.

    Returns:
        Dict with:
          'poi_sequences'             one list of POI names per day that had any
          'time_poi_associations'     time period -> up to 10 (poi, count) pairs
          'activity_poi_associations' activity -> up to 10 (poi, count) pairs
          'poi_diversity_by_user'     user -> number of distinct POIs seen
                                      across ALL periods
    """
    from collections import Counter

    poi_patterns = {
        'poi_sequences': [],
        'time_poi_associations': defaultdict(list),
        'activity_poi_associations': defaultdict(list),
        'poi_diversity_by_user': {}
    }

    # Distinct POIs per user, accumulated across periods. The same user key
    # appears under both periods, so a per-period set would let the later
    # period overwrite the earlier period's count.
    pois_by_user = defaultdict(set)

    # Handle period-separated structure
    if 'normal_period' in enhanced_corpus and 'emergency_period' in enhanced_corpus:
        for period_data in enhanced_corpus.values():
            for user_id, user_data in period_data.items():
                # Touching the entry registers users with no POIs at all (count 0).
                user_pois = pois_by_user[user_id]

                for day_data in user_data.values():
                    day_poi_sequence = []

                    for context in day_data.get('sentence_contexts', ()):
                        nearby = context['contexts']['poi']['nearby_pois']
                        time_ctx = context['contexts']['time']
                        activity_ctx = context['contexts']['activity']

                        if not nearby:
                            continue

                        # Collect POI sequences
                        day_poi_sequence.extend(nearby)
                        user_pois.update(nearby)

                        # Time-POI and activity-POI associations
                        poi_patterns['time_poi_associations'][time_ctx['period']].extend(nearby)
                        poi_patterns['activity_poi_associations'][activity_ctx['inferred_activity']].extend(nearby)

                    if day_poi_sequence:
                        poi_patterns['poi_sequences'].append(day_poi_sequence)

    for user_id, user_pois in pois_by_user.items():
        poi_patterns['poi_diversity_by_user'][user_id] = len(user_pois)

    # Replace each raw POI list with its ten most common (poi, count) pairs.
    for pattern_type in ['time_poi_associations', 'activity_poi_associations']:
        associations = poi_patterns[pattern_type]
        for key in list(associations):
            associations[key] = Counter(associations[key]).most_common(10)

    return poi_patterns

In [32]:

def create_trajectory_templates(enhanced_corpus):
    """Turn every day of a period-separated corpus into a trajectory template.

    A day template groups the day's visits under 'early morning', 'morning',
    'afternoon', 'evening' and 'night', and also keeps the day's flat
    'poi_flow' and 'activity_flow'. A visit whose time period is not a key of
    the template is filed under 'early morning' if its label contains 'early',
    under 'night' if it contains 'night', and under 'morning' otherwise.

    Args:
        enhanced_corpus: Corpus with 'normal_period' and 'emergency_period'
            keys, each mapping user -> day -> record. Days without
            'sentence_contexts' are skipped. A corpus lacking either period
            key produces empty template lists.

    Returns:
        Dict with:
          'weekend_patterns' / 'workday_patterns' day templates, split on
              whether any visit's day_context contains 'weekend'
          'morning_routines' early-morning + morning visits of each day (if any)
          'evening_routines' evening + night visits of each day (if any)
    """
    templates = {
        'morning_routines': [],
        'evening_routines': [],
        'weekend_patterns': [],
        'workday_patterns': []
    }

    if not ('normal_period' in enhanced_corpus and 'emergency_period' in enhanced_corpus):
        return templates

    slot_names = ['early morning', 'morning', 'afternoon', 'evening', 'night']

    # Walk period -> user -> day lazily as one flat stream of day records.
    all_days = (
        day_record
        for users in enhanced_corpus.values()
        for days in users.values()
        for day_record in days.values()
    )

    for day_record in all_days:
        if 'sentence_contexts' not in day_record:
            continue
        visits = day_record['sentence_contexts']

        day_template = {slot: [] for slot in slot_names}
        day_template['poi_flow'] = []
        day_template['activity_flow'] = []

        for visit in visits:
            parts = visit['contexts']
            time_period = parts['time']['period']
            activity = parts['activity']['inferred_activity']
            pois = parts['poi']['nearby_pois']

            # Resolve which bucket the visit belongs to, then append once.
            bucket = time_period
            if bucket not in day_template:
                label = time_period.lower()
                if 'early' in label:
                    bucket = 'early morning'
                elif 'night' in label:
                    bucket = 'night'
                else:
                    bucket = 'morning'  # default fallback

            day_template[bucket].append({
                'activity': activity,
                'pois': pois,
                'location': parts['location']['primary_location']
            })

            day_template['poi_flow'].extend(pois)
            day_template['activity_flow'].append(activity)

        # A single weekend-flagged visit marks the whole day as a weekend day.
        on_weekend = any('weekend' in visit['contexts']['time']['day_context']
                         for visit in visits)
        target = 'weekend_patterns' if on_weekend else 'workday_patterns'
        templates[target].append(day_template)

        morning_block = day_template['early morning'] + day_template['morning']
        if morning_block:
            templates['morning_routines'].append(morning_block)

        evening_block = day_template['evening'] + day_template['night']
        if evening_block:
            templates['evening_routines'].append(evening_block)

    return templates

In [33]:
def prepare_bert_training_with_poi(enhanced_corpus, mask_poi_probability=0.2, mask_probability=0.15):
    """Prepare BERT training data with both sentence masking and POI masking.

    Two kinds of example are produced per sentence, each decided by its own
    random draw:

    * 'full_sentence': the sentence is replaced by "[MASK]" inside the day's
      space-joined sentence list.
    * 'poi_specific': every occurrence of each nearby POI name inside the
      sentence is replaced by "[POI_MASK]". Only attempted when the sentence
      has nearby POIs.

    Args:
        enhanced_corpus: Corpus with 'normal_period' and 'emergency_period'
            keys, each mapping user -> day -> record. Days missing
            'individual_sentences' or 'sentence_contexts' are skipped. A
            corpus without both period keys yields an empty list.
        mask_poi_probability: Chance, per sentence with POIs, of a
            'poi_specific' example.
        mask_probability: Chance, per sentence, of a 'full_sentence' example.
            Defaults to 0.15, the value that used to be hard-coded here.

    Returns:
        List of example dicts. 'full_sentence' examples carry 'input',
        'target', 'mask_type' and 'poi_count'; 'poi_specific' examples carry
        'input', 'target', 'mask_type' and 'masked_pois'.
    """

    bert_poi_training = []

    # Handle period-separated structure
    if 'normal_period' in enhanced_corpus and 'emergency_period' in enhanced_corpus:
        for period_data in enhanced_corpus.values():
            for user_data in period_data.values():
                for day_data in user_data.values():
                    if 'individual_sentences' not in day_data or 'sentence_contexts' not in day_data:
                        continue

                    sentences = day_data['individual_sentences']
                    contexts = day_data['sentence_contexts']

                    for i, (sentence, context) in enumerate(zip(sentences, contexts)):
                        # Standard sentence masking
                        if np.random.random() < mask_probability:
                            masked_sentences = sentences.copy()
                            masked_sentences[i] = "[MASK]"

                            bert_poi_training.append({
                                'input': ' '.join(masked_sentences),
                                'target': sentence,
                                'mask_type': 'full_sentence',
                                'poi_count': context['metadata']['poi_count']
                            })

                        # POI-specific masking. The random draw is only made
                        # when the sentence actually has nearby POIs.
                        nearby_pois = context['contexts']['poi']['nearby_pois']
                        if nearby_pois and np.random.random() < mask_poi_probability:
                            masked_sentence = sentence
                            for poi in nearby_pois:
                                masked_sentence = masked_sentence.replace(poi, '[POI_MASK]')

                            bert_poi_training.append({
                                'input': masked_sentence,
                                'target': sentence,
                                'mask_type': 'poi_specific',
                                'masked_pois': nearby_pois
                            })

    return bert_poi_training


In [34]:
def generate_synthetic_trajectories(templates, poi_vocab, num_trajectories=10):
    """Generate synthetic day trajectories by replaying randomly chosen templates.

    For each trajectory a template type ('workday_patterns' or
    'weekend_patterns', whichever are non-empty) and then one template of that
    type are drawn with ``np.random.choice``. Every visit in the template
    becomes one sentence; a visit without POIs borrows up to three random POIs
    from ``poi_vocab['poi_types']`` when that list is non-empty.

    Args:
        templates: Dict with 'workday_patterns' and 'weekend_patterns' lists of
            day templates (period name -> list of visits with 'activity',
            'pois' and 'location').
        poi_vocab: Dict whose 'poi_types' entry lists POI names to sample from.
        num_trajectories: Number of trajectories to attempt.

    Returns:
        List of dicts with 'synthetic_id', 'template_type',
        'generated_sentences' and 'poi_sequence'. Trajectories that produced
        no sentence are left out; generation stops with a warning when neither
        template list has entries.
    """
    period_order = ('early morning', 'morning', 'afternoon', 'evening', 'night')
    trajectories = []

    for index in range(num_trajectories):
        # Only template types that actually hold templates are eligible.
        candidates = [
            kind
            for kind in ('workday_patterns', 'weekend_patterns')
            if templates[kind]
        ]
        if not candidates:
            print("WARNING: No templates available, skipping synthetic trajectory generation")
            break

        template_type = np.random.choice(candidates)
        template = np.random.choice(templates[template_type])

        sentences = []
        poi_sequence = []

        for period in period_order:
            if period not in template or not template[period]:
                continue

            for visit in template[period]:
                chosen_pois = visit['pois']
                if not chosen_pois:
                    # Nothing recorded for this visit: sample from the vocabulary if possible.
                    vocab_pois = poi_vocab['poi_types']
                    if vocab_pois:
                        sample_size = min(3, len(vocab_pois))
                        chosen_pois = np.random.choice(vocab_pois,
                                                       size=sample_size,
                                                       replace=False).tolist()
                    else:
                        chosen_pois = []

                if chosen_pois:
                    poi_desc = f"near {', '.join(chosen_pois[:2])}"
                else:
                    poi_desc = "in area"

                sentences.append(
                    f"During {period}, user {visit['activity']} at {visit['location']} {poi_desc}."
                )
                poi_sequence.extend(chosen_pois)

        # Templates that yielded no sentence are not reported.
        if sentences:
            trajectories.append({
                'synthetic_id': f'synth_{index}',
                'template_type': template_type,
                'generated_sentences': sentences,
                'poi_sequence': poi_sequence
            })

    print(f"Generated {len(trajectories)} synthetic trajectories")
    return trajectories

In [35]:
def _write_gpt_day_records(handle, users):
    """Write every day of a user -> day -> record mapping to an open text file."""
    for user_data in users.values():
        for day_data in user_data.values():
            # Full narrative first, then one line per sentence for fine-grained training.
            handle.write(day_data['full_narrative'] + '\n')
            for sentence in day_data['individual_sentences']:
                handle.write(sentence + '\n')
            handle.write('\n')  # Separator between days


def export_for_gpt_training(enhanced_corpus, output_file='Users/agugire/gpt_mobility_training.txt'):
    """Export a corpus as plain text for GPT-style autoregressive training.

    For a period-separated corpus each period is introduced by a
    '=== <PERIOD NAME> ===' header; a flat user -> day corpus is written
    without headers. Each day contributes its full narrative, then its
    individual sentences one per line, then a blank line.

    Args:
        enhanced_corpus: Period-separated corpus (has both 'normal_period' and
            'emergency_period') or the older flat user -> day -> record
            layout. Records are read for 'full_narrative' and
            'individual_sentences'.
        output_file: Destination path. Missing parent directories are created.

    Returns:
        None. The file is written as UTF-8 and a confirmation is printed.
    """
    # The default path points into a sub-directory; create it so open() does
    # not fail on a fresh workspace.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding so the output does not depend on the platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        if 'normal_period' in enhanced_corpus and 'emergency_period' in enhanced_corpus:
            for period_type, period_data in enhanced_corpus.items():
                f.write(f"=== {period_type.upper().replace('_', ' ')} ===\n\n")
                _write_gpt_day_records(f, period_data)
        else:
            # Old structure
            _write_gpt_day_records(f, enhanced_corpus)

    print(f"Exported GPT training data to {output_file}")

In [36]:
def create_poi_feature_embeddings(poi_vocab, embedding_dim=100, seed=None):
    """Create randomly initialised embeddings for POI features.

    Args:
        poi_vocab: Mapping with 'poi_types' and 'function_types' entries
            (concatenated with ``+``, so both are expected to be lists).
        embedding_dim: Width of each embedding vector.
        seed: Optional seed for a private random generator so the initial
            embeddings are reproducible. ``None`` (default) draws from
            NumPy's global random state, as before.

    Returns:
        dict with 'poi_to_id', 'id_to_poi', 'embeddings' (array of shape
        ``(vocab_size, embedding_dim)``), 'vocab_size' and 'embedding_dim'.

    A name that occurs more than once (within or across the two lists) is
    given a single id, so 'poi_to_id' and 'id_to_poi' stay exact inverses
    and no embedding row is left unreachable.
    """
    # dict.fromkeys drops repeated names while keeping first-seen order
    all_pois = list(dict.fromkeys(poi_vocab['poi_types'] + poi_vocab['function_types']))

    poi_to_id = {poi: idx for idx, poi in enumerate(all_pois)}
    id_to_poi = {idx: poi for idx, poi in enumerate(all_pois)}

    # Initialize random embeddings (in practice, you'd train these)
    if seed is None:
        embeddings = np.random.randn(len(all_pois), embedding_dim)
    else:
        embeddings = np.random.RandomState(seed).randn(len(all_pois), embedding_dim)

    return {
        'poi_to_id': poi_to_id,
        'id_to_poi': id_to_poi,
        'embeddings': embeddings,
        'vocab_size': len(all_pois),
        'embedding_dim': embedding_dim
    }

In [37]:
def main_poi_pipeline_with_periods():
    """Run the POI-enhanced, period-separated pipeline from start to finish.

    Creates the output folder, loads the mobility data, merges in the POI
    features, builds the corpus and writes every derived artifact under the
    output folder.

    Returns:
        (enhanced_corpus, poi_vocab, patterns, templates), or four ``None``
        values when the mobility datasets could not be loaded.
    """
    print("Creating POI folder structure...")
    out_dir = create_user_folders('POI_UserMobilityTexts')

    print("Loading datasets...")
    mobility_df = load_mobility_datasets()
    if mobility_df is None:
        print("Cannot proceed without mobility datasets")
        return (None,) * 4

    poi_df, poi_cols, function_cols = load_poi_presence_matrix()

    print("Merging with POI features...")
    merged_df = merge_with_poi_features(mobility_df, poi_df)
    if merged_df is None:
        # No merged frame came back: continue on the plain mobility data
        # with no POI / function columns.
        merged_df, poi_cols, function_cols = mobility_df, [], []

    print("Generating POI-enhanced corpus by period...")
    enhanced_corpus = create_poi_enhanced_corpus_by_period(
        merged_df, poi_cols, function_cols, style='poi_medium'
    )

    # Persist the per-period corpus
    save_user_files_by_period(enhanced_corpus, out_dir)
    export_training_data_by_period(enhanced_corpus, out_dir)
    create_user_summary_files(enhanced_corpus, out_dir)

    # Derived artifacts: vocabulary, patterns, templates, BERT data, synthetic samples
    poi_vocab = generate_poi_vocabulary(merged_df, poi_cols, function_cols)
    patterns = analyze_poi_patterns(enhanced_corpus)
    templates = create_trajectory_templates(enhanced_corpus)
    bert_poi_data = prepare_bert_training_with_poi(enhanced_corpus)
    synthetic_trajs = generate_synthetic_trajectories(templates, poi_vocab, num_trajectories=20)

    # (file name, payload, extra json.dump keyword arguments), written in this order
    json_exports = (
        ('poi_vocabulary.json', poi_vocab, {}),
        ('poi_patterns.json', patterns, {'default': str}),
        ('trajectory_templates.json', templates, {}),
        ('bert_poi_training.json', bert_poi_data, {}),
        ('synthetic_trajectories.json', synthetic_trajs, {}),
    )
    for file_name, payload, extra_kwargs in json_exports:
        with open(os.path.join(out_dir, file_name), 'w') as handle:
            json.dump(payload, handle, indent=2, **extra_kwargs)

    # Plain-text export for GPT training
    export_for_gpt_training(enhanced_corpus, os.path.join(out_dir, 'gpt_mobility_training.txt'))

    # POI embeddings: the array goes to .npy, everything else to JSON
    poi_embeddings = create_poi_feature_embeddings(poi_vocab)
    np.save(os.path.join(out_dir, 'poi_embeddings.npy'), poi_embeddings['embeddings'])
    embedding_meta = {
        key: value for key, value in poi_embeddings.items() if key != 'embeddings'
    }
    with open(os.path.join(out_dir, 'poi_embedding_vocab.json'), 'w') as handle:
        json.dump(embedding_meta, handle, indent=2)

    print("\n POI-enhanced period-separated pipeline completed!")
    print(f" POI files saved in: {out_dir}")

    return enhanced_corpus, poi_vocab, patterns, templates

In [38]:
def _export_regular_pipeline_files(corpus, base_path, vocabulary):
    """Write the regular pipeline's extra training files under ``base_path``.

    Returns the ``(sequences, bert_data)`` objects that were serialized so the
    caller can keep them available at notebook scope.
    """
    export_for_language_model_training(
        corpus, os.path.join(base_path, 'mobility_training_data.jsonl'))

    sequences = export_for_sequence_modeling(corpus)
    with open(os.path.join(base_path, 'mobility_sequences.json'), 'w') as f:
        json.dump(sequences, f, indent=2)

    bert_data = prepare_for_bert_training(corpus)
    with open(os.path.join(base_path, 'bert_training_data.json'), 'w') as f:
        json.dump(bert_data, f, indent=2)

    with open(os.path.join(base_path, 'mobility_vocabulary.json'), 'w') as f:
        json.dump(vocabulary, f, indent=2)

    return sequences, bert_data


def _run_sample_outputs():
    """Print the sample-output banner and create sample outputs for two users.

    Returns the loaded mobility DataFrame, or ``None`` when the datasets could
    not be loaded (in which case sample creation is skipped, not crashed).
    """
    print(f"\n{'='*60}")
    print("CREATING SAMPLE OUTPUTS")
    print(f"{'='*60}")

    df = load_mobility_datasets()
    if df is None:
        # load_mobility_datasets already printed which files are missing
        print("Skipping sample outputs: mobility datasets could not be loaded")
        return None

    create_sample_outputs(df, num_users=2)
    return df


if __name__ == "__main__":
    print(" Starting Mobility Text Generation with Period Separation...")
    print("\nChoose pipeline:")
    print("1. Regular mobility pipeline")
    print("2. POI-enhanced mobility pipeline") 
    print("3. Both pipelines")
    
    choice = input("Enter choice (1/2/3) or just press Enter for both: ").strip()
    
    # Track what was actually produced so the final summary is accurate
    regular_generated = False
    poi_generated = False
    
    if choice == "1":
        print("\n Running Regular Mobility Pipeline...")
        corpus, base_path = main_with_period_separation()
        
        # Run analysis
        print("\n Running Analysis...")
        patterns, vocabulary = run_analysis_with_period_separation(corpus)
        
        # Create sample outputs
        df = _run_sample_outputs()
        
        # Export additional files
        sequences, bert_data = _export_regular_pipeline_files(corpus, base_path, vocabulary)
        regular_generated = True
        
        print(f"\n Regular mobility pipeline completed!")
        
    elif choice == "2":
        print("\n Running POI-Enhanced Mobility Pipeline...")
        enhanced_corpus, poi_vocab, patterns, templates = main_poi_pipeline_with_periods()
        
        # The POI pipeline returns four None values when the datasets are missing
        if enhanced_corpus is None:
            print("\n POI-enhanced mobility pipeline could not run; skipping POI analysis.")
        else:
            # Run POI analysis
            print("\n Running POI Analysis...")
            poi_patterns_analysis, poi_vocabulary_analysis = run_analysis_with_period_separation(enhanced_corpus)
            poi_generated = True
            
            print(f"\n POI-enhanced mobility pipeline completed!")
        
    else:  # Default: run both
        print("\n Running Regular Mobility Pipeline...")
        corpus, base_path = main_with_period_separation()
        
        # Run analysis for regular pipeline
        print("\n Running Regular Analysis...")
        patterns, vocabulary = run_analysis_with_period_separation(corpus)
        
        # Export additional files for regular pipeline
        sequences, bert_data = _export_regular_pipeline_files(corpus, base_path, vocabulary)
        regular_generated = True
        
        print("\n Running POI-Enhanced Mobility Pipeline...")
        enhanced_corpus, poi_vocab, poi_patterns, templates = main_poi_pipeline_with_periods()
        
        # The POI pipeline returns four None values when the datasets are missing
        if enhanced_corpus is None:
            print("\n POI-enhanced mobility pipeline could not run; skipping POI analysis.")
        else:
            # Run POI analysis
            print("\n Running POI Analysis...")
            poi_patterns_analysis, poi_vocabulary_analysis = run_analysis_with_period_separation(enhanced_corpus)
            poi_generated = True
        
        # Create sample outputs
        df = _run_sample_outputs()
        
        if poi_generated:
            print(f"\n Both pipelines completed!")
        else:
            print(f"\n Regular mobility pipeline completed!")
    
    print("\n ALL DONE!")
    
    # Only list the output trees that this run actually produced
    if regular_generated or poi_generated:
        print("\n Generated Files:")
    if regular_generated:
        print("  📁 UserMobilityTexts/ (Regular mobility with period separation)")
        print("     ├── simple_style/, medium_style/, detailed_style/")
        print("     │   ├── normal_period/ (individual user files)")
        print("     │   └── emergency_period/ (individual user files)")
        print("     ├── user_summaries/ (complete user summaries)")
        print("     └── dataset_statistics.txt")
    if poi_generated:
        print("  📁 POI_UserMobilityTexts/ (POI-enhanced mobility with period separation)")
        print("     ├── normal_period/, emergency_period/ (POI-enhanced user files)")
        print("     ├── poi_vocabulary.json, poi_patterns.json")
        print("     ├── trajectory_templates.json, synthetic_trajectories.json")
        print("     └── bert_poi_training.json, gpt_mobility_training.txt")
    if regular_generated or poi_generated:
        print("\n Each user now has separate files for normal and emergency periods!")
        print(" Check the user_summaries folder for complete user analysis across both periods!")

 Starting Mobility Text Generation with Period Separation...

Choose pipeline:
1. Regular mobility pipeline
2. POI-enhanced mobility pipeline
3. Both pipelines

 Running Regular Mobility Pipeline...
Creating folder structure...
Loading mobility datasets...
Loaded 12247358 mobility records for 10000 users
Normal period: 61 days (0-60)
Emergency period: 14 days (61-75)

Generating mobility text corpus by period...

Processing simple narrative style...
  Normal period: 10000 users
  Emergency period: 10000 users

Processing medium narrative style...
