# GPS Location Analysis

This notebook analyzes GPS location data to infer significant locations in a subject's life. It was developed as part of a CS4501 Data Privacy project at UVA.

Use the detailed README to process your own data.

### Imports

In [12]:
import pandas as pd
import numpy as np
import os
import glob
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from shapely.geometry import MultiPoint
import folium
from folium.plugins import HeatMap
from collections import Counter

### Data Exploration and User Selection

In [13]:
# Function to parse GPS data from the dataset format
def parse_gps_file(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()
    
    data = []
    current_record = {}
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        if line.startswith('time'):
            if current_record and 'time' in current_record and 'latitude' in current_record and 'longitude' in current_record:
                data.append(current_record)
            current_record = {'time': None}
        elif line[0].isdigit():  # This is a timestamp
            if current_record and 'time' in current_record and 'latitude' in current_record and 'longitude' in current_record:
                data.append(current_record)
            current_record = {'time': int(line)}
        else:
            parts = line.split('\t')
            if len(parts) == 2:
                key, value = parts
                # Convert numeric values to appropriate types
                if key in ['accuracy', 'latitude', 'longitude', 'altitude', 'bearing', 'speed']:
                    try:
                        value = float(value)
                    except:
                        pass
                current_record[key] = value
    
    # Add the last record
    if current_record and 'time' in current_record and 'latitude' in current_record and 'longitude' in current_record:
        data.append(current_record)
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    return df

# Function to calculate user metrics for selection
def calculate_user_metrics(df):
    # Filter out invalid coordinates
    valid_coords = df[df['latitude'].between(-90, 90) & df['longitude'].between(-180, 180)]
    
    if len(valid_coords) == 0:
        return {
            'data_points': 0,
            'unique_locations': 0,
            'distance_traveled': 0,
            'time_range_days': 0,
            'location_variance': 0
        }
    
    # Count data points
    data_points = len(valid_coords)
    
    # Count approximate unique locations (rounded to 4 decimal places)
    valid_coords['lat_round'] = valid_coords['latitude'].round(4)
    valid_coords['lon_round'] = valid_coords['longitude'].round(4)
    unique_locations = valid_coords.groupby(['lat_round', 'lon_round']).size().reset_index().shape[0]
    
    # Calculate total distance traveled
    coords = valid_coords[['latitude', 'longitude']].values
    distance = 0
    if len(coords) > 1:
        for i in range(1, len(coords)):
            # Calculate distance between consecutive points in kilometers
            dist = great_circle((coords[i-1][0], coords[i-1][1]), (coords[i][0], coords[i][1])).kilometers
            # Only count reasonable distances (filter out GPS errors)
            if dist < 100:  # 100km threshold to filter out teleportations
                distance += dist
    
    # Calculate time range in days
    if 'time' in valid_coords.columns and len(valid_coords['time']) > 1:
        time_range = (valid_coords['time'].max() - valid_coords['time'].min()) / (60*60*24)
    else:
        time_range = 0
    
    # Calculate location variance
    lat_var = valid_coords['latitude'].var()
    lon_var = valid_coords['longitude'].var()
    location_variance = (lat_var + lon_var) / 2
    
    return {
        'data_points': data_points,
        'unique_locations': unique_locations,
        'distance_traveled': distance,
        'time_range_days': time_range,
        'location_variance': location_variance
    }

# Get list of all GPS files
gps_files = glob.glob('gps/*.csv')
if not gps_files:
    # If files are named with .csv extension but actually have different content format
    gps_files = glob.glob('gps/gps_u*.csv')

# Analyze each user's data
user_metrics = {}
for file_path in gps_files:
    user_id = os.path.basename(file_path).replace('.csv', '')
    print(f"Processing {user_id}...")
    try:
        df = parse_gps_file(file_path)
        metrics = calculate_user_metrics(df)
        user_metrics[user_id] = metrics
        print(f"  {user_id}: {metrics['data_points']} points, {metrics['unique_locations']} locations, {metrics['distance_traveled']:.2f}km traveled")
    except Exception as e:
        print(f"Error processing {user_id}: {e}")

# Convert to DataFrame for easier analysis
metrics_df = pd.DataFrame.from_dict(user_metrics, orient='index')

# Score each user based on data richness
metrics_df['score'] = (
    metrics_df['data_points'] / metrics_df['data_points'].max() +
    metrics_df['unique_locations'] / metrics_df['unique_locations'].max() +
    metrics_df['distance_traveled'] / metrics_df['distance_traveled'].max() +
    metrics_df['time_range_days'] / metrics_df['time_range_days'].max() +
    metrics_df['location_variance'] / metrics_df['location_variance'].max()
)

# Sort by score
metrics_df = metrics_df.sort_values('score', ascending=False)

# Select top 2 users
top_users = metrics_df.head(2).index.tolist()
print(f"\nSelected users for analysis: {top_users}")
print(metrics_df.head(5))

# Plot metrics for visualization
plt.figure(figsize=(12, 8))
metrics_df.sort_values('score', ascending=False).head(10)['score'].plot(kind='bar')
plt.title('Top 10 Users by Data Quality Score')
plt.ylabel('Score')
plt.xlabel('User ID')
plt.tight_layout()
plt.show()

Processing gps_u45...
Error processing gps_u45: invalid literal for int() with base 10: '1364410654,network,wifi,22.094,43.7066051,-72.2870424,0.0,0.0,0.0,,'
Processing gps_u51...
Error processing gps_u51: invalid literal for int() with base 10: '1364357164,network,wifi,21.765,43.7066013,-72.2870488,0.0,0.0,0.0,stationary,'
Processing gps_u50...
Error processing gps_u50: invalid literal for int() with base 10: '1364493509,network,wifi,24.194,43.7066333,-72.2870439,0.0,0.0,0.0,stationary,'
Processing gps_u44...
Error processing gps_u44: invalid literal for int() with base 10: '1364410478,network,wifi,20.0,43.7065952,-72.2870101,0.0,0.0,0.0,,'
Processing gps_u52...
Error processing gps_u52: invalid literal for int() with base 10: '1364414218,network,wifi,20.0,43.7065903,-72.2870373,0.0,0.0,0.0,moving,'
Processing gps_u46...
Error processing gps_u46: invalid literal for int() with base 10: '1364410746,network,wifi,22.963,43.7066085,-72.287028,0.0,0.0,0.0,,'
Processing gps_u47...
Error pro

KeyError: 'data_points'

### Location Clustering and Significant Places Detection

In [None]:
# Function to cluster locations and identify significant places
def identify_significant_places(gps_df, eps=0.05, min_samples=5, min_duration_minutes=20):
    """
    Cluster GPS coordinates to identify significant places
    
    Parameters:
    - gps_df: DataFrame containing GPS data
    - eps: DBSCAN epsilon parameter in kilometers (cluster radius)
    - min_samples: Minimum points in a cluster
    - min_duration_minutes: Minimum time spent at a location to be considered significant
    
    Returns:
    - DataFrame with significant places information
    """
    # Ensure the dataframe is sorted by time
    gps_df = gps_df.sort_values('time')
    
    # Filter out invalid coordinates
    valid_coords = gps_df[gps_df['latitude'].between(-90, 90) & 
                          gps_df['longitude'].between(-180, 180)]
    
    if len(valid_coords) == 0:
        return pd.DataFrame()
    
    # Calculate staying duration at each point
    valid_coords['next_time'] = valid_coords['time'].shift(-1)
    valid_coords['duration'] = valid_coords['next_time'] - valid_coords['time']
    
    # Filter points with reasonable durations (remove last row with NaN duration)
    valid_coords = valid_coords[valid_coords['duration'].notna()]
    
    # Convert eps from km to degrees (approximate)
    kms_per_radian = 6371.0  # Earth's radius in kilometers
    epsilon = eps / kms_per_radian
    
    # Extract coordinates for clustering
    coords = valid_coords[['latitude', 'longitude']].values
    
    # Perform DBSCAN clustering
    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    
    # Add cluster labels to the dataframe
    valid_coords['cluster'] = db.labels_
    
    # Filter for points that belong to a cluster (not noise)
    clustered = valid_coords[valid_coords['cluster'] != -1]
    
    # Group by cluster to analyze each potential significant place
    cluster_stats = []
    
    for cluster_id, cluster_data in clustered.groupby('cluster'):
        # Calculate total duration in minutes at this cluster
        total_duration_minutes = cluster_data['duration'].sum() / 60
        
        # Check if this is a significant place (based on duration)
        if total_duration_minutes >= min_duration_minutes:
            # Calculate centroid (average location)
            centroid_lat = cluster_data['latitude'].mean()
            centroid_lon = cluster_data['longitude'].mean()
            
            # Count visits (a visit is when a person arrives after being elsewhere)
            cluster_data = cluster_data.sort_values('time')
            cluster_data['prev_cluster'] = cluster_data['cluster'].shift(1)
            visits = len(cluster_data[(cluster_data['prev_cluster'] != cluster_data['cluster']) | 
                                     (cluster_data['prev_cluster'].isna())])
            
            # Get first and last time at this place
            first_time = datetime.fromtimestamp(cluster_data['time'].min())
            last_time = datetime.fromtimestamp(cluster_data['time'].max())
            
            # Check time patterns (morning, afternoon, evening, night)
            hours = [datetime.fromtimestamp(ts).hour for ts in cluster_data['time']]
            
            time_patterns = {
                'morning': sum(1 for h in hours if 5 <= h < 12),
                'afternoon': sum(1 for h in hours if 12 <= h < 17),
                'evening': sum(1 for h in hours if 17 <= h < 22),
                'night': sum(1 for h in hours if h >= 22 or h < 5)
            }
            
            dominant_time = max(time_patterns, key=time_patterns.get)
            
            # Check day patterns (weekday vs weekend)
            days = [datetime.fromtimestamp(ts).weekday() for ts in cluster_data['time']]
            weekday_count = sum(1 for d in days if d < 5)
            weekend_count = sum(1 for d in days if d >= 5)
            
            day_pattern = 'weekday' if weekday_count > weekend_count else 'weekend'
            
            # Append to results
            cluster_stats.append({
                'cluster_id': cluster_id,
                'latitude': centroid_lat,
                'longitude': centroid_lon,
                'visit_count': visits,
                'total_points': len(cluster_data),
                'total_duration_minutes': total_duration_minutes,
                'first_visit': first_time,
                'last_visit': last_time,
                'dominant_time': dominant_time,
                'day_pattern': day_pattern,
                'weekday_count': weekday_count,
                'weekend_count': weekend_count
            })
    
    # Convert to DataFrame
    sig_places = pd.DataFrame(cluster_stats)
    
    # Sort by most visited
    if not sig_places.empty:
        sig_places = sig_places.sort_values('visit_count', ascending=False)
    
    return sig_places

# Process selected users
sig_places_results = {}

for user_id in top_users:
    file_path = f'gps/{user_id}.csv'
    print(f"Processing significant places for {user_id}...")
    
    try:
        # Parse GPS data
        gps_df = parse_gps_file(file_path)
        
        # Identify significant places
        sig_places = identify_significant_places(gps_df)
        
        if sig_places.empty:
            print(f"  No significant places found for {user_id}")
        else:
            print(f"  Found {len(sig_places)} significant places for {user_id}")
            sig_places_results[user_id] = sig_places
            
            # Display top places
            print(sig_places[['visit_count', 'total_duration_minutes', 'dominant_time', 'day_pattern']].head())
            
    except Exception as e:
        print(f"Error processing {user_id}: {e}")

# Basic location type inference based on time patterns
def infer_location_type(place):
    """Infer the type of location based on visiting patterns"""
    # Home is likely where someone spends the most time, especially at night
    if place['dominant_time'] == 'night' and place['total_duration_minutes'] > 480:  # >8 hours
        return 'Home'
    
    # Work/school is likely where someone spends time on weekdays during the day
    elif place['dominant_time'] in ['morning', 'afternoon'] and place['weekday_count'] > place['weekend_count'] * 2:
        return 'Work/School'
    
    # Restaurant or shopping is likely shorter duration, typically in evening or afternoon
    elif place['total_duration_minutes'] < 120 and place['dominant_time'] in ['afternoon', 'evening']:
        return 'Restaurant/Shopping'
    
    # Social venue - evening and night visits, moderate duration
    elif place['dominant_time'] in ['evening', 'night'] and 60 < place['total_duration_minutes'] < 240:
        return 'Social Venue'
    
    # Default for places we can't categorize
    else:
        return 'Other'

# Add location type inference
for user_id, places in sig_places_results.items():
    places['inferred_type'] = places.apply(infer_location_type, axis=1)
    print(f"\nInferred location types for {user_id}:")
    print(places[['inferred_type', 'visit_count', 'total_duration_minutes']].head())

### Visualization of Location Data

In [None]:
# Function to create a map for a specific user
def create_user_location_map(user_id, gps_df, sig_places):
    """
    Create an interactive map visualization for a user's locations
    
    Parameters:
    - user_id: ID of the user
    - gps_df: DataFrame containing raw GPS data
    - sig_places: DataFrame containing significant places
    
    Returns:
    - Folium map object
    """
    # Get the centroid of user's locations for map center
    center_lat = gps_df['latitude'].median()
    center_lon = gps_df['longitude'].median()
    
    # Create a map
    m = folium.Map(location=[center_lat, center_lon], zoom_start=13)
    
    # Add a heatmap of all the points
    heat_data = [[row['latitude'], row['longitude']] for _, row in gps_df.iterrows() 
                if -90 <= row['latitude'] <= 90 and -180 <= row['longitude'] <= 180]
    
    HeatMap(heat_data, radius=10).add_to(m)
    
    # Add markers for significant places
    for _, place in sig_places.iterrows():
        # Color based on inferred type
        color_map = {
            'Home': 'red',
            'Work/School': 'blue',
            'Restaurant/Shopping': 'green',
            'Social Venue': 'purple',
            'Other': 'gray'
        }
        
        color = color_map.get(place['inferred_type'], 'gray')
        
        # Create popup content
        popup_content = f"""
        <b>Type:</b> {place['inferred_type']}<br>
        <b>Visits:</b> {place['visit_count']}<br>
        <b>Total Duration:</b> {place['total_duration_minutes']:.1f} minutes<br>
        <b>Pattern:</b> {place['dominant_time']}, {place['day_pattern']}
        """
        
        # Create marker
        folium.CircleMarker(
            location=[place['latitude'], place['longitude']],
            radius=place['visit_count'] / 2 + 5,  # Size based on visit count
            popup=folium.Popup(popup_content, max_width=300),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7,
            tooltip=f"{place['inferred_type']}"
        ).add_to(m)
    
    # Draw lines between consecutive significant visits
    # This requires tracking movement between significant places in chronological order
    # For simplicity, we'll connect the top N places based on the chronological order of first visit
    
    top_places = sig_places.sort_values('first_visit').head(min(10, len(sig_places)))
    
    if len(top_places) > 1:
        coords = top_places[['latitude', 'longitude']].values
        for i in range(1, len(coords)):
            folium.PolyLine(
                locations=[coords[i-1], coords[i]],
                color='black',
                weight=2,
                opacity=0.5,
                dash_array='5, 5'
            ).add_to(m)
    
    # Add a legend
    legend_html = '''
    <div style="position: fixed; 
        bottom: 50px; left: 50px; width: 180px; height: 160px; 
        border:2px solid grey; z-index:9999; font-size:14px;
        background-color:white; padding:10px;
        border-radius:5px;">
        <b>Location Types</b><br>
        <i style="background: red; width: 15px; height: 15px; display: inline-block;"></i> Home<br>
        <i style="background: blue; width: 15px; height: 15px; display: inline-block;"></i> Work/School<br>
        <i style="background: green; width: 15px; height: 15px; display: inline-block;"></i> Restaurant/Shopping<br>
        <i style="background: purple; width: 15px; height: 15px; display: inline-block;"></i> Social Venue<br>
        <i style="background: gray; width: 15px; height: 15px; display: inline-block;"></i> Other<br>
        <span style="font-size:10px;">Larger circles = more visits</span>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))
    
    return m

# Create maps for each selected user
for user_id in sig_places_results:
    file_path = f'gps/{user_id}.csv'
    gps_df = parse_gps_file(file_path)
    sig_places = sig_places_results[user_id]
    
    map_title = f"Map for User {user_id}"
    print(f"Creating {map_title}...")
    
    # Create the map
    user_map = create_user_location_map(user_id, gps_df, sig_places)
    
    # Display the map
    display(user_map)
    
    # Save the map
    map_file = f"{user_id}_location_map.html"
    user_map.save(map_file)
    print(f"Map saved to {map_file}")

### Detailed Analysis and Pattern Recognition

In [None]:
# Function for detailed analysis of user movement patterns
def analyze_movement_patterns(user_id, gps_df, sig_places):
    """
    Analyze detailed movement patterns for a user
    
    Parameters:
    - user_id: ID of the user
    - gps_df: DataFrame containing raw GPS data
    - sig_places: DataFrame containing significant places
    
    Returns:
    - Dictionary with analysis results
    """
    results = {}
    
    # Ensure data is sorted by time
    gps_df = gps_df.sort_values('time')
    
    # Convert timestamps to datetime for easier analysis
    gps_df['datetime'] = pd.to_datetime(gps_df['time'], unit='s')
    
    # Extract day of week and hour
    gps_df['day_of_week'] = gps_df['datetime'].dt.dayofweek
    gps_df['hour'] = gps_df['datetime'].dt.hour
    
    # Overall date range
    date_range = (gps_df['datetime'].max() - gps_df['datetime'].min()).days
    results['date_range_days'] = date_range
    
    # Calculate distance traveled each day
    gps_df['date'] = gps_df['datetime'].dt.date
    
    daily_distances = []
    for date, group in gps_df.groupby('date'):
        coords = group[['latitude', 'longitude']].values
        distance = 0
        if len(coords) > 1:
            for i in range(1, len(coords)):
                dist = great_circle((coords[i-1][0], coords[i-1][1]), (coords[i][0], coords[i][1])).kilometers
                if dist < 100:  # Filter out unreasonable jumps
                    distance += dist
        daily_distances.append({'date': date, 'distance_km': distance})
    
    daily_distances_df = pd.DataFrame(daily_distances)
    results['daily_distances'] = daily_distances_df
    
    # Calculate average daily travel distance
    if not daily_distances_df.empty:
        results['avg_daily_distance_km'] = daily_distances_df['distance_km'].mean()
        results['max_daily_distance_km'] = daily_distances_df['distance_km'].max()
    
    # Analyze time spent at each significant place
    if not sig_places.empty:
        results['total_places'] = len(sig_places)
        
        # Calculate percentages of time at each type of place
        type_durations = sig_places.groupby('inferred_type')['total_duration_minutes'].sum()
        total_duration = type_durations.sum()
        
        if total_duration > 0:
            type_percentages = (type_durations / total_duration * 100).to_dict()
            results['place_type_percentages'] = type_percentages
    
    # Analyze hourly activity patterns
    hourly_counts = gps_df['hour'].value_counts().sort_index()
    results['hourly_activity'] = hourly_counts.to_dict()
    
    # Analyze day of week patterns
    day_counts = gps_df['day_of_week'].value_counts().sort_index()
    day_names = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 
                 4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    day_counts.index = day_counts.index.map(day_names)
    results['day_of_week_activity'] = day_counts.to_dict()
    
    return results

# Analyze movement patterns for each user
movement_analysis = {}

for user_id in sig_places_results:
    file_path = f'gps/{user_id}.csv'
    gps_df = parse_gps_file(file_path)
    sig_places = sig_places_results[user_id]
    
    print(f"\nAnalyzing movement patterns for {user_id}...")
    analysis = analyze_movement_patterns(user_id, gps_df, sig_places)
    movement_analysis[user_id] = analysis
    
    # Print key findings
    print(f"Date range: {analysis['date_range_days']} days")
    if 'avg_daily_distance_km' in analysis:
        print(f"Average daily travel: {analysis['avg_daily_distance_km']:.2f} km")
    
    if 'place_type_percentages' in analysis:
        print("\nTime distribution by place type:")
        for place_type, percentage in analysis['place_type_percentages'].items():
            print(f"  {place_type}: {percentage:.1f}%")
    
    # Plot daily distances
    if 'daily_distances' in analysis and not analysis['daily_distances'].empty:
        plt.figure(figsize=(12, 5))
        plt.bar(analysis['daily_distances']['date'], analysis['daily_distances']['distance_km'])
        plt.title(f'{user_id} - Daily Travel Distances')
        plt.ylabel('Distance (km)')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    
    # Plot hourly activity
    if 'hourly_activity' in analysis:
        hours = list(analysis['hourly_activity'].keys())
        counts = list(analysis['hourly_activity'].values())
        
        plt.figure(figsize=(12, 5))
        plt.bar(hours, counts)
        plt.title(f'{user_id} - Activity by Hour of Day')
        plt.xlabel('Hour')
        plt.ylabel('Number of GPS Records')
        plt.xticks(range(0, 24))
        plt.tight_layout()
        plt.show()
    
    # Plot day of week activity
    if 'day_of_week_activity' in analysis:
        days = list(analysis['day_of_week_activity'].keys())
        counts = list(analysis['day_of_week_activity'].values())
        
        plt.figure(figsize=(10, 5))
        plt.bar(days, counts)
        plt.title(f'{user_id} - Activity by Day of Week')
        plt.ylabel('Number of GPS Records')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

KeyboardInterrupt: 

### Automatic Location Type Classification

In [None]:
# Function to extract more detailed features for location type classification
def extract_location_features(place, gps_df):
    """
    Extract detailed features for a location to use in classification
    
    Parameters:
    - place: Dictionary or Series with basic place information
    - gps_df: DataFrame containing raw GPS data
    
    Returns:
    - Dictionary with extracted features
    """
    # Basic features already in the place object
    features = {
        'total_duration_minutes': place['total_duration_minutes'],
        'visit_count': place['visit_count'],
        'weekday_ratio': place['weekday_count'] / max(1, (place['weekday_count'] + place['weekend_count'])),
        'weekend_ratio': place['weekend_count'] / max(1, (place['weekday_count'] + place['weekend_count']))
    }
    
    # Find all GPS points in this cluster
    if 'cluster_id' in place:
        cluster_points = gps_df[gps_df['cluster'] == place['cluster_id']]
        
        # Convert timestamps to datetime
        if 'datetime' not in cluster_points.columns:
            cluster_points['datetime'] = pd.to_datetime(cluster_points['time'], unit='s')
            
        # Extract hour of day
        cluster_points['hour'] = cluster_points['datetime'].dt.hour
        
        # Calculate hour distribution
        hour_dist = cluster_points['hour'].value_counts(normalize=True).to_dict()
        
        # Calculate time-of-day ratios
        morning_hours = [6, 7, 8, 9, 10, 11]
        afternoon_hours = [12, 13, 14, 15, 16, 17]
        evening_hours = [18, 19, 20, 21]
        night_hours = [22, 23, 0, 1, 2, 3, 4, 5]
        
        features['morning_ratio'] = sum(hour_dist.get(h, 0) for h in morning_hours)
        features['afternoon_ratio'] = sum(hour_dist.get(h, 0) for h in afternoon_hours)
        features['evening_ratio'] = sum(hour_dist.get(h, 0) for h in evening_hours)
        features['night_ratio'] = sum(hour_dist.get(h, 0) for h in night_hours)
        
        # Calculate average duration per visit
        features['avg_duration_per_visit'] = place['total_duration_minutes'] / place['visit_count']
        
        # Calculate visit regularity (standard deviation of times between visits)
        visit_times = cluster_points.sort_values('time')['time'].values
        if len(visit_times) > 1:
            time_diffs = np.diff(visit_times)
            features['visit_regularity'] = np.std(time_diffs) / (60 * 60)  # in hours
        else:
            features['visit_regularity'] = 0
    
    return features

# Function to classify location types based on extracted features
def classify_location_type(features):
    """
    Classify a location based on its features
    
    Parameters:
    - features: Dictionary of location features
    
    Returns:
    - Predicted location type and confidence
    """
    # Define classification rules based on common patterns
    
    # Home characteristics: Long duration, high night ratio, regular visits
    home_score = (
        min(1, features['total_duration_minutes'] / 500) * 0.3 +
        features['night_ratio'] * 0.4 +
        (1 - min(1, features['visit_regularity'] / 24)) * 0.3  # Lower irregularity is better for home
    )
    
    # Work/School: Weekday dominant, morning/afternoon, regular visits
    work_score = (
        features['weekday_ratio'] * 0.4 +
        (features['morning_ratio'] + features['afternoon_ratio']) * 0.4 +
        (1 - min(1, features['visit_regularity'] / 24)) * 0.2  # Lower irregularity is better for work
    )
    
    # Restaurant/Shopping: Shorter stays, afternoon/evening, any day
    restaurant_score = (
        (1 - min(1, features['avg_duration_per_visit'] / 120)) * 0.5 +  # Shorter stays (inverse relation)
        (features['afternoon_ratio'] + features['evening_ratio']) * 0.5
    )
    
    # Social Venue: Evening/Night, longer than restaurants, less regular
    social_score = (
        (features['evening_ratio'] + features['night_ratio']) * 0.6 +
        min(1, features['avg_duration_per_visit'] / 180) * 0.2 +  # Medium duration
        min(1, features['visit_regularity'] / 48) * 0.2  # More irregular is typical
    )
    
    # Transit/Commute: Very short stays, any time
    transit_score = (
        (1 - min(1, features['avg_duration_per_visit'] / 30)) * 0.7 +  # Very short stays
        (1 - min(1, features['visit_count'] / 20)) * 0.3  # Less frequent visits
    )
    
    # Calculate the highest score
    scores = {
        'Home': home_score,
        'Work/School': work_score,
        'Restaurant/Shopping': restaurant_score,
        'Social Venue': social_score,
        'Transit/Commute': transit_score
    }
    
    predicted_type = max(scores, key=scores.get)
    confidence = scores[predicted_type]
    
    # If no strong prediction, mark as Other
    if confidence < 0.5:
        predicted_type = 'Other'
        confidence = 1 - confidence
    
    return predicted_type, confidence

# Apply advanced classification to significant places
for user_id in sig_places_results:
    file_path = f'gps/{user_id}.csv'
    gps_df = parse_gps_file(file_path)
    sig_places = sig_places_results[user_id]
    
    # Add cluster IDs to GPS data for feature extraction
    gps_with_clusters = gps_df.copy()
    
    # Since we don't have the original clustering data, we'll need to recreate it
    # This is a simplified approach - in practice, we'd reuse the same clustering from earlier
    coords = gps_df[['latitude', 'longitude']].values
    
    # Apply DBSCAN clustering
    kms_per_radian = 6371.0
    epsilon = 0.05 / kms_per_radian  # 50 meters
    db = DBSCAN(eps=epsilon, min_samples=5, algorithm='ball_tree', metric='haversine').fit(np.radians(coords))
    
    # Add cluster labels
    gps_with_clusters['cluster'] = db.labels_
    
    print(f"\nApplying advanced location classification for {user_id}...")
    
    # Extract features and classify each place
    classification_results = []
    
    for idx, place in sig_places.iterrows():
        # Match the cluster ID from our simplified clustering to the significant place
        # based on proximity to the place's centroid
        place_coords = (place['latitude'], place['longitude'])
        
        # Find the closest cluster
        closest_cluster = -1
        min_distance = float('inf')
        
        for cluster_id, cluster_data in gps_with_clusters[gps_with_clusters['cluster'] != -1].groupby('cluster'):
            cluster_centroid = (cluster_data['latitude'].mean(), cluster_data['longitude'].mean())
            distance = great_circle(place_coords, cluster_centroid).kilometers
            
            if distance < min_distance:
                min_distance = distance
                closest_cluster = cluster_id
        
        if closest_cluster != -1 and min_distance < 0.1:  # 100 meters threshold
            place_copy = place.copy()
            place_copy['cluster_id'] = closest_cluster
            
            # Extract features
            features = extract_location_features(place_copy, gps_with_clusters)
            
            # Classify location
            predicted_type, confidence = classify_location_type(features)
            
            classification_results.append({
                'latitude': place['latitude'],
                'longitude': place['longitude'],
                'visit_count': place['visit_count'],
                'total_duration_minutes': place['total_duration_minutes'],
                'manual_type': place['inferred_type'],
                'predicted_type': predicted_type,
                'confidence': confidence,
                'features': features
            })
    
    # Convert to DataFrame
    if classification_results:
        results_df = pd.DataFrame(classification_results)
        
        # Calculate accuracy against the initial inference
        accuracy = sum(results_df['manual_type'] == results_df['predicted_type']) / len(results_df)
        
        print(f"Classification accuracy: {accuracy:.2f}")
        print("\nTop locations with classifications:")
        print(results_df[['manual_type', 'predicted_type', 'confidence', 'visit_count', 'total_duration_minutes']].head())
        
        # Confusion matrix visualization
        confusion = pd.crosstab(
            results_df['manual_type'], 
            results_df['predicted_type'], 
            rownames=['Manual'], 
            colnames=['Predicted']
        )
        
        plt.figure(figsize=(10, 8))
        sns.heatmap(confusion, annot=True, cmap='Blues', fmt='d')
        plt.title(f'{user_id} - Location Type Classification Comparison')
        plt.tight_layout()
        plt.show()
        
        # Feature importance visualization for each type
        feature_cols = [col for col in results_df['features'].iloc[0].keys()]
        
        # Extract features into separate columns
        for col in feature_cols:
            results_df[col] = results_df['features'].apply(lambda x: x.get(col, 0))
        
        # Plot feature distributions by predicted type
        for feature in ['morning_ratio', 'afternoon_ratio', 'evening_ratio', 'night_ratio', 
                       'weekday_ratio', 'weekend_ratio', 'avg_duration_per_visit']:
            plt.figure(figsize=(12, 6))
            sns.boxplot(x='predicted_type', y=feature, data=results_df)
            plt.title(f'{user_id} - {feature} Distribution by Location Type')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.show()

### Final Summary and Visualization

In [None]:
# Create a final summary dashboard for each user
def create_summary_dashboard(user_id, gps_df, sig_places, movement_analysis):
    """Create a comprehensive summary of a user's location patterns"""
    
    print(f"\n{'='*50}")
    print(f"LOCATION PATTERN SUMMARY FOR {user_id}")
    print(f"{'='*50}")
    
    # Basic statistics
    total_data_points = len(gps_df)
    date_range = movement_analysis.get('date_range_days', 0)
    avg_daily_distance = movement_analysis.get('avg_daily_distance_km', 0)
    
    print(f"Data points: {total_data_points}")
    print(f"Date range: {date_range} days")
    print(f"Average daily travel distance: {avg_daily_distance:.2f} km")
    
    # Significant places summary
    if not sig_places.empty:
        print(f"\nIdentified {len(sig_places)} significant places")
        
        # Top places by visit count
        print("\nMost visited places:")
        for idx, place in sig_places.sort_values('visit_count', ascending=False).head(5).iterrows():
            print(f"  {place['inferred_type']}: {place['visit_count']} visits, {place['total_duration_minutes']:.1f} minutes total")
        
        # Places by total duration
        print("\nPlaces with longest duration:")
        for idx, place in sig_places.sort_values('total_duration_minutes', ascending=False).head(5).iterrows():
            print(f"  {place['inferred_type']}: {place['total_duration_minutes']:.1f} minutes total, {place['visit_count']} visits")
    
    # Time distribution
    if 'place_type_percentages' in movement_analysis:
        print("\nTime distribution by place type:")
        for place_type, percentage in movement_analysis['place_type_percentages'].items():
            print(f"  {place_type}: {percentage:.1f}%")
    
    # Visualize the time distribution
    if 'place_type_percentages' in movement_analysis:
        plt.figure(figsize=(10, 6))
        types = list(movement_analysis['place_type_percentages'].keys())
        percentages = list(movement_analysis['place_type_percentages'].values())
        
        plt.pie(percentages, labels=types, autopct='%1.1f%%', startangle=90, shadow=True)
        plt.title(f'{user_id} - Time Distribution by Place Type')
        plt.axis('equal')
        plt.tight_layout()
        plt.show()
    
    # Weekly patterns visualization
    if 'hourly_activity' in movement_analysis and 'day_of_week_activity' in movement_analysis:
        # Create a 2D heatmap of hour x day
        days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
        hours = list(range(24))
        
        # Convert GPS data to datetime if needed
        if 'datetime' not in gps_df.columns:
            gps_df['datetime'] = pd.to_datetime(gps_df['time'], unit='s')
        
        # Extract day and hour
        gps_df['day'] = gps_df['datetime'].dt.dayofweek
        gps_df['hour'] = gps_df['datetime'].dt.hour
        
        # Create a 2D matrix for the heatmap
        activity_matrix = np.zeros((7, 24))
        
        for _, row in gps_df.iterrows():
            if 0 <= row['day'] < 7 and 0 <= row['hour'] < 24:
                activity_matrix[row['day'], row['hour']] += 1
        
        # Normalize by the maximum value
        if activity_matrix.max() > 0:
            activity_matrix = activity_matrix / activity_matrix.max()
        
        # Plot the heatmap
        plt.figure(figsize=(14, 7))
        sns.heatmap(activity_matrix, xticklabels=hours, yticklabels=days, cmap='viridis')
        plt.title(f'{user_id} - Weekly Activity Pattern')
        plt.xlabel('Hour of Day')
        plt.ylabel('Day of Week')
        plt.tight_layout()
        plt.show()
    
    # Movement trajectory visualization
    plt.figure(figsize=(12, 10))
    
    # Filter valid coordinates
    valid_coords = gps_df[(gps_df['latitude'].between(-90, 90)) & (gps_df['longitude'].between(-180, 180))]
    
    # Plot the trajectory
    plt.scatter(valid_coords['longitude'], valid_coords['latitude'], 
                c=valid_coords['time'], cmap='viridis', alpha=0.5, s=10)
    
    # Add markers for significant places
    if not sig_places.empty:
        for _, place in sig_places.iterrows():
            marker_style = {
                'Home': 'ro',
                'Work/School': 'bs',
                'Restaurant/Shopping': 'g^',
                'Social Venue': 'mD',
                'Transit/Commute': 'yP',
                'Other': 'kX'
            }
            
            style = marker_style.get(place['inferred_type'], 'kX')
            plt.plot(place['longitude'], place['latitude'], style, 
                     markersize=10+place['visit_count']/5, label=place['inferred_type'])
    
    # Remove duplicate labels in the legend
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys(), loc='best')
    
    plt.title(f'{user_id} - Location Trajectory and Significant Places')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Generate summary for each user
for user_id in sig_places_results:
    file_path = f'gps/{user_id}.csv'
    gps_df = parse_gps_file(file_path)
    sig_places = sig_places_results[user_id]
    analysis = movement_analysis[user_id]
    
    create_summary_dashboard(user_id, gps_df, sig_places, analysis)