In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

# Process the data

In [None]:
# move_dt = pd.read_csv("data/foood_mob_cover.csv")
# move_dt = move_dt.assign(
#     who = (move_dt['quitaz_1'] * 8e3 + move_dt['quitaz_2']).astype(int),
#     seq = move_dt['seiqd'],
#     lon_o = (move_dt['mean_dur'] - 200)/1e3 + 114,
#     lon_d = (move_dt['std_dur'] - 200)/1e3 + 114,
#     lat_o = (move_dt['mean_volm'])/10 + 22.65,
#     lat_d = (move_dt['std_volm'])/10 + 22.65,
#     date = move_dt['date'],
#     poi_o = move_dt['district_o'],
#     poi_d = move_dt['district_d'],
#     inplace=True
# ) 

# select_columns = ['who', 'seq', 'lon_o', 'lat_o', 'lon_d', 'lat_d', 'date', 'poi_o', 'poi_d']
# move_dt = move_dt[select_columns].copy()
# # sort the values of the move_dt dataframe by who, date and seq
# move_dt = move_dt.sort_values(by=['who', 'date', 'seq']).reset_index(drop=True)
# move_dt.to_csv("data/processed_moves.csv", index=False)

In [None]:
# move_dt = pd.read_csv("data/processed_moves.csv")
# persons = move_dt.who.drop_duplicates().reset_index(drop=True)
# persons.to_csv("data/processed_users.csv", index=False)

In [2]:
# process stay data
# read all the data as series, because they only contain one column
t_start = pd.read_csv('data/st/reptoire.csv').iloc[:, 0]
t_end = pd.read_csv('data/st/nif.csv').iloc[:, 0]
ptype = pd.read_csv('data/st/model.csv').iloc[:, 0]
poi = pd.read_csv('data/st/iop.csv').iloc[:, 0]
who = pd.read_csv('data/st/est.csv').iloc[:, 0]
date = pd.read_csv('data/st/aoz.csv').iloc[:, 0]
lon_p1 = pd.read_csv('data/st/mean_log_p1.csv').iloc[:, 0]
lon_p2 = pd.read_csv('data/st/mean_log_p2.csv').iloc[:, 0]
lat_p1 = pd.read_csv('data/st/std_log_p1.csv').iloc[:, 0]
lat_p2 = pd.read_csv('data/st/std_log_p2.csv').iloc[:, 0]

# concatenate the longitude and latitude parts
lon = pd.concat([lon_p1, lon_p2], ignore_index=True)
lat = pd.concat([lat_p1, lat_p2], ignore_index=True)

st_data = pd.DataFrame({
    't_start': t_start,
    't_end': t_end,
    'ptype': ptype,
    'poi': poi,
    'who': who,
    'date': date,
    'lon': lon,
    'lat': lat
})

In [3]:
st_sample = st_data.assign(
    # turn to the time stamp by adding the base time 1677600000
    who = st_data['who'].astype(int),
    date = st_data['date'].astype(int) + 20202020,
    t_start = pd.to_datetime(st_data['t_start'] + 1677600000, unit='s'),
    t_end = pd.to_datetime(st_data['t_end'] + 1677600000, unit='s'),
    lon = st_data['lon'] / 4,
    lat = st_data['lat'] * 4,
    ptype = st_data['ptype'].astype(int),
    poi = st_data['poi'].astype(int)
)

st_sample = st_sample[['who', 'date', 't_start', 't_end', 'lon', 'lat', 'ptype', 'poi']].\
    sort_values(by=['who', 'date', 't_start'])


In [4]:
# ====== Stay Record Processing Algorithm ======
def merge_consecutive_stays(df, gap_minutes=30):
    """
    Merge consecutive stay records with the same location and gap <= gap_minutes
    OR same location AND adjacent calendar dates
    
    Parameters:
    - df: DataFrame sorted by who and t_start
    - gap_minutes: Maximum gap threshold (minutes)
    
    Returns:
    - Merged DataFrame
    """
    if len(df) == 0:
        return df.copy()
    
    # Calculate time gap to next record (in minutes)
    time_diff = (df['t_start'].shift(-1) - df['t_end']).dt.total_seconds() / 60
    
    # Check if location changed (use np.isclose for floating-point comparison)
    lon_same = np.isclose(df['lon'].shift(-1), df['lon'], rtol=1e-9)
    lat_same = np.isclose(df['lat'].shift(-1), df['lat'], rtol=1e-9)
    pos_same = lon_same & lat_same
    
    # Check if dates are adjacent
    date_adjacent = (df['date'].shift(-1) - df['date']) == 1
    
    # Determine if merge is needed:
    # Option 1: same location AND time gap <= threshold
    # Option 2: same location AND adjacent calendar dates
    need_merge = pos_same & ((time_diff <= gap_minutes) | date_adjacent)
    need_merge.iloc[-1] = False  # Last record doesn't need merge
    
    # Create group identifier: new group whenever merge is not needed
    group_id = (~need_merge).cumsum()
    
    # Aggregate by group
    # Add group_id as a column first to avoid FutureWarning
    df = df.copy()
    df['group_id'] = group_id
    merged = df.groupby(['who', 'group_id'], as_index=False).agg({
        't_start': 'min',
        't_end': 'max',
        'lon': 'first',
        'lat': 'first',
        'ptype': 'first',
        'poi': 'first',
        'date': 'first'
    })
    merged = merged.drop(columns=['group_id'])
    
    return merged


def filter_short_stays(df, min_minutes=30):
    """
    Filter out stay records shorter than min_minutes
    
    Parameters:
    - df: DataFrame
    - min_minutes: Minimum stay duration (minutes)
    
    Returns:
    - Filtered DataFrame
    """
    stay_duration = (df['t_end'] - df['t_start']).dt.total_seconds() / 60
    return df[stay_duration >= min_minutes].reset_index(drop=True)


# ====== Processing Pipeline ======
print(f"Original data: {len(st_sample)} records")

# Step 1: Group by who (data is already sorted by who, date, t_start)
st_processed = st_sample.copy()

# Step 2: Merge consecutive stays for each individual
def process_single_person(group):
    """Process data for a single individual"""
    return merge_consecutive_stays(group, gap_minutes=30)

st_processed = st_processed.groupby('who', group_keys=False).apply(process_single_person)
print(f"After Step 2 (merge consecutive stays): {len(st_processed)} records")

# Step 3: Filter out stays shorter than 30 minutes
st_processed = filter_short_stays(st_processed, min_minutes=30)
print(f"After Step 3 (filter short stays): {len(st_processed)} records")

# Step 4: Merge again
st_processed = st_processed.groupby('who', group_keys=False).apply(process_single_person)
print(f"After Step 4 (merge again): {len(st_processed)} records")

# Re-sort
st_processed = st_processed.sort_values(by=['who', 't_start']).reset_index(drop=True)

st_processed

Original data: 720427 records


  st_processed = st_processed.groupby('who', group_keys=False).apply(process_single_person)


After Step 2 (merge consecutive stays): 652777 records
After Step 3 (filter short stays): 575138 records


  st_processed = st_processed.groupby('who', group_keys=False).apply(process_single_person)


After Step 4 (merge again): 567555 records


Unnamed: 0,who,t_start,t_end,lon,lat,ptype,poi,date
0,126272,2019-01-01 00:28:58,2019-01-01 10:11:49,113.833530,22.689639,1,0,20190101
1,126272,2019-01-01 10:19:15,2019-01-01 17:13:30,113.948257,22.529631,0,3,20190101
2,126272,2019-01-01 17:26:58,2019-01-01 18:47:28,113.889444,22.773309,0,14,20190101
3,126272,2019-01-01 18:59:06,2019-01-01 21:26:17,113.867473,22.573858,0,11,20190101
4,126272,2019-01-02 10:05:07,2019-01-02 19:41:13,114.095561,22.553550,0,2,20190102
...,...,...,...,...,...,...,...,...
567550,78623283,2019-12-30 11:00:45,2019-12-30 18:46:13,113.912155,22.534095,2,0,20191230
567551,78623283,2019-12-30 19:58:40,2019-12-30 23:15:45,113.908330,22.518993,1,1,20191230
567552,78623283,2019-12-31 09:02:22,2019-12-31 18:30:06,114.026052,22.625212,0,9,20191231
567553,78623283,2019-12-31 18:30:40,2019-12-31 20:05:57,113.945381,22.556100,2,0,20191231


# Format the trajectories

In [5]:
# ====== Step 1 & 2: Random Jitter & HDBSCAN Clustering ======
import hdbscan
from typing import Optional
import plotly.express as px
from pyproj import Transformer
from shapely.geometry import Point

# Random jitter function: uniform random offset within specified radius (area-uniform)
def jitter_within_radius(xy: np.ndarray, max_radius_m: float, rng: np.random.Generator) -> np.ndarray:
    """
    Apply uniform random jitter within a circle of specified radius.
    Uses sqrt(r) for radius to ensure uniform area distribution.
    """
    # Check: the dimension of the input array should be 2
    if xy.shape[1] != 2:
        raise ValueError("Input array should have exactly 2 columns (x, y)")
    n = xy.shape[0]
    angles = rng.uniform(0.0, 2.0 * np.pi, size=n)
    radii = max_radius_m * np.sqrt(rng.uniform(0.0, 1.0, size=n))
    offsets = np.column_stack([radii * np.cos(angles), radii * np.sin(angles)])
    return xy + offsets


def add_jitter_and_cluster(
    df: pd.DataFrame, 
    jitter_radius_m: float = 300, 
    min_cluster_size: int = 6, 
    min_samples: int = 6,
    seed: int = 114514
) -> pd.DataFrame:
    """
    Apply 300m random jitter to coordinates, then perform HDBSCAN clustering.
    
    This function:
    1. Converts lon/lat to UTM coordinates (EPSG:32650)
    2. Applies random jitter within specified radius
    3. Converts back to WGS84 lon/lat (updating original coordinates)
    4. Performs HDBSCAN clustering on the jittered coordinates
    
    Cluster labels:
    - 0: Missing coordinates (outside study area)
    - -1: HDBSCAN noise points
    - 1+: HDBSCAN clusters (original HDBSCAN labels shifted by +1)
    
    Parameters:
    - df: DataFrame with 'who', 'lon', 'lat' columns
    - jitter_radius_m: Radius for random jitter in meters
    - min_cluster_size: HDBSCAN parameter
    - min_samples: HDBSCAN parameter
    - seed: Random seed for reproducibility
    
    Returns:
    - DataFrame with additional 'cluster_id' column (0 = missing, -1 = noise, 1+ = clusters)
    """
    RNG = np.random.default_rng(seed)
    df = df.copy()
    
    def cluster_one_person(person_df: pd.DataFrame) -> pd.DataFrame:
        """Cluster stays for a single person using HDBSCAN.
        
        Returns:
        - cluster_id = 0: missing coordinates (outside study area)
        - cluster_id = -1: HDBSCAN noise
        - cluster_id = 1+: HDBSCAN clusters (original label + 1)
        """
        person_df = person_df.copy()
        
        # Check for missing coordinates
        has_coords = ~person_df[['lon', 'lat']].isna().any(axis=1)
        valid_mask = has_coords.values
        valid_count = valid_mask.sum()
        total_count = len(person_df)
        
        # Initialize all cluster_ids to 0 (missing coordinates)
        person_df['cluster_id'] = 0
        
        # If no valid coordinates at all, skip clustering
        if valid_count == 0:
            print(f"  User {person_df['who'].iloc[0]}: All {total_count} records have missing coordinates, skipping clustering")
            return person_df
        
        # If some records have missing coordinates, report stats
        if valid_count < total_count:
            missing_count = total_count - valid_count
            print(f"  User {person_df['who'].iloc[0]}: {missing_count} missing, {valid_count} valid coordinates")
        
        # Get valid records
        valid_indices = person_df[has_coords].index
        valid_lon = person_df.loc[valid_indices, 'lon'].values.copy()
        valid_lat = person_df.loc[valid_indices, 'lat'].values.copy()
        
        # UTM conversion (EPSG:32650 = UTM Zone 50N, covers Shenzhen)
        transformer = Transformer.from_crs("EPSG:4326", "EPSG:32650", always_xy=True)
        inverse_transformer = Transformer.from_crs("EPSG:32650", "EPSG:4326", always_xy=True)
        
        # Convert lon/lat to UTM coordinates (meters)
        utm_x, utm_y = transformer.transform(valid_lon, valid_lat)
        
        # Apply random jitter to UTM coordinates
        xy_valid = np.column_stack([utm_x, utm_y])
        xy_jittered = jitter_within_radius(xy_valid, max_radius_m=jitter_radius_m, rng=RNG)
        
        # Convert back to WGS84 lon/lat
        jittered_lon, jittered_lat = inverse_transformer.transform(xy_jittered[:, 0], xy_jittered[:, 1])
        
        # Update original lon/lat with jittered coordinates
        person_df.loc[valid_indices, 'lon'] = jittered_lon
        person_df.loc[valid_indices, 'lat'] = jittered_lat
        
        # HDBSCAN clustering
        clusterer = hdbscan.HDBSCAN(
            min_cluster_size=min_cluster_size, 
            min_samples=min_samples,
            metric='euclidean'
        )
        labels_valid = clusterer.fit_predict(xy_jittered)
        
        # Shift labels for valid records:
        # HDBSCAN returns -1 for noise, 0, 1, 2... for clusters
        # We want to keep -1 as noise, shift 0→1, 1→2, 2→3...
        # So we add 1 to non-noise labels only
        labels_shifted = np.where(labels_valid == -1, -1, labels_valid + 1)
        
        # Assign shifted labels back to valid records
        person_df.loc[valid_indices, 'cluster_id'] = labels_shifted
        
        return person_df
    
    # Group by 'who' and apply clustering
    print("Applying random jitter (300m) and HDBSCAN clustering...")
    print(f"Total users: {df['who'].nunique()}, Total records: {len(df)}")
    df_clustered = df.groupby('who', group_keys=False).apply(cluster_one_person)
    
    return df_clustered


# Apply clustering to processed data
print("Applying random jitter (300m) and HDBSCAN clustering...")
st_clustered = add_jitter_and_cluster(
    st_processed, 
    jitter_radius_m=300, 
    min_cluster_size=6, 
    min_samples=10
)

# Display clustering results
print(f"\nClustering complete!")
print(f"Total records: {len(st_clustered)}")
print(f"Missing coordinates (cluster_id=0): {(st_clustered['cluster_id'] == 0).sum()}")
print(f"HDBSCAN noise (cluster_id=-1): {(st_clustered['cluster_id'] == -1).sum()}")
print(f"Records in clusters (cluster_id>=1): {(st_clustered['cluster_id'] >= 1).sum()}")
print(f"Number of unique clusters: {st_clustered[st_clustered['cluster_id'] >= 1]['cluster_id'].nunique()}")

st_clustered.head(10)


Applying random jitter (300m) and HDBSCAN clustering...
Applying random jitter (300m) and HDBSCAN clustering...
Total users: 441, Total records: 567555
  User 395753: 1 missing, 1199 valid coordinates
  User 2436270: 5 missing, 1385 valid coordinates
  User 2590264: 3 missing, 1588 valid coordinates
  User 3087457: 2 missing, 1023 valid coordinates
  User 3549549: 2 missing, 1409 valid coordinates
  User 3803542: 1 missing, 1028 valid coordinates
  User 3932007: 1 missing, 1169 valid coordinates
  User 4717957: 1 missing, 751 valid coordinates
  User 4975553: 1 missing, 1271 valid coordinates
  User 6071595: 3 missing, 1640 valid coordinates
  User 6854307: 1 missing, 1322 valid coordinates
  User 7477809: 5 missing, 2034 valid coordinates
  User 7823042: 1 missing, 1262 valid coordinates
  User 8614296: 2 missing, 1388 valid coordinates
  User 12521378: 7 missing, 1696 valid coordinates
  User 13976993: 3 missing, 1176 valid coordinates
  User 14921919: 1 missing, 1372 valid coordinat

  df_clustered = df.groupby('who', group_keys=False).apply(cluster_one_person)


Unnamed: 0,who,t_start,t_end,lon,lat,ptype,poi,date,cluster_id
0,126272,2019-01-01 00:28:58,2019-01-01 10:11:49,113.834643,22.69071,1,0,20190101,3
1,126272,2019-01-01 10:19:15,2019-01-01 17:13:30,113.948529,22.530949,0,3,20190101,20
2,126272,2019-01-01 17:26:58,2019-01-01 18:47:28,113.889141,22.773277,0,14,20190101,9
3,126272,2019-01-01 18:59:06,2019-01-01 21:26:17,113.86688,22.573279,0,11,20190101,-1
4,126272,2019-01-02 10:05:07,2019-01-02 19:41:13,114.098086,22.554694,0,2,20190102,26
5,126272,2019-01-02 19:56:04,2019-01-02 22:37:02,114.097994,22.598955,0,9,20190102,10
6,126272,2019-01-03 00:46:34,2019-01-03 07:51:17,113.831582,22.688753,1,0,20190103,3
7,126272,2019-01-03 08:11:55,2019-01-03 14:11:15,113.842961,22.607576,2,1,20190103,24
8,126272,2019-01-03 14:11:19,2019-01-03 16:40:00,113.829167,22.737915,0,10,20190103,-1
9,126272,2019-01-03 16:53:00,2019-01-03 21:15:07,114.023939,22.531267,0,2,20190103,22


In [6]:
# ====== Step 3: Interactive Map Visualization ======
def visualize_user_clusters(
    df: pd.DataFrame, 
    user_id: int, 
    zoom: int = 12,
    height: int = 600
) -> px.scatter_mapbox:
    """
    Create an interactive map showing cluster locations for a specific user.
    
    Parameters:
    - df: DataFrame with 'who', 'lon', 'lat', 'cluster_id' columns
    - user_id: User ID to visualize
    - zoom: Initial zoom level
    - height: Map height in pixels
    
    Returns:
    - Plotly Express scatter_mapbox figure
    """
    user_data = df[df['who'] == user_id].copy()
    
    if user_data.empty:
        raise ValueError(f"No data found for user {user_id}")
    
    # Add cluster label for hover (missing vs noise vs cluster)
    # cluster_id = 0: Missing coordinates (outside study area)
    # cluster_id = -1: HDBSCAN noise
    # cluster_id = 1+: HDBSCAN clusters
    user_data['cluster_label'] = user_data['cluster_id'].apply(
        lambda x: f"Missing (Outside Study Area)" if x == 0 
                  else (f"HDBSCAN Noise" if x == -1 
                  else f"Cluster {x}")
    )
    
    # Define custom colors: missing=red, noise=gray, clusters=auto
    color_discrete_map = {
        'Missing (Outside Study Area)': 'red',
        'HDBSCAN Noise': 'gray',
    }
    
    # Get unique cluster labels for ordering
    cluster_labels = [f'Cluster {i}' for i in sorted(user_data[user_data['cluster_id'] >= 1]['cluster_id'].unique())]
    
    # Create scatter mapbox with custom colors
    fig = px.scatter_mapbox(
        user_data,
        lat='lat',
        lon='lon',
        color='cluster_label',
        color_discrete_map=color_discrete_map,
        category_orders={'cluster_label': ['Missing (Outside Study Area)', 'HDBSCAN Noise'] + cluster_labels},
        hover_data={
            'who': True,
            't_start': True,
            't_end': True,
            'cluster_id': True,
            'lat': ':.5f',
            'lon': ':.5f'
        },
        title=f"Stay locations for User {user_id}",
        zoom=zoom,
        height=height,
        size_max=15
    )
    
    fig.update_layout(mapbox_style="carto-positron")
    fig.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
    
    return fig


# Example: Visualize clusters for a specific user
# Change user_id to visualize different users
inspect_user = st_clustered['who'].iloc[0]  # First user as example
print(f"Visualizing clusters for user {inspect_user}...")
fig = visualize_user_clusters(st_clustered, user_id=inspect_user, zoom=12, height=500)
fig.show()

# Visualize summary statistics per user
# New cluster_id meanings: 0 = missing, -1 = noise, 1+ = clusters
print("\nCluster statistics per user:")
user_stats = st_clustered.groupby('who').agg({
    'cluster_id': [
        'count',                                          # total_stays
        lambda x: (x == 0).sum(),                         # missing_stays
        lambda x: (x != 0).sum(),                          # valid_stays (not missing)
        lambda x: (x == -1).sum(),                         # noise_stays
        lambda x: (x >= 1).sum(),                          # clustered_stays
        lambda x: (x[x >= 1].nunique() if (x >= 1).any() else 0)  # num_clusters
    ]
}).reset_index()
user_stats.columns = ['who', 'total_stays', 'missing_stays', 'valid_stays', 'noise_stays', 'clustered_stays', 'num_clusters']
print(user_stats.head(10))

# Summary
print(f"\n=== Overall Statistics ===")
print(f"Total records: {len(st_clustered)}")
print(f"Missing coordinates (cluster_id=0): {(st_clustered['cluster_id'] == 0).sum()}")
print(f"Valid coordinates: {(st_clustered['cluster_id'] != 0).sum()}")
print(f"  - HDBSCAN noise (cluster_id=-1): {(st_clustered['cluster_id'] == -1).sum()}")
print(f"  - In clusters (cluster_id>=1): {(st_clustered['cluster_id'] >= 1).sum()}")
print(f"Number of unique clusters: {st_clustered[st_clustered['cluster_id'] >= 1]['cluster_id'].nunique()}")


  fig = px.scatter_mapbox(


Visualizing clusters for user 126272...



Cluster statistics per user:
       who  total_stays  missing_stays  valid_stays  noise_stays  \
0   126272         1082              0         1082          310   
1   278978          925              0          925          241   
2   395753         1200              1         1199          408   
3   506035         1288              0         1288          503   
4   612431         1432              0         1432          390   
5   661336         1608              0         1608          693   
6   824617          262              0          262           75   
7   928827         1620              0         1620          700   
8  1017498          769              0          769          221   
9  1159109         1567              0         1567          407   

   clustered_stays  num_clusters  
0              772            33  
1              684            25  
2              791            36  
3              785            29  
4             1042            33  
5          

In [7]:

# ====== Step 4 & 5: OOP Data Models & Conversion ======
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from datetime import datetime, timedelta


@dataclass
class Visit:
    """
    Represents a single stay/visit at a location.
    
    Attributes:
    - t_start: Start timestamp of the visit
    - t_end: End timestamp of the visit
    - lon: Longitude of the location
    - lat: Latitude of the location
    - cluster_id: Cluster label (0 = missing/outside study area, -1 = HDBSCAN noise, 1+ = HDBSCAN clusters)
    - ptype: Place type (optional)
    - poi: POI information (optional)
    """
    t_start: datetime
    t_end: datetime
    lon: float
    lat: float
    cluster_id: int = 0  # 0 = missing, -1 = noise, 1+ = clusters
    ptype: Optional[int] = None
    poi: Optional[int] = None
    
    @property
    def duration(self) -> timedelta:
        """Calculate duration of the visit."""
        return self.t_end - self.t_start
    
    @property
    def date(self) -> int:
        """Extract date in YYYYMMDD format from t_start."""
        return int(self.t_start.strftime('%Y%m%d'))
    
    @classmethod
    def from_dict(cls, data: dict) -> 'Visit':
        """Create Visit from dictionary."""
        return cls(
            t_start=pd.to_datetime(data['t_start']),
            t_end=pd.to_datetime(data['t_end']),
            lon=float(data['lon']),
            lat=float(data['lat']),
            cluster_id=int(data.get('cluster_id', -1)),
            ptype=int(data['ptype']) if pd.notna(data.get('ptype')) else None,
            poi=int(data['poi']) if pd.notna(data.get('poi')) else None
        )
    
    def to_dict(self) -> dict:
        """Convert Visit to dictionary."""
        return {
            't_start': self.t_start,
            't_end': self.t_end,
            'lon': self.lon,
            'lat': self.lat,
            'cluster_id': self.cluster_id,
            'ptype': self.ptype,
            'poi': self.poi
        }


@dataclass
class Trajectory:
    """
    Represents a daily trajectory (from 3:00 AM to next day 3:00 AM).
    
    Attributes:
    - date: Date of the trajectory (YYYYMMDD format, representing the day starting at 3 AM)
    - visits: List of Visit objects
    """
    date: int
    visits: List[Visit] = field(default_factory=list)
    
    @property
    def num_visits(self) -> int:
        """Return number of visits."""
        return len(self.visits)
    
    @property
    def start_time(self) -> Optional[datetime]:
        """Return start time of first visit."""
        return self.visits[0].t_start if self.visits else None
    
    @property
    def end_time(self) -> Optional[datetime]:
        """Return end time of last visit."""
        return self.visits[-1].t_end if self.visits else None
    
    def add_visit(self, visit: Visit):
        """Add a visit to the trajectory."""
        self.visits.append(visit)
        # Keep visits sorted by start time
        self.visits.sort(key=lambda v: v.t_start)
    
    def to_dict(self) -> dict:
        """Convert trajectory to dictionary."""
        return {
            'date': self.date,
            'visits': [v.to_dict() for v in self.visits]
        }


class User:
    """
    Represents a user with their trajectories and memory.
    
    Attributes:
    - id: User identifier
    - trajectories: Dictionary mapping date to Trajectory
    - memory: User's memory (to be implemented)
    """
    
    # Reference time: 3:00 AM threshold for day boundaries
    DAY_THRESHOLD_HOUR = 3
    
    def __init__(self, id: int):
        """
        Initialize a User.
        
        Parameters:
        - id: User identifier
        """
        self.id = id
        self.trajectories: Dict[int, Trajectory] = {}
        self.memory: Optional[Dict] = None
    
    @property
    def num_trajectories(self) -> int:
        """Return number of trajectories."""
        return len(self.trajectories)
    
    @property
    def total_visits(self) -> int:
        """Return total number of visits across all trajectories."""
        return sum(t.num_visits for t in self.trajectories.values())
    
    @property
    def unique_clusters(self) -> set:
        """Return set of unique cluster IDs visited by this user.
        
        Note: cluster_id >= 1 are actual clusters (0 = missing, -1 = noise)
        """
        clusters = set()
        for traj in self.trajectories.values():
            for visit in traj.visits:
                if visit.cluster_id >= 1:  # Only count actual clusters (not noise=-1 or missing=0)
                    clusters.add(visit.cluster_id)
        return clusters
    
    def add_trajectory(self, trajectory: Trajectory):
        """Add a trajectory to the user."""
        self.trajectories[trajectory.date] = trajectory
    
    def get_trajectory(self, date: int) -> Optional[Trajectory]:
        """Get trajectory for a specific date."""
        return self.trajectories.get(date)
    
    def get_all_visits(self) -> List[Visit]:
        """Get all visits across all trajectories."""
        visits = []
        for traj in self.trajectories.values():
            visits.extend(traj.visits)
        return sorted(visits, key=lambda v: v.t_start)
    
    def to_dataframe(self) -> pd.DataFrame:
        """Convert user data to DataFrame."""
        records = []
        for date, traj in self.trajectories.items():
            for visit in traj.visits:
                records.append({
                    'who': self.id,
                    'date': date,
                    't_start': visit.t_start,
                    't_end': visit.t_end,
                    'lon': visit.lon,
                    'lat': visit.lat,
                    'cluster_id': visit.cluster_id,
                    'ptype': visit.ptype,
                    'poi': visit.poi
                })
        return pd.DataFrame(records)
    
    @classmethod
    def from_dataframe(cls, df: pd.DataFrame) -> 'User':
        """
        Create User objects from a DataFrame.
        
        Handles splitting visits that cross midnight (3 AM threshold).
        """
        # Ensure datetime columns
        df = df.copy()
        df['t_start'] = pd.to_datetime(df['t_start'])
        df['t_end'] = pd.to_datetime(df['t_end'])
        
        users_dict: Dict[int, User] = {}
        
        for _, row in df.iterrows():
            who = int(row['who'])
            
            if who not in users_dict:
                users_dict[who] = User(id=who)
            
            user = users_dict[who]
            
            # Process visits that may cross the 3 AM boundary
            visits_to_add = cls._process_crossing_visits(row)
            
            for visit, visit_date in visits_to_add:
                # Get or create trajectory for this date
                if visit_date not in user.trajectories:
                    user.trajectories[visit_date] = Trajectory(date=visit_date)
                
                user.trajectories[visit_date].add_visit(visit)
        
        return users_dict
    
    @staticmethod
    def _process_crossing_visits(row: pd.Series) -> List[tuple]:
        """
        Process a visit record that may cross midnight (3 AM threshold).
        
        The daily trajectory is defined as: from 3:00 AM today to 3:00 AM tomorrow.
        A timestamp is assigned to its "active day" based on when the person was likely active:
        - Before 3 AM: belongs to previous calendar day (stayed up late)
        - After 3 AM: belongs to current calendar day
        
        This method splits a visit into segments that fit within individual active days.
        
        Handles complex cases:
        1. Visit crosses a single 3 AM threshold
        2. Visit crosses multiple 3 AM thresholds (e.g., spanning multiple days)
        3. Visit starts after 3 AM and ends after 3 AM but on different days
        
        Returns:
        - List of (Visit, date) tuples, where date is YYYYMMDD of the active day
        """
        threshold_hour = User.DAY_THRESHOLD_HOUR
        result = []
        
        t_start = pd.to_datetime(row['t_start'])
        t_end = pd.to_datetime(row['t_end'])
        
        # Create initial visit with original attributes
        visit = Visit(
            t_start=t_start,
            t_end=t_end,
            lon=float(row['lon']),
            lat=float(row['lat']),
            cluster_id=int(row.get('cluster_id', -1)),
            ptype=int(row['ptype']) if pd.notna(row.get('ptype')) else None,
            poi=int(row['poi']) if pd.notna(row.get('poi')) else None
        )
        
        # Calculate threshold dates for start and end
        # The threshold date represents the "active day" for a given timestamp
        # This is based on the intuition that:
        # - Early morning (e.g., 2 AM) belongs to the previous calendar day (staying up late)
        # - Late night (e.g., 4 AM) belongs to the current calendar day
        #
        # A trajectory day is: from 3:00 AM today to 3:00 AM tomorrow
        # e.g., 2019-01-01 02:00 belongs to threshold date 2018-12-31 (stayed up late)
        # e.g., 2019-01-01 04:00 belongs to threshold date 2019-01-01
        def get_threshold_date(dt: pd.Timestamp) -> int:
            """Get the threshold date (YYYYMMDD) for a given datetime."""
            if dt.hour >= threshold_hour:
                # After 3 AM: belongs to current calendar day
                threshold_date = dt.date()
            else:
                # Before 3 AM: belongs to previous calendar day (stayed up late)
                threshold_date = (dt - timedelta(days=1)).date()
            return int(threshold_date.strftime('%Y%m%d'))
        
        start_threshold_date = get_threshold_date(t_start)
        end_threshold_date = get_threshold_date(t_end)
        
        # If no threshold crossing, no splitting needed
        if start_threshold_date == end_threshold_date:
            result.append((visit, start_threshold_date))
            return result
        
        # Visit spans multiple threshold dates - need to split
        # Find the first threshold that is AFTER t_start
        # If t_start is already past today's threshold, first threshold is tomorrow
        first_threshold = t_start.replace(hour=threshold_hour, minute=0, second=0)
        if t_start >= first_threshold:
            first_threshold = first_threshold + timedelta(days=1)
        
        # Part 1: From t_start to first threshold (if t_start is before first threshold)
        if t_start < first_threshold:
            visit1 = Visit(
                t_start=t_start,
                t_end=first_threshold,
                lon=visit.lon,
                lat=visit.lat,
                cluster_id=visit.cluster_id,
                ptype=visit.ptype,
                poi=visit.poi
            )
            # Segment ends before first threshold, so it belongs to previous trajectory
            # But we use start_threshold_date which correctly handles this
            result.append((visit1, start_threshold_date))
        
        # Middle parts: full threshold-to-threshold segments
        # These represent full days where the person was stationary
        current_threshold = first_threshold
        while current_threshold < t_end:
            next_threshold = current_threshold + timedelta(days=1)
            if next_threshold > t_end:
                break
            
            # Create a visit representing the full stationary day
            middle_visit = Visit(
                t_start=current_threshold,
                t_end=next_threshold,
                lon=visit.lon,
                lat=visit.lat,
                cluster_id=visit.cluster_id,
                ptype=visit.ptype,
                poi=visit.poi
            )
            # This segment crosses midnight, so belongs to the day after current_threshold starts
            # Which is exactly what get_threshold_date(t_end) gives us
            current_threshold_date = get_threshold_date(current_threshold + timedelta(hours=12))
            result.append((middle_visit, current_threshold_date))
            
            current_threshold = next_threshold
        
        # Last part: from last threshold to t_end
        last_threshold = current_threshold
        if t_end > last_threshold:
            last_visit = Visit(
                t_start=last_threshold,
                t_end=t_end,
                lon=visit.lon,
                lat=visit.lat,
                cluster_id=visit.cluster_id,
                ptype=visit.ptype,
                poi=visit.poi
            )
            # Use the end time to determine the trajectory date
            last_threshold_date = get_threshold_date(t_end)
            result.append((last_visit, last_threshold_date))
        
        return result
    
    def __repr__(self) -> str:
        return f"User(id={self.id}, trajectories={self.num_trajectories}, total_visits={self.total_visits})"


def convert_dataframe_to_users(df: pd.DataFrame) -> Dict[int, User]:
    """
    Convert a DataFrame to a dictionary of User objects.
    
    Parameters:
    - df: DataFrame with columns ['who', 'date', 't_start', 't_end', 'lon', 'lat', 'cluster_id', 'ptype', 'poi']
    
    Returns:
    - Dictionary mapping user_id to User objects
    """
    print(f"Converting DataFrame with {len(df)} records to User objects...")
    users = User.from_dataframe(df)
    print(f"Created {len(users)} User objects")
    return users


# ====== Execute Step 5: Convert DataFrame to User Model ======
print("\n" + "="*60)
print("Converting DataFrame to User Model...")
print("="*60)

users_dict = convert_dataframe_to_users(st_clustered)

# Display sample users
print("\nSample users:")
for i, (user_id, user) in enumerate(list(users_dict.items())[:3]):
    print(f"\n{user}")
    print(f"  - Unique clusters visited: {len(user.unique_clusters)}")
    print(f"  - Date range: {min(user.trajectories.keys())} to {max(user.trajectories.keys())}")

# Convert back to DataFrame to verify
print("\nVerification: Converting User objects back to DataFrame...")
test_user_id = list(users_dict.keys())[0]
test_user = users_dict[test_user_id]
test_df = test_user.to_dataframe()
print(f"User {test_user_id} converted back to DataFrame: {len(test_df)} records")
print(test_df.head())


Converting DataFrame to User Model...
Converting DataFrame with 567555 records to User objects...
Created 441 User objects

Sample users:

User(id=126272, trajectories=350, total_visits=1320)
  - Unique clusters visited: 33
  - Date range: 20181231 to 20191231

User(id=278978, trajectories=346, total_visits=1182)
  - Unique clusters visited: 25
  - Date range: 20181231 to 20191231

User(id=395753, trajectories=360, total_visits=1537)
  - Unique clusters visited: 36
  - Date range: 20181231 to 20191231

Verification: Converting User objects back to DataFrame...
User 126272 converted back to DataFrame: 1320 records
      who      date             t_start               t_end         lon  \
0  126272  20181231 2019-01-01 00:28:58 2019-01-01 03:00:00  113.834643   
1  126272  20190101 2019-01-01 03:00:00 2019-01-01 10:11:49  113.834643   
2  126272  20190101 2019-01-01 10:19:15 2019-01-01 17:13:30  113.948529   
3  126272  20190101 2019-01-01 17:26:58 2019-01-01 18:47:28  113.889141   
4  

# Model-free RL modeling

In [9]:
# Reference: 251111_rl_demo.ipynb - MF modeling approach
# This implements a TD-learning based MF model with time-based rewards

from scipy.special import logsumexp
from scipy.optimize import minimize
from collections import defaultdict
from typing import Dict, List, Tuple, Optional, Any
import warnings

# =============================================================================
# MF Model Configuration
# =============================================================================

@dataclass
class MFConfig:
    """Configuration for Model-Free RL estimation."""
    # Learning rate for TD updates (alpha)
    alpha_init: float = 0.1
    # Softmax inverse temperature (beta)
    beta_init: float = 1.0
    # Exploration probability (epsilon)
    epsilon_init: float = 0.1
    # Reward type: 'linear', 'power', 'log'
    reward_type: str = 'log'
    # Reward parameter (for power/log functions)
    reward_param_init: float = 1.0
    # Maximum iterations for optimization
    maxiter: int = 1000
    # Convergence tolerance
    ftol: float = 1e-6


def compute_reward_array_mf(stay_minutes: np.ndarray, reward_type: str = 'log', 
                            reward_param: Optional[float] = None) -> np.ndarray:
    """
    Compute reward array based on stay duration (time-based rewards).
    
    Parameters:
    - stay_minutes: Array of stay durations in minutes
    - reward_type: 'linear', 'power', or 'log'
    - reward_param: Parameter for power/log reward functions
    
    Returns:
    - Reward array
    """
    # Base reward: normalized stay duration (per 30 minutes)
    base = np.maximum(stay_minutes / 30.0, 0.0)
    
    if reward_type == 'linear' or reward_param is None:
        rewards = base
    elif reward_type == 'power':
        rewards = np.power(base, reward_param)
    elif reward_type == 'log':
        rewards = np.log1p(reward_param * base) / np.log1p(reward_param)
    else:
        rewards = base
    
    # Handle numerical issues
    return np.nan_to_num(rewards, nan=0.0, posinf=0.0, neginf=0.0)


def unpack_params_mf(theta: np.ndarray, feature_dim: int = 0, 
                     has_reward_param: bool = True) -> Dict[str, float]:
    """
    Unpack MF model parameters from optimization vector.
    """
    idx = 0
    
    # Feature weights
    w = theta[:feature_dim] if feature_dim > 0 else np.array([])
    idx += feature_dim
    
    # TD learning rate: alpha = sigmoid(logit_alpha)
    logit_alpha = theta[idx]; idx += 1
    alpha = 1.0 / (1.0 + np.exp(-logit_alpha))
    
    # Softmax temperature: beta = exp(log_beta)
    log_beta = theta[idx]; idx += 1
    beta = np.exp(log_beta)
    
    # Exploration probability: epsilon = sigmoid(logit_epsilon)
    logit_epsilon = theta[idx]; idx += 1
    epsilon = 1.0 / (1.0 + np.exp(-logit_epsilon))
    
    result = {'w': w, 'alpha': alpha, 'beta': beta, 'epsilon': epsilon}
    
    if has_reward_param:
        logit_reward = theta[idx]; idx += 1
        reward_param = np.exp(logit_reward)
        result['reward_param'] = reward_param
    
    return result


def pack_params_mf(alpha: float, beta: float, epsilon: float, 
                   reward_param: Optional[float] = None,
                   feature_dim: int = 0) -> np.ndarray:
    """Pack MF model parameters into optimization vector."""
    params = []
    
    if feature_dim > 0:
        params.extend([0.0] * feature_dim)
    
    params.append(np.log(alpha / (1.0 - alpha)))  # logit_alpha
    params.append(np.log(beta))                   # log_beta
    params.append(np.log(epsilon / (1.0 - epsilon)))  # logit_epsilon
    
    if reward_param is not None:
        params.append(np.log(reward_param))
    
    return np.array(params, dtype=np.float64)


def compute_transition_data(user_df: pd.DataFrame) -> Dict[str, Any]:
    """Prepare transition data from user DataFrame for MF modeling."""
    df = user_df.sort_values('t_start').reset_index(drop=True)
    
    states = df['cluster_id'].astype(int).to_numpy()
    stay_minutes = (df['t_end'] - df['t_start']).dt.total_seconds() / 60.0
    stay_minutes = stay_minutes.to_numpy()
    
    if 'date' in df.columns:
        date_array = df['date'].to_numpy()
    else:
        date_array = None
    
    n_steps = len(states)
    n_transitions = n_steps - 1
    
    same_day_next = np.zeros(n_transitions, dtype=bool)
    if date_array is not None and n_transitions > 1:
        for t in range(n_transitions - 1):
            same_day_next[t] = (date_array[t + 1] == date_array[t + 2])
    
    return {
        'states': states,
        'stay_minutes': stay_minutes,
        'date_array': date_array,
        'n_transitions': n_transitions,
        'same_day_next': same_day_next,
        'df': df
    }


def simulate_and_loglik_mf(theta: np.ndarray,
                           trans_data: Dict[str, Any],
                           feature_dim: int = 0,
                           reward_type: str = 'log',
                           has_reward_param: bool = True) -> float:
    """Compute negative log-likelihood for MF (TD) model."""
    params = unpack_params_mf(theta, feature_dim, has_reward_param)
    alpha = params['alpha']
    beta = params['beta']
    epsilon = params['epsilon']
    reward_param = params.get('reward_param', 1.0)
    
    states = trans_data['states']
    stay_minutes = trans_data['stay_minutes']
    n_transitions = trans_data['n_transitions']
    same_day_next = trans_data['same_day_next']
    
    reward_array = compute_reward_array_mf(stay_minutes, reward_type, reward_param)
    
    Q_td = defaultdict(float)
    known_actions_set = set()
    loglik = 0.0
    
    for t in range(n_transitions):
        s = int(states[t])
        a = int(states[t + 1])
        r_t = float(reward_array[t])
        
        if a != -1:
            known_actions_set.add(a)
        
        known_list = sorted([act for act in known_actions_set if act != -1])
        unknown_list = [act for act in known_actions_set if act not in known_list and act != -1]
        
        if -1 not in known_list and -1 not in unknown_list:
            unknown_list.append(-1)
        
        if known_list:
            q_values = []
            for act in known_list:
                q_values.append(Q_td[(s, act)])
            q_values = np.asarray(q_values, dtype=np.float64)
            logits = beta * q_values
            probs_exploit = np.exp(logits - logsumexp(logits))
        else:
            probs_exploit = None
        
        if probs_exploit is not None and a in known_list:
            idx_a = known_list.index(a)
            pa = (1.0 - epsilon) * probs_exploit[idx_a]
        elif a in unknown_list and len(unknown_list) > 0:
            pa = epsilon * (1.0 / len(unknown_list))
        else:
            pa = 1e-12
        
        loglik += np.log(pa + 1e-12)
        
        if t < n_transitions - 1 and same_day_next[t]:
            a_next = int(states[t + 2])
            next_Q_td = Q_td[(a, a_next)]
        else:
            next_Q_td = 0.0
        
        delta = r_t + next_Q_td - Q_td[(s, a)]
        Q_td[(s, a)] += alpha * delta
    
    return -loglik


def fit_mf_model(user_df: pd.DataFrame,
                 config: Optional[MFConfig] = None,
                 verbose: bool = True) -> Dict[str, Any]:
    """Fit MF (TD) model to a single user's trajectory data."""
    if config is None:
        config = MFConfig()
    
    trans_data = compute_transition_data(user_df)
    n_transitions = trans_data['n_transitions']
    
    if verbose:
        print(f"Preparing MF model for {len(user_df)} visits, {n_transitions} transitions...")
    
    has_reward_param = config.reward_type in ('power', 'log')
    extra_params = 3 + (1 if has_reward_param else 0)
    param_dim = extra_params
    
    initial_theta = pack_params_mf(
        alpha=config.alpha_init,
        beta=config.beta_init,
        epsilon=config.epsilon_init,
        reward_param=config.reward_param_init if has_reward_param else None,
        feature_dim=0
    )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = minimize(
            simulate_and_loglik_mf,
            initial_theta,
            args=(trans_data, 0, config.reward_type, has_reward_param),
            method='L-BFGS-B',
            options={'maxiter': config.maxiter, 'ftol': config.ftol}
        )
    
    fitted_params = unpack_params_mf(result.x, 0, has_reward_param)
    final_loglik = -result.fun
    
    k_params = param_dim
    AIC = 2 * k_params - 2 * final_loglik
    BIC = k_params * np.log(n_transitions) - 2 * final_loglik if n_transitions > 0 else np.inf
    
    summary = {
        'n_visits': len(user_df),
        'n_transitions': int(n_transitions),
        'log_likelihood': float(final_loglik),
        'AIC': float(AIC),
        'BIC': float(BIC),
        'alpha_td': float(fitted_params['alpha']),
        'beta': float(fitted_params['beta']),
        'epsilon_explore': float(fitted_params['epsilon']),
        'reward_type': config.reward_type,
        'reward_param': float(fitted_params.get('reward_param', None)),
        'converged': result.success,
        'n_iterations': result.nit if hasattr(result, 'nit') else None,
        'optimization_message': result.message if hasattr(result, 'message') else None
    }
    
    if verbose:
        print(f"MF model fitting {'converged' if result.success else 'did not converge'}.")
        print(f"  Log-likelihood: {final_loglik:.2f}, AIC: {AIC:.2f}, BIC: {BIC:.2f}")
        print(f"  alpha: {fitted_params['alpha']:.4f}, beta: {fitted_params['beta']:.4f}, epsilon: {fitted_params['epsilon']:.4f}")
        if has_reward_param:
            print(f"  reward_param: {fitted_params.get('reward_param', None):.4f}")
    
    return summary


def fit_mf_for_all_users(users_dict: Dict[int, Any],
                         config: Optional[MFConfig] = None,
                         sample_size: Optional[int] = None,
                         verbose: bool = True) -> pd.DataFrame:
    """Fit MF model for all users in the dictionary."""
    if config is None:
        config = MFConfig()
    
    user_ids = list(users_dict.keys())
    if sample_size is not None:
        user_ids = user_ids[:sample_size]
    
    results = []
    for i, user_id in enumerate(user_ids):
        if verbose:
            print(f"[{i+1}/{len(user_ids)}] Fitting MF for user {user_id}...")
        
        user = users_dict[user_id]
        user_df = user.to_dataframe()
        
        try:
            result = fit_mf_model(user_df, config, verbose=False)
            result['user_id'] = user_id
            results.append(result)
        except Exception as e:
            if verbose:
                print(f"  Error fitting user {user_id}: {e}")
            results.append({
                'user_id': user_id, 'n_visits': 0, 'n_transitions': 0,
                'log_likelihood': np.nan, 'AIC': np.nan, 'BIC': np.nan,
                'alpha_td': np.nan, 'beta': np.nan, 'epsilon_explore': np.nan,
                'reward_type': config.reward_type, 'reward_param': np.nan,
                'converged': False, 'n_iterations': None, 'optimization_message': str(e)
            })
    
    return pd.DataFrame(results)


# ====== Execute MF Modeling ======
print("\n" + "="*60)
print("Model-Free (MF) RL Estimation")
print("="*60)

mf_config = MFConfig(
    alpha_init=0.1, beta_init=1.0, epsilon_init=0.1,
    reward_type='log', reward_param_init=1.0,
    maxiter=500, ftol=1e-6
)

mf_results = fit_mf_for_all_users(
    users_dict, config=mf_config, 
    sample_size=10, verbose=True
)

print("\nMF Model Fitting Summary:")
print(mf_results[['user_id', 'n_transitions', 'log_likelihood', 'AIC', 'BIC', 
                  'alpha_td', 'beta', 'epsilon_explore']].to_string(index=False))

mf_results.to_csv('mf_estimation_results.csv', index=False)
print(f"\nMF estimation results saved to 'mf_estimation_results.csv'")

print("\nAggregate Statistics Across Users:")
print(f"  Average alpha (TD rate): {mf_results['alpha_td'].mean():.4f} ± {mf_results['alpha_td'].std():.4f}")
print(f"  Average beta (softmax temp): {mf_results['beta'].mean():.4f} ± {mf_results['beta'].std():.4f}")
print(f"  Average epsilon (explore): {mf_results['epsilon_explore'].mean():.4f} ± {mf_results['epsilon_explore'].std():.4f}")
print(f"  Average log-likelihood: {mf_results['log_likelihood'].mean():.2f} ± {mf_results['log_likelihood'].std():.2f}")



Model-Free (MF) RL Estimation
[1/10] Fitting MF for user 126272...


KeyboardInterrupt: 