### Task 8 Solution

In [1]:
import pandas as pd
from haversine import haversine, Unit
from datetime import timedelta
import itertools
from shapely.geometry import Polygon
from utils.utils import *
import os

# SQLAlchemy-style URL for the course MySQL server; the credentials are read
# from the DB_USER / DB_PASSWORD environment variables so they never appear in
# the notebook. A missing variable is rendered as the text "None".
connection_string = "mysql+mysqlconnector://{user}:{password}@tdt4225-10.idi.ntnu.no:3306/default_db".format(
    user=os.environ.get('DB_USER'),
    password=os.environ.get('DB_PASSWORD'),
)

In [4]:
# Fetch every track point together with the id of the user who owns the
# activity it belongs to (activity joined to track_point on the activity id).
sql_query = """

    SELECT a.user_id, tp.activity_id, tp.date_time, tp.lat, tp.lon
    FROM activity a JOIN track_point tp ON a.id = tp.activity_id
"""

# query_to_dataframe comes from the star import of utils.utils (not visible
# here); presumably it runs the query and returns a DataFrame - verify there.
tp_df = query_to_dataframe(connection_string, sql_query)
tp_df

Unnamed: 0,user_id,activity_id,date_time,lat,lon
0,000,20081023025304000,2008-10-23 02:53:10,39.984683,116.318450
1,000,20081023025304000,2008-10-23 02:53:15,39.984686,116.318417
2,000,20081023025304000,2008-10-23 02:53:20,39.984688,116.318385
3,000,20081023025304000,2008-10-23 02:53:25,39.984655,116.318263
4,000,20081023025304000,2008-10-23 02:53:30,39.984611,116.318026
...,...,...,...,...,...
9555432,181,20080314025755181,2008-03-14 03:39:56,40.914867,111.710500
9555433,181,20080314025755181,2008-03-14 03:41:17,40.914267,111.710333
9555434,181,20080314025755181,2008-03-14 03:43:02,40.912467,111.710667
9555435,181,20080314025755181,2008-03-14 03:43:28,40.911517,111.711317


In [5]:
def _points_as_rows(activity_points):
    """Return one activity's track points as a list of [date_time, lat, lon] rows."""
    return activity_points[['date_time', 'lat', 'lon']].values.tolist()


# Lookup table: (user_id, activity_id) -> list of [date_time, lat, lon] rows.
tp_dict = tp_df.groupby(['user_id', 'activity_id']).apply(_points_as_rows).to_dict()


In [6]:
# Fetch the owner, id and start/end timestamps of every activity.
sql_query = """

    SELECT a.user_id, a.id, a.start_date_time, a.end_date_time
    FROM activity a 
"""

# query_to_dataframe comes from the star import of utils.utils (not visible
# here); presumably it runs the query and returns a DataFrame - verify there.
activity_df = query_to_dataframe(connection_string, sql_query)
activity_df

Unnamed: 0,user_id,id,start_date_time,end_date_time
0,163,20000101231219163,2000-01-01 23:12:19,2000-01-01 23:15:23
1,142,20070412093132142,2007-04-12 09:31:32,2007-04-12 11:33:40
2,161,20070412101853161,2007-04-12 10:18:53,2007-04-12 10:23:15
3,163,20070412102116163,2007-04-12 10:21:16,2007-04-12 14:56:56
4,161,20070412102325161,2007-04-12 10:23:25,2007-04-12 10:26:25
...,...,...,...,...
16043,163,20120724204557163,2012-07-24 20:45:57,2012-07-24 22:01:15
16044,163,20120725080203163,2012-07-25 08:02:03,2012-07-25 08:26:13
16045,163,20120725190300163,2012-07-25 19:03:00,2012-07-25 19:55:08
16046,163,20120726080106163,2012-07-26 08:01:06,2012-07-26 08:27:24


### Find overlapping time-intervals 

The cells below define the functions that extract the activities and track points whose time intervals overlap, including a buffer of 30 seconds on each side of the interval.

In [120]:

# Extract the first and last registration time of each user
def create_user_first_last_times(activity_df):
    """Summarise the overall recording span of every user.

    Groups ``activity_df`` by ``user_id`` and returns a DataFrame with one row
    per user and the columns ``user_id``, ``first_registered_time`` (earliest
    ``start_date_time``) and ``last_registered_time`` (latest
    ``end_date_time``).
    """
    per_user = activity_df.groupby('user_id')
    span_per_user = per_user.agg(
        first_registered_time=('start_date_time', 'min'),
        last_registered_time=('end_date_time', 'max'),
    )
    return span_per_user.reset_index()

# Find the overlapping time interval for each unique pair of users, based on their first/last registered times
def find_overlap_time_intervals(user_first_last_times):
    """Find, for every pair of users, the period in which both were recording.

    Parameters
    ----------
    user_first_last_times : DataFrame
        One row per user with the columns ``user_id``,
        ``first_registered_time`` and ``last_registered_time``.

    Returns
    -------
    dict
        ``{(user1_id, user2_id): (overlap_start, overlap_end)}`` for every
        unordered pair of distinct users whose spans intersect. A pair is
        only included when ``overlap_start < overlap_end`` (strictly), so
        spans that merely touch are left out.
    """
    # Build the per-user lookup once instead of filtering the whole frame
    # twice for each of the O(n^2) user pairs. setdefault keeps the first row
    # of a user, matching a "first matching row" lookup.
    spans = {}
    for user_id, first_seen, last_seen in zip(
        user_first_last_times['user_id'],
        user_first_last_times['first_registered_time'],
        user_first_last_times['last_registered_time'],
    ):
        spans.setdefault(user_id, (first_seen, last_seen))

    overlaps = {}
    unique_user_ids = user_first_last_times['user_id'].unique()
    for user1_id, user2_id in itertools.combinations(unique_user_ids, 2):
        first1, last1 = spans[user1_id]
        first2, last2 = spans[user2_id]

        overlap_start = max(first1, first2)
        overlap_end = min(last1, last2)
        if overlap_start < overlap_end:
            overlaps[(user1_id, user2_id)] = (overlap_start, overlap_end)

    return overlaps

# Find the activities that overlap between a user pair and return them as a dictionary
def _activities_in_window(activity_df, user_id, window_start, window_end):
    """Return one user's activities whose time span intersects the window, as a list of row tuples."""
    # Standard interval-intersection test. Checking only whether the start or
    # the end lies inside the window would drop an activity that begins before
    # the window and ends after it.
    in_window = (
        (activity_df['user_id'] == user_id)
        & (activity_df['start_date_time'] <= window_end)
        & (activity_df['end_date_time'] >= window_start)
    )
    return list(activity_df[in_window].itertuples())


def find_overlapping_activities(overlapping_user_time_intervals, activity_df, include_time = False):
    """Find the pairs of activities of two users that overlap in time.

    Parameters
    ----------
    overlapping_user_time_intervals : dict
        ``{(user1_id, user2_id): (overlap_start, overlap_end)}``, the period
        in which both users of a pair have registered data.
    activity_df : DataFrame
        Activities with the columns ``user_id``, ``id``, ``start_date_time``
        and ``end_date_time``.
    include_time : bool, default False
        When true, each result entry also carries the overlap of the two
        activities.

    Returns
    -------
    dict
        ``{(user1_id, user2_id): [entry, ...]}`` where an entry is
        ``(activity1_id, activity2_id)`` or, with ``include_time``,
        ``(activity1_id, activity2_id, overlap_start, overlap_end)``. Every
        user pair of the input is present, possibly with an empty list.

    Notes
    -----
    Candidate activities are pre-selected against the users' common period
    widened by 30 seconds on each side.
    NOTE(review): the pairwise test itself (``overlap_start < overlap_end``)
    applies no buffer, so two activities that are less than 30 seconds apart
    but do not actually overlap are not reported - confirm this is intended.
    """
    buffer = timedelta(seconds=30)
    overlapping_activities = {}

    for (user1_id, user2_id), (window_start, window_end) in overlapping_user_time_intervals.items():
        buffered_start = window_start - buffer
        buffered_end = window_end + buffer

        user1_activities = _activities_in_window(activity_df, user1_id, buffered_start, buffered_end)
        user2_activities = _activities_in_window(activity_df, user2_id, buffered_start, buffered_end)

        matches = []
        for u1 in user1_activities:
            for u2 in user2_activities:
                overlap_start = max(u1.start_date_time, u2.start_date_time)
                overlap_end = min(u1.end_date_time, u2.end_date_time)

                if overlap_start < overlap_end:
                    if include_time:
                        matches.append((u1.id, u2.id, overlap_start, overlap_end))
                    else:
                        matches.append((u1.id, u2.id))

        overlapping_activities[(user1_id, user2_id)] = matches

    return overlapping_activities

#Returns a dataframe with the trackpoints from the overlapping activities 
def get_matching_trackpoints_from_dict(tp_dict, overlapping_activities):
    """Collect the track points that lie inside the overlap of each activity pair.

    Parameters
    ----------
    tp_dict : dict
        ``{(user_id, activity_id): [[date_time, lat, lon], ...]}``.
    overlapping_activities : dict
        ``{(user1_id, user2_id): [(activity1_id, activity2_id, overlap_start,
        overlap_end), ...]}``; the entries must be 4-tuples.

    Returns
    -------
    DataFrame
        Columns ``user_id``, ``activity_id``, ``date_time``, ``lat``, ``lon``;
        rows repeating an ``(activity_id, date_time)`` combination are dropped
        (the first occurrence is kept, the index is not renumbered).
    """
    column_names = ['user_id', 'activity_id', 'date_time', 'lat', 'lon']
    rows = []

    for (first_user, second_user), matches in overlapping_activities.items():
        for first_activity, second_activity, window_start, window_end in matches:
            # Same treatment for both sides of the pair: first user's points, then the second user's.
            for owner, activity in ((first_user, first_activity), (second_user, second_activity)):
                rows.extend(
                    (owner, activity) + tuple(point)
                    for point in tp_dict.get((owner, activity), [])
                    if window_start <= point[0] <= window_end
                )

    trackpoints = pd.DataFrame(rows, columns=column_names)
    return trackpoints.drop_duplicates(subset=['activity_id', 'date_time'])

### Find distance between boundary-boxes 

The functions below build a bounding box for each of the given activities and calculate the distance between the boxes. Activity pairs whose boxes are 50 meters or more apart are excluded.

In [141]:
#Finds the boundary boxes defined as the combined min/max combinations of lat and lon
import numpy as np

def find_activity_bounding_box(df):
    """Compute the lat/lon bounding box of every activity.

    Parameters
    ----------
    df : DataFrame
        Track points with the columns ``user_id``, ``activity_id``, ``lat``
        and ``lon``.

    Returns
    -------
    DataFrame
        One row per ``(user_id, activity_id)`` with the min/max of ``lat`` and
        ``lon`` (two-level column labels such as ``('lat', 'min')``) and a
        ``bounding_box`` column holding the four ``(lat, lon)`` corners.

    Notes
    -----
    The corners are listed as a closed walk around the rectangle:
    (min, min) -> (min, max) -> (max, max) -> (max, min). Listing both
    "min lat" corners and then both "max lat" corners makes the two diagonals
    cross, which is a self-intersecting (invalid) ring when the list is handed
    to ``shapely.geometry.Polygon``. The first corner is still (min lat,
    min lon).
    """
    # groupby/agg build a new frame and never modify ``df``, so no defensive
    # copy of the (potentially very large) input is needed.
    new_df = df.groupby(['user_id', 'activity_id'])[['lat', 'lon']].agg(['min', 'max']).reset_index()

    def calculate_boundary_box(row):
        lat_min, lat_max = row[('lat', 'min')], row[('lat', 'max')]
        lon_min, lon_max = row[('lon', 'min')], row[('lon', 'max')]
        return [
            (lat_min, lon_min),
            (lat_min, lon_max),
            (lat_max, lon_max),
            (lat_max, lon_min),
        ]

    new_df['bounding_box'] = new_df.apply(calculate_boundary_box, axis=1)

    return new_df

# Calculate the haversine (great-circle) distance between two points, in meters
def haversine_distance(point1, point2):
    """Return the haversine distance between ``point1`` and ``point2`` in meters.

    Both points are handed unchanged to ``haversine.haversine``, which expects
    ``(latitude, longitude)`` tuples in decimal degrees.
    """
    distance_in_meters = haversine(point1, point2, unit=Unit.METERS)
    return distance_in_meters

def calculate_center_and_radius(bounding_box):
    """Describe a bounding box by its centre and a radius.

    ``bounding_box`` is a sequence of corner coordinates. Returns
    ``(center, radius)``, where ``center`` is the centroid of the polygon
    built from the corners (a shapely point) and ``radius`` is the haversine
    distance in meters from that centroid to the first corner.
    """
    center = Polygon(bounding_box).centroid
    first_corner = bounding_box[0]
    radius = haversine_distance(center.coords[0], first_corner)
    return center, radius

def calculate_distance_from_centers(center1, radius1, center2, radius2):
    """Return the gap in meters between two circles.

    The gap is the haversine distance between the two centres minus both
    radii; it is negative when the circles intersect.
    """
    center_gap = haversine_distance(center1.coords[0], center2.coords[0])
    return center_gap - radius1 - radius2

def calculate_distance_two_boxes(box_df, overlapping_activities_dict):
    """Drop the activity pairs whose bounding boxes are at least 50 meters apart.

    Parameters
    ----------
    box_df : DataFrame
        Needs the columns ``activity_id`` and ``bounding_box`` (the corner
        list of each activity).
    overlapping_activities_dict : dict
        ``{user_pair: [entry, ...]}`` where the first two items of an entry
        are the two activity ids. Entries with extra trailing items (such as
        overlap times) are accepted and returned unchanged.

    Returns
    -------
    dict
        A new dict with the same keys and new lists holding only the entries
        that were kept. The input dict and its lists are not modified.

    Notes
    -----
    The filter is best effort: when the distance of a pair cannot be computed
    (an activity has no box, or the geometry code raises) the pair is kept
    rather than dropped.
    """
    # One pass over the frame instead of a full boolean scan per lookup;
    # setdefault keeps the first box of an activity id.
    boxes = {}
    for activity_id, box in zip(box_df["activity_id"], box_df["bounding_box"]):
        boxes.setdefault(activity_id, box)

    # An activity usually takes part in many pairs, so remember its (center, radius).
    circles = {}

    def circle_for(activity_id):
        if activity_id not in circles:
            circles[activity_id] = calculate_center_and_radius(boxes[activity_id])
        return circles[activity_id]

    filtered = {}
    for user_pair, activity_list in overlapping_activities_dict.items():
        kept = []
        for activity_info in activity_list:
            try:
                activity_id1, activity_id2 = activity_info[:2]
                center1, rad1 = circle_for(activity_id1)
                center2, rad2 = circle_for(activity_id2)
                distance = calculate_distance_from_centers(center1, rad1, center2, rad2)
            except Exception:
                # Deliberately broad (but no longer a bare ``except``, so
                # KeyboardInterrupt/SystemExit still propagate): a pair that
                # cannot be measured stays in the result.
                kept.append(activity_info)
                continue

            if distance >= 50:
                continue
            kept.append(activity_info)

        filtered[user_pair] = kept

    return filtered

In [142]:
def get_overlapping_activites(activity_df, include_date_time):
    """Run the time-overlap pipeline on the activity table.

    Derives each user's first/last registered time, intersects those spans
    for every user pair and then looks up the activities of each pair that
    overlap in time.

    Parameters
    ----------
    activity_df : DataFrame
        Activities with the columns ``user_id``, ``id``, ``start_date_time``
        and ``end_date_time``.
    include_date_time : bool
        Forwarded to ``find_overlapping_activities`` as ``include_time``;
        when true the result entries also carry the overlap start and end.

    Returns
    -------
    dict
        The dict produced by ``find_overlapping_activities``.
    """
    # The flag is simply forwarded, so no if/else is needed; none of the
    # helpers modifies ``activity_df``, so it is passed on without copying.
    first_last_registration = create_user_first_last_times(activity_df)
    user_pair_overlap_times = find_overlap_time_intervals(first_last_registration)
    return find_overlapping_activities(user_pair_overlap_times, activity_df, include_date_time)

In [143]:
def get_overlapping_activities_based_on_distance(tp_overlap_activities, overlapping_activities_dict):
    """Filter time-overlapping activity pairs by bounding-box distance.

    Builds a bounding box per activity from the track points in
    ``tp_overlap_activities`` and returns the result of
    ``calculate_distance_two_boxes`` for ``overlapping_activities_dict``.
    """
    activity_boxes = find_activity_bounding_box(tp_overlap_activities)
    return calculate_distance_two_boxes(activity_boxes, overlapping_activities_dict)
    

In [144]:
# Time-overlapping activity pairs per user pair, including the overlap start/end.
overlapping_activity_with_date_time = get_overlapping_activites(activity_df, include_date_time=True)

In [145]:
# The same activity pairs, as plain (activity1_id, activity2_id) tuples.
overlapping_activities_only = get_overlapping_activites(activity_df, include_date_time=False)

In [146]:
# Track points of the overlapping activities, limited to each pair's overlap window.
tp_from_overlapping_activities = get_matching_trackpoints_from_dict(
    tp_dict=tp_dict,
    overlapping_activities=overlapping_activity_with_date_time,
)


In [147]:
# Inspect the track points that fall inside the overlap windows.
tp_from_overlapping_activities

Unnamed: 0,user_id,activity_id,date_time,lat,lon
0,000,20081023025304000,2008-10-23 09:42:30,40.004783,116.320388
1,000,20081023025304000,2008-10-23 09:42:35,40.004799,116.320545
2,000,20081023025304000,2008-10-23 09:42:40,40.004835,116.320683
3,000,20081023025304000,2008-10-23 09:42:45,40.004851,116.320835
4,000,20081023025304000,2008-10-23 09:42:50,40.004868,116.321010
...,...,...,...,...,...
20563439,174,20071211055512174,2007-12-11 06:08:17,39.971633,116.304733
20563486,176,20071208013029176,2007-12-08 03:15:01,39.971100,116.304933
20563487,176,20071208013029176,2007-12-08 03:15:54,39.970817,116.304250
20563488,176,20071208013029176,2007-12-08 03:16:57,39.970867,116.303250


In [148]:
# Keep only the activity pairs whose bounding boxes are less than 50 meters apart.
activity_overlap_dict = get_overlapping_activities_based_on_distance(
    tp_overlap_activities=tp_from_overlapping_activities,
    overlapping_activities_dict=overlapping_activities_only,
)

In [165]:
# Number of activity pairs left after the bounding-box distance filter.
tot_len = sum(len(pairs) for pairs in activity_overlap_dict.values())

tot_len

13333

In [166]:
# Number of activity pairs before the distance filter, for comparison.
tot_len = sum(len(pairs) for pairs in overlapping_activities_only.values())

tot_len

28312

In [175]:
import pandas as pd
import numpy as np
from haversine import haversine_vector, Unit

def match_track_points(activity_id: str, comp_activity_id: str, track_points_df: pd.DataFrame, verbose=False):
    """Tell whether two activities have a pair of track points close in space and time.

    Every track point of ``activity_id`` is compared with every track point of
    ``comp_activity_id``; a pair of points matches when they are at most
    50 meters apart (haversine) and at most 30 seconds apart.

    Parameters
    ----------
    activity_id, comp_activity_id : str
        Ids of the two activities to compare.
    track_points_df : DataFrame
        Track points with the columns ``activity_id``, ``date_time``, ``lat``
        and ``lon``.
    verbose : bool, default False
        When true, print the timestamps of every point pair that satisfies the
        distance condition and of every pair that satisfies the time condition.

    Returns
    -------
    bool
        ``False`` when either activity has no track points; otherwise whether
        at least one point pair satisfies both conditions.
    """
    own_points = track_points_df.loc[track_points_df["activity_id"] == activity_id]
    other_points = track_points_df.loc[track_points_df["activity_id"] == comp_activity_id]

    if own_points.empty or other_points.empty:
        return False

    # Joining on a constant key pairs every point of one activity with every
    # point of the other (cartesian product); columns get the _x / _y suffixes.
    point_pairs = pd.merge(own_points.assign(key=0), other_points.assign(key=0), on='key')
    point_pairs = point_pairs.drop('key', axis=1)

    own_coords = point_pairs[['lat_x', 'lon_x']].values
    other_coords = point_pairs[['lat_y', 'lon_y']].values
    meters_apart = haversine_vector(own_coords, other_coords, unit=Unit.METERS)

    seconds_apart = (point_pairs['date_time_x'] - point_pairs['date_time_y']).abs().dt.total_seconds()

    distance_condition = meters_apart <= 50
    time_condition = seconds_apart <= 30

    if verbose:
        reports = ((distance_condition, "Distance Matched"), (time_condition, "Time Matched"))
        for condition, label in reports:
            satisfied = point_pairs.loc[condition, ['date_time_x', 'date_time_y']]
            for _, row in satisfied.iterrows():
                print(row['date_time_x'], row['date_time_y'], label)

    # True when at least one point pair is close in both space and time.
    return np.any(distance_condition & time_condition)

In [176]:
# Only user pairs that still have at least one candidate activity pair.
final_dict = {
    user_pair: activity_pairs
    for user_pair, activity_pairs in activity_overlap_dict.items()
    if len(activity_pairs) > 0
}

In [177]:
from itertools import islice
import concurrent.futures

# Filled by process_item: user pair -> the first activity pair for which
# match_track_points reported a match.
MATCHED_PAIRS = {}

def process_item(item):
    """Look for the first matching activity pair of one user pair.

    ``item`` is a key of the global ``final_dict`` (a user pair). Its
    candidate activity pairs are checked in order with ``match_track_points``
    against the global ``tp_df``; the first pair that matches is stored in the
    global ``MATCHED_PAIRS`` under ``item`` and the search stops. Nothing is
    stored when no pair matches.
    """
    for candidate in final_dict[item]:
        first_activity, second_activity = candidate
        if match_track_points(first_activity, second_activity, tp_df):
            MATCHED_PAIRS[item] = candidate
            return

# Check every user pair (the keys of final_dict) on a thread pool. Consuming
# the map result waits for all tasks and re-raises any worker exception.
with concurrent.futures.ThreadPoolExecutor() as pool:
    worker_results = pool.map(process_item, final_dict)
    list(worker_results)

In [178]:
# Show the matched user pairs together with the activity pair that matched.
MATCHED_PAIRS

{('000', '004'): ('20081029093038000', '20081029052125004'),
 ('000', '015'): ('20081112023003000', '20081112000936015'),
 ('000', '030'): ('20090412004905000', '20090412004858030'),
 ('000', '005'): ('20081115010133000', '20081115010304005'),
 ('000', '003'): ('20090330005208000', '20090330005208003'),
 ('001', '004'): ('20081111234235001', '20081112063539004'),
 ('001', '014'): ('20081029234123001', '20081029234245014'),
 ('001', '015'): ('20081111234235001', '20081112000936015'),
 ('001', '084'): ('20081104054859001', '20081104102003084'),
 ('001', '128'): ('20081114130723001', '20081114131233128'),
 ('003', '004'): ('20081023175854003', '20081023175852004'),
 ('003', '005'): ('20081027041826003', '20081027092607005'),
 ('003', '028'): ('20090210022307003', '20090210091805028'),
 ('003', '030'): ('20090208020837003', '20090208020851030'),
 ('003', '015'): ('20090303004756003', '20090303004804015'),
 ('003', '096'): ('20081115014037003', '20081115015705096'),
 ('003', '126'): ('20090