In [1]:
# Required libraries
import itertools
import os
from datetime import timedelta
from urllib.parse import quote_plus

import pandas as pd
from haversine import haversine, Unit
from shapely.geometry import Polygon
from sqlalchemy import create_engine


In [2]:
def query_to_dataframe(connection_string, query):
    """
    Query a MySQL database using SQLAlchemy and return the result as a pandas DataFrame.

    Parameters:
    - connection_string (str): The connection string for the database.
    - query (str): The SQL query to be executed.

    Returns:
    - df (pd.DataFrame): The result of the query as a pandas DataFrame.
    """
    # A fresh engine (with its own connection pool) is created on every call,
    # so it is disposed again before returning to avoid leaking pooled connections.
    engine = create_engine(connection_string)
    try:
        # The connection is closed by the context manager even if the query fails.
        with engine.connect() as connection:
            return pd.read_sql(query, connection)
    finally:
        engine.dispose()

In [3]:
# The database password must not be stored in the notebook: it is read from the
# environment instead. Export TDT4225_DB_PASSWORD (and optionally TDT4225_DB_USER)
# before starting the kernel; with no password set the login is attempted with an
# empty password and will be rejected by the server.
DB_USER = os.environ.get("TDT4225_DB_USER", "arasham")
DB_PASSWORD = os.environ.get("TDT4225_DB_PASSWORD", "")

# quote_plus escapes characters such as "?" or "@" that would otherwise be
# interpreted as URL syntax inside the connection string.
connection_string = (
    f"mysql+mysqlconnector://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    "@tdt4225-10.idi.ntnu.no:3306/default_db"
)

In [5]:
# Task 8

# Simple query to get every track point together with the user that owns its activity
# (activity is joined to track_point on activity.id = track_point.activity_id).
sql_query = """

    SELECT a.user_id, tp.activity_id, tp.date_time, tp.lat, tp.lon
    FROM activity a JOIN track_point tp ON a.id = tp.activity_id
"""

# Loads the full join result into memory as one DataFrame.
tp_df = query_to_dataframe(connection_string, sql_query)
tp_df

Unnamed: 0,user_id,activity_id,date_time,lat,lon
0,000,20081023025304000,2008-10-23 02:53:10,39.984683,116.318450
1,000,20081023025304000,2008-10-23 02:53:15,39.984686,116.318417
2,000,20081023025304000,2008-10-23 02:53:20,39.984688,116.318385
3,000,20081023025304000,2008-10-23 02:53:25,39.984655,116.318263
4,000,20081023025304000,2008-10-23 02:53:30,39.984611,116.318026
...,...,...,...,...,...
9555432,181,20080314025755181,2008-03-14 03:39:56,40.914867,111.710500
9555433,181,20080314025755181,2008-03-14 03:41:17,40.914267,111.710333
9555434,181,20080314025755181,2008-03-14 03:43:02,40.912467,111.710667
9555435,181,20080314025755181,2008-03-14 03:43:28,40.911517,111.711317


In [6]:
# Index the track points by owner and activity:
# {(user_id, activity_id): [[date_time, lat, lon], ...]}
tp_dict = {
    user_and_activity: points[['date_time', 'lat', 'lon']].values.tolist()
    for user_and_activity, points in tp_df.groupby(['user_id', 'activity_id'])
}


In [7]:
# Fetch the owner, id and start/end timestamps of every activity.
sql_query = """

    SELECT a.user_id, a.id, a.start_date_time, a.end_date_time
    FROM activity a 
"""

activity_df = query_to_dataframe(connection_string, sql_query)
activity_df

Unnamed: 0,user_id,id,start_date_time,end_date_time
0,163,20000101231219163,2000-01-01 23:12:19,2000-01-01 23:15:23
1,142,20070412093132142,2007-04-12 09:31:32,2007-04-12 11:33:40
2,161,20070412101853161,2007-04-12 10:18:53,2007-04-12 10:23:15
3,163,20070412102116163,2007-04-12 10:21:16,2007-04-12 14:56:56
4,161,20070412102325161,2007-04-12 10:23:25,2007-04-12 10:26:25
...,...,...,...,...
16043,163,20120724204557163,2012-07-24 20:45:57,2012-07-24 22:01:15
16044,163,20120725080203163,2012-07-25 08:02:03,2012-07-25 08:26:13
16045,163,20120725190300163,2012-07-25 19:03:00,2012-07-25 19:55:08
16046,163,20120726080106163,2012-07-26 08:01:06,2012-07-26 08:27:24


### Find overlapping time-intervals 

The cells below define the functions that extract the activities and track points whose time intervals overlap, including a buffer of 30 seconds on each side of the interval.

In [8]:

# Extract the first and last registration time of each user
def create_user_first_last_times(activity_df):
    """
    Summarise the time span covered by each user's activities.

    Parameters:
    - activity_df (pd.DataFrame): Needs the columns 'user_id',
      'start_date_time' and 'end_date_time'.

    Returns:
    - pd.DataFrame: One row per user with the columns 'user_id',
      'first_registered_time' (smallest start_date_time) and
      'last_registered_time' (largest end_date_time).
    """
    grouped_by_user = activity_df.groupby('user_id')
    time_span_per_user = grouped_by_user.agg(
        first_registered_time=('start_date_time', 'min'),
        last_registered_time=('end_date_time', 'max'),
    )
    return time_span_per_user.reset_index()

# Finds the overlapping time interval for each unique pair of users, based on their first/last registered times
def find_overlap_time_intervals(user_first_last_times):
    """
    Compute, for every pair of users, the interval in which both were registering data.

    Parameters:
    - user_first_last_times (pd.DataFrame): Needs the columns 'user_id',
      'first_registered_time' and 'last_registered_time'.

    Returns:
    - dict: {(user1_id, user2_id): (overlap_start, overlap_end)} containing only
      the pairs whose spans strictly overlap (overlap_start < overlap_end).
    """
    unique_user_ids = user_first_last_times['user_id'].unique()

    # Build the per-user lookup once instead of filtering the whole frame twice
    # for every pair. setdefault keeps the first row of a user if ids repeat.
    time_span_by_user = {}
    for user_id, first_time, last_time in zip(
        user_first_last_times['user_id'],
        user_first_last_times['first_registered_time'],
        user_first_last_times['last_registered_time'],
    ):
        time_span_by_user.setdefault(user_id, (first_time, last_time))

    overlaps = {}
    for user1_id, user2_id in itertools.combinations(unique_user_ids, 2):
        first1, last1 = time_span_by_user[user1_id]
        first2, last2 = time_span_by_user[user2_id]

        overlap_start = max(first1, first2)
        overlap_end = min(last1, last2)
        if overlap_start < overlap_end:
            overlaps[(user1_id, user2_id)] = (overlap_start, overlap_end)

    return overlaps

# Find the activities that overlap between a user pair and return them as a dictionary
def find_overlapping_activities(overlapping_user_time_intervals, activity_df, include_time=False, buffer_seconds=30):
    """
    Find, for every user pair, the pairs of activities whose time intervals overlap.

    Parameters:
    - overlapping_user_time_intervals (dict): {(user1_id, user2_id): (overlap_start, overlap_end)}.
    - activity_df (pd.DataFrame): Needs the columns 'user_id', 'id',
      'start_date_time' and 'end_date_time'.
    - include_time (bool): When truthy, each result tuple also carries the
      overlap start and end of the two activities.
    - buffer_seconds (int | float): Slack added on both sides of the user-pair
      window when pre-selecting candidate activities. Defaults to 30.

    Returns:
    - dict: {(user1_id, user2_id): [(activity_id1, activity_id2), ...]} or, with
      include_time, [(activity_id1, activity_id2, overlap_start, overlap_end), ...].
      Every key of the input is present, possibly with an empty list.

    Note: the buffer only widens the candidate pre-selection; two activities are
    still paired only when their own intervals strictly overlap.
    """
    buffer = timedelta(seconds=buffer_seconds)

    # Split the activities per user once instead of scanning the full frame per pair.
    activities_by_user = {user_id: group for user_id, group in activity_df.groupby('user_id')}
    no_activities = activity_df.iloc[0:0]

    def _activities_in_window(user_id, window_start, window_end):
        """Return the user's activities that intersect the buffered window."""
        user_activities = activities_by_user.get(user_id, no_activities)
        # Interval-intersection test. Unlike checking whether the start or the end
        # falls inside the window, this also keeps activities that span the whole window.
        intersects_window = (
            (user_activities['start_date_time'] <= window_end + buffer)
            & (user_activities['end_date_time'] >= window_start - buffer)
        )
        return list(user_activities[intersects_window].itertuples())

    overlapping_activities = {}

    for (user1_id, user2_id), (window_start, window_end) in overlapping_user_time_intervals.items():
        user1_candidates = _activities_in_window(user1_id, window_start, window_end)
        user2_candidates = _activities_in_window(user2_id, window_start, window_end)

        matches = []
        for u1 in user1_candidates:
            for u2 in user2_candidates:
                pair_start = max(u1.start_date_time, u2.start_date_time)
                pair_end = min(u1.end_date_time, u2.end_date_time)

                if pair_start < pair_end:
                    if include_time:
                        matches.append((u1.id, u2.id, pair_start, pair_end))
                    else:
                        matches.append((u1.id, u2.id))

        overlapping_activities[(user1_id, user2_id)] = matches

    return overlapping_activities

# Returns a dataframe with the trackpoints from the overlapping activities
def get_matching_trackpoints_from_dict(tp_dict, overlapping_activities):
    """
    Collect the track points that fall inside the overlap window of each activity pair.

    Parameters:
    - tp_dict (dict): {(user_id, activity_id): [[date_time, lat, lon], ...]}.
    - overlapping_activities (dict): {(user_id_1, user_id_2):
      [(activity_id1, activity_id2, overlap_start, overlap_end), ...]}.
      The 4-tuple form (with times) is required.

    Returns:
    - pd.DataFrame: Columns 'user_id', 'activity_id', 'date_time', 'lat', 'lon',
      with one row per distinct (activity_id, date_time); the first occurrence wins.
      The index is not contiguous: each row keeps the position at which it was
      first encountered, counting the skipped duplicates.
    """
    columns = ['user_id', 'activity_id', 'date_time', 'lat', 'lon']

    kept_rows = []
    kept_positions = []
    # The same activity shows up in many pairs, so duplicates are filtered while
    # collecting instead of materialising all of them and dropping them afterwards.
    seen_keys = set()
    position = 0

    for (user_id_1, user_id_2), activity_list in overlapping_activities.items():
        for activity_id1, activity_id2, overlap_start, overlap_end in activity_list:
            for user_id, activity_id in ((user_id_1, activity_id1), (user_id_2, activity_id2)):
                for trackpoint in tp_dict.get((user_id, activity_id), []):
                    if trackpoint[0] >= overlap_start and trackpoint[0] <= overlap_end:
                        key = (activity_id, trackpoint[0])
                        if key not in seen_keys:
                            seen_keys.add(key)
                            kept_rows.append((user_id, activity_id) + tuple(trackpoint))
                            kept_positions.append(position)
                        position += 1

    if not kept_rows:
        return pd.DataFrame(kept_rows, columns=columns)
    return pd.DataFrame(kept_rows, columns=columns, index=kept_positions)

### Find distance between boundary-boxes 

The functions below build a bounding box for each of the given activities and calculate the distance between the boxes. Activity pairs whose boxes are 50 meters or more apart are excluded.

In [38]:
#Finds the boundary boxes defined as the combined min/max combinations of lat and lon
# def find_activity_bounding_box(df):
#     bounding_boxes = {}

#     new_df = df[["user_id", "activity_id"]].drop_duplicates()
    
#     for data in new_df.itertuples():
#         user_id = data.user_id
#         activity_id = data.activity_id
    
#         activity_tp = df[df['activity_id'] == activity_id]

#         min_lat = activity_tp['lat'].min()
#         max_lat = activity_tp['lat'].max()
#         min_lon = activity_tp['lon'].min()
#         max_lon = activity_tp['lon'].max()

#         bounding_boxes[activity_id] = {
#             "user_id": user_id,
#             'bounding_box': [(min_lat, min_lon), (min_lat, max_lon), (max_lat, min_lon), (max_lat, max_lon)]
#         }

#     bounding_boxes_df = pd.DataFrame(bounding_boxes).transpose().reset_index()
#     bounding_boxes_df.rename(columns={'index': 'activity_id'}, inplace=True)

#     return bounding_boxes_df

def calculate_boundary_box(row):
    """
    Build the four corners of an activity's bounding box.

    Parameters:
    - row: Mapping-like object (e.g. a DataFrame row) indexed by the tuples
      ('lat', 'min'), ('lat', 'max'), ('lon', 'min') and ('lon', 'max').

    Returns:
    - list[tuple]: Four (lat, lon) corners, starting at (min lat, min lon).
      The corners are listed in ring order so that they describe a proper
      rectangle; a zig-zag order would form a self-intersecting (bow-tie)
      polygon when passed to shapely's Polygon.
    """
    min_lat, max_lat = row[('lat', 'min')], row[('lat', 'max')]
    min_lon, max_lon = row[('lon', 'min')], row[('lon', 'max')]
    return [
        (min_lat, min_lon),
        (min_lat, max_lon),
        (max_lat, max_lon),
        (max_lat, min_lon),
    ]

def find_activity_bounding_box(df):
    """
    Compute the latitude/longitude bounding box of every activity.

    Parameters:
    - df (pd.DataFrame): Track points with the columns 'user_id',
      'activity_id', 'lat' and 'lon'. The frame is not modified.

    Returns:
    - pd.DataFrame: One row per (user_id, activity_id) with the min/max of
      'lat' and 'lon' (two-level column labels such as ('lat', 'min')) and a
      'bounding_box' column holding the corner list built by
      calculate_boundary_box.
    """
    # groupby/agg never mutates its input, so no defensive copy of the
    # (potentially very large) track point frame is needed.
    extremes = df.groupby(['user_id', 'activity_id'])[['lat', 'lon']].agg(['min', 'max']).reset_index()

    # Plain dict records keep the same tuple keys that calculate_boundary_box
    # reads, without the cost of building a Series per row via apply(axis=1).
    boxes = [calculate_boundary_box(record) for record in extremes.to_dict('records')]
    extremes['bounding_box'] = pd.Series(boxes, index=extremes.index, dtype=object)
    return extremes
   

# Calculates the haversine distance between two points, in meters
def haversine_distance(point1, point2):
    """
    Return the haversine distance in meters between point1 and point2.

    Both points are forwarded unchanged to haversine(), which is called with
    unit=Unit.METERS.
    """
    distance_in_meters = haversine(point1, point2, unit=Unit.METERS)
    return distance_in_meters

def calculate_center_and_radius(bounding_box):
    """
    Describe a bounding box by its centroid and a radius.

    Parameters:
    - bounding_box (list[tuple]): Corner points of the box.

    Returns:
    - tuple: (center, radius) where center is the centroid of the Polygon built
      from the corners and radius is the haversine distance in meters from that
      centroid to the first corner.
    """
    center = Polygon(bounding_box).centroid
    first_corner = bounding_box[0]
    center_coordinates = center.coords[0]
    return center, haversine_distance(center_coordinates, first_corner)

def calculate_distance_from_centers(center1, radius1, center2, radius2):
    """
    Return the center-to-center haversine distance minus both radii.

    The result is negative when the two circles (center, radius) overlap.
    """
    center_to_center = haversine_distance(center1.coords[0], center2.coords[0])
    return center_to_center - radius1 - radius2

def calculate_distance_two_boxes(box_df, overlapping_activities_dict, max_distance_m=50):
    """
    Drop the activity pairs whose bounding boxes are too far apart.

    Parameters:
    - box_df (pd.DataFrame): Needs the columns 'activity_id' and 'bounding_box'.
    - overlapping_activities_dict (dict): {(user_id_1, user_id_2): [activity_info, ...]}
      where the first two items of each activity_info are the two activity ids
      (any further items, such as overlap times, are carried along untouched).
    - max_distance_m (int | float): Pairs whose distance is greater than or
      equal to this value are removed. Defaults to 50.

    Returns:
    - dict: A new dict with the same keys and the filtered activity lists. The
      input dict and its lists are not modified.

    Pairs for which no distance can be determined (an activity without a
    bounding box, or a geometry error) are kept, so the filter is best-effort.
    """
    # One pass over box_df instead of a boolean-mask scan per activity pair.
    # setdefault keeps the first box if an activity id appears more than once.
    box_by_activity = {}
    for activity_id, bounding_box in zip(box_df["activity_id"], box_df["bounding_box"]):
        box_by_activity.setdefault(activity_id, bounding_box)

    # An activity takes part in many pairs; compute its (center, radius) once.
    circle_by_activity = {}

    def _circle_for(activity_id):
        """Return (center, radius) for the activity, or None when unavailable."""
        if activity_id not in circle_by_activity:
            bounding_box = box_by_activity.get(activity_id)
            circle = None
            if bounding_box is not None:
                try:
                    circle = calculate_center_and_radius(bounding_box)
                except Exception:
                    # Best-effort: an unusable box means the pair is kept.
                    # Unlike a bare except, KeyboardInterrupt still propagates.
                    circle = None
            circle_by_activity[activity_id] = circle
        return circle_by_activity[activity_id]

    def _is_too_far(activity_id1, activity_id2):
        """True only when a distance is available and it reaches the limit."""
        circle1 = _circle_for(activity_id1)
        circle2 = _circle_for(activity_id2)
        if circle1 is None or circle2 is None:
            return False
        try:
            distance = calculate_distance_from_centers(circle1[0], circle1[1], circle2[0], circle2[1])
        except Exception:
            return False
        return distance >= max_distance_m

    filtered = {}
    for user_pair, activity_list in overlapping_activities_dict.items():
        filtered[user_pair] = [
            activity_info
            for activity_info in activity_list
            if not _is_too_far(activity_info[0], activity_info[1])
        ]

    return filtered

# def calculate_shortest_distances(box_df, activity_list):
#     distances_less_than_50 = {}
#     centers = []
#     radii = []

#     for i in range(len(box_df)):
#         bounding_box = box_df.iloc[i]['bounding_box']
#         center, radius = calculate_center_and_radius(bounding_box)
#         centers.append(center)
#         radii.append(radius)

#     for i in range(len(box_df)):
#         for j in range(i + 1, len(box_df)):
#             user_id_1 = box_df.iloc[i]['user_id']
#             user_id_2 = box_df.iloc[j]['user_id']
#             activity_id_1 = box_df.iloc[i]['activity_id']
#             activity_id_2 = box_df.iloc[j]['activity_id']

#             # Check for different users and different activity IDs
#             if user_id_1 != user_id_2 and activity_id_1 != activity_id_2:
#                 distance = calculate_distance_from_centers(centers[i], radii[i], centers[j], radii[j])
#                 if distance <= 50:
#                     distances_less_than_50[(user_id_1, activity_id_1, user_id_2, activity_id_2)] = distance

#     return distances_less_than_50


In [10]:
def get_overlapping_activites(activity_df, include_date_time):
    """
    Run the time-overlap pipeline on the activity table.

    Parameters:
    - activity_df (pd.DataFrame): Needs the columns 'user_id', 'id',
      'start_date_time' and 'end_date_time'.
    - include_date_time (bool): Forwarded to find_overlapping_activities; when
      true, each result tuple also carries the overlap start and end.

    Returns:
    - dict: {(user1_id, user2_id): [activity pair tuples]}.
    """
    first_last_registration = create_user_first_last_times(activity_df)
    user_pair_overlap_times = find_overlap_time_intervals(first_last_registration)
    # The flag is simply passed through; no branching on it is needed here.
    return find_overlapping_activities(user_pair_overlap_times, activity_df, include_date_time)


    

In [11]:
# def extract_matching_keys(overlap_activities, distance_dict):
#     matching_keys = []
    
#     for user_pair, activity_list in overlap_activities.items():
#         user_id_1, user_id_2 = user_pair

#         for activity_info in activity_list:
#             activity_id1, activity_id2 = activity_info

#             # Check if the keys exist in distance_dict
#             key1 = (user_id_1, activity_id1, user_id_2, activity_id2)
#             key2 = (user_id_2, activity_id2, user_id_1, activity_id1)

#             if key1 in distance_dict:
#                 matching_keys.append(key1)
#             elif key2 in distance_dict:
#                 matching_keys.append(key2)
                

#     return matching_keys


In [31]:
def get_overlapping_activities_based_on_distance(tp_overlap_activities, overlapping_activities_dict):
    """
    Filter time-overlapping activity pairs by the distance between their bounding boxes.

    Builds the bounding boxes from the track points in tp_overlap_activities and
    returns the result of calculate_distance_two_boxes for overlapping_activities_dict.
    """
    return calculate_distance_two_boxes(
        find_activity_bounding_box(tp_overlap_activities),
        overlapping_activities_dict,
    )
    

In [13]:
# Time-overlapping activity pairs per user pair, including the overlap start/end
# (4-tuples: activity_id1, activity_id2, overlap_start, overlap_end).
overlapping_activity_with_date_time = get_overlapping_activites( activity_df, True)

{('000',
  '001'): [('20081023025304000',
   '20081023055305001',
   Timestamp('2008-10-23 05:53:05'),
   Timestamp('2008-10-23 11:11:12')), ('20081024020959000',
   '20081023234104001',
   Timestamp('2008-10-24 02:09:59'),
   Timestamp('2008-10-24 02:47:06')), ('20081027115449000',
   '20081027111634001',
   Timestamp('2008-10-27 11:54:49'),
   Timestamp('2008-10-27 12:05:54')), ('20081103232153000',
   '20081103233729001',
   Timestamp('2008-11-03 23:37:29'),
   Timestamp('2008-11-04 00:33:17')), ('20081111001704000',
   '20081110233534001',
   Timestamp('2008-11-11 00:17:04'),
   Timestamp('2008-11-11 00:17:37')), ('20081112023003000',
   '20081111234235001',
   Timestamp('2008-11-12 02:30:03'),
   Timestamp('2008-11-12 05:09:14')), ('20081112091400000',
   '20081111234235001',
   Timestamp('2008-11-12 09:14:00'),
   Timestamp('2008-11-12 09:16:10')), ('20081113034608000',
   '20081113121334001',
   Timestamp('2008-11-13 12:13:34'),
   Timestamp('2008-11-13 13:47:00')), ('2008111410

In [39]:
# Same pairs without the overlap times (2-tuples: activity_id1, activity_id2).
overlapping_activities_only = get_overlapping_activites(activity_df, False)

In [40]:
# Total number of overlapping activity pairs across all user pairs
tot_len = sum(len(activity_pairs) for activity_pairs in overlapping_activities_only.values())

tot_len

28312

In [41]:
# Track points of the overlapping activities, restricted to each pair's overlap window.
# The 4-tuple (with times) variant of the overlap dict is required here.
tp_from_overlapping_activities = get_matching_trackpoints_from_dict(tp_dict, overlapping_activity_with_date_time)


In [33]:
# Sanity check of the bounding boxes.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the
# run was aborted before it finished.
test_box = find_activity_bounding_box(tp_from_overlapping_activities)
test_box

KeyboardInterrupt: 

In [32]:
# Keep only the activity pairs whose bounding boxes are close to each other.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the
# run was aborted before it finished.
activity_overlap_dict = get_overlapping_activities_based_on_distance(tp_from_overlapping_activities, overlapping_activities_only)
activity_overlap_dict

KeyboardInterrupt: 

In [None]:
def match_track_points(activity_id: str, comp_activity_id: str, track_points_df: pd.DataFrame, verbose=False,
                       max_distance_m=50, max_time_diff_s=30):
    """
    Check whether two activities have a pair of track points close in both time and space.

    Parameters:
    - activity_id (str): Id of the first activity.
    - comp_activity_id (str): Id of the activity to compare against.
    - track_points_df (pd.DataFrame): Needs the columns 'activity_id', 'lat',
      'lon' and 'date_time'.
    - verbose (bool): When true, prints "Distance Matched" / "Time Matched" for
      every compared pair that satisfies the respective criterion.
    - max_distance_m (int | float): Largest haversine distance in meters that
      counts as close. Defaults to 50.
    - max_time_diff_s (int | float): Largest time difference in seconds that
      counts as close. Defaults to 30.

    Returns:
    - bool: True as soon as one pair of track points satisfies both criteria,
      otherwise False (also when either activity has no track points).
    """
    columns = ["lat", "lon", "date_time"]
    points = list(
        track_points_df.loc[track_points_df["activity_id"] == activity_id, columns].itertuples(index=False)
    )
    comp_points = list(
        track_points_df.loc[track_points_df["activity_id"] == comp_activity_id, columns].itertuples(index=False)
    )

    for point in points:
        for comp_point in comp_points:
            time_matched = abs(point[2] - comp_point[2]).total_seconds() <= max_time_diff_s

            # The haversine computation is the expensive part; skip it when the
            # time criterion already fails, unless verbose output needs it.
            if not (time_matched or verbose):
                continue

            distance = haversine(point[0:2], comp_point[0:2], unit=Unit.METERS)
            distance_matched = distance <= max_distance_m

            if verbose:
                if distance_matched:
                    print("Distance Matched")
                if time_matched:
                    print("Time Matched")

            if time_matched and distance_matched:
                return True

    return False

In [None]:

# activity_overlap_dict maps (user_id_1, user_id_2) -> [(activity_id_1, activity_id_2), ...],
# so iterate over its items and compare each candidate pair of activities.
for (user_id_1, user_id_2), activity_pairs in activity_overlap_dict.items():
    for activity_id_1, activity_id_2 in activity_pairs:
        if match_track_points(activity_id_1, activity_id_2, tp_from_overlapping_activities):
            print(f"Matched activity {activity_id_1} with {activity_id_2}")
            #plot_track_points(activity_id_1, activity_id_2, tp_from_overlapping_activities)
            # One match is enough to establish that this user pair has been close.
            break

In [None]:
# Ad-hoc check of two specific users ("012" and "128"): compare every activity of
# the first user against every activity of the second until a match is found.
# NOTE(review): `track_points_df` and `plot_track_points` are not defined anywhere
# in the visible part of this notebook; confirm they exist before running this
# cell on a fresh kernel.
act_q = "SELECT * from activity"
act_df = query_to_dataframe(connection_string, act_q)
user_1_activities = act_df[act_df["user_id"] == "012"]["id"].unique()
user_2_activities = act_df[act_df["user_id"] == "128"]["id"].unique()

for activity_id in user_1_activities:
    for comp_activity_id in user_2_activities:
        print(activity_id, comp_activity_id)
        if match_track_points(activity_id, comp_activity_id, track_points_df):
            print(f"Matched activity {activity_id} with {comp_activity_id}")
            plot_track_points(activity_id, comp_activity_id, track_points_df)
            # Only leaves the inner loop: the next activity of user 012 is still compared.
            break