In [2]:
# Required libraries
from sqlalchemy import create_engine
import pandas as pd
from haversine import haversine, Unit

def query_to_dataframe(connection_string, query):
    """
    Query a MySQL database using SQLAlchemy and return the result as a pandas DataFrame.

    Parameters:
    - connection_string (str): The connection string for the database.
    - query (str): The SQL query to be executed.

    Returns:
    - df (pd.DataFrame): The result of the query as a pandas DataFrame.
    """
    # Create an engine
    engine = create_engine(connection_string)
    
    # Connect to the engine and execute the query
    with engine.connect() as connection:
        df = pd.read_sql(query, connection)
        
    return df




In [3]:
# Example Usage:

# Define the connection string
# Format: "mysql+<driver_name>://<username>:<password>@<host>:<port>/<database_name>"
connection_string = "mysql+mysqlconnector://arasham:Generalfuzn03?@tdt4225-10.idi.ntnu.no:3306/default_db"

In [3]:
sql_query1 = "select * from user"
sql_query2 = "select * from activity"
sql_query3 = "select * from track_point"


In [4]:
df1 = query_to_dataframe(connection_string, sql_query1)
df1

Unnamed: 0,id,has_labels
0,000,0
1,001,0
2,002,0
3,003,0
4,004,0
...,...,...
168,176,0
169,178,0
170,179,1
171,180,0


In [5]:
df2 = query_to_dataframe(connection_string, sql_query2)
df2

Unnamed: 0,id,user_id,transportation_mode,start_date_time,end_date_time
0,20000101231219163,163,,2000-01-01 23:12:19,2000-01-01 23:15:23
1,20070412093132142,142,,2007-04-12 09:31:32,2007-04-12 11:33:40
2,20070412101853161,161,,2007-04-12 10:18:53,2007-04-12 10:23:15
3,20070412102116163,163,bike,2007-04-12 10:21:16,2007-04-12 14:56:56
4,20070412102325161,161,,2007-04-12 10:23:25,2007-04-12 10:26:25
...,...,...,...,...,...
16043,20120724204557163,163,,2012-07-24 20:45:57,2012-07-24 22:01:15
16044,20120725080203163,163,,2012-07-25 08:02:03,2012-07-25 08:26:13
16045,20120725190300163,163,,2012-07-25 19:03:00,2012-07-25 19:55:08
16046,20120726080106163,163,,2012-07-26 08:01:06,2012-07-26 08:27:24


In [6]:
df2["transportation_mode"].unique()

array([None, 'bike', 'taxi', 'walk', 'car', 'bus', 'subway', 'run',
       'train', 'airplane', 'boat'], dtype=object)

In [7]:
df3 = query_to_dataframe(connection_string, sql_query3)
df3

Unnamed: 0,id,activity_id,lat,lon,altitude,date_days,date_time
0,1,20090412073303000,40.000017,116.327479,105,39915.314618,2009-04-12 07:33:03
1,2,20090412073303000,40.000168,116.327474,80,39915.314688,2009-04-12 07:33:09
2,3,20090412073303000,40.000055,116.327454,99,39915.314745,2009-04-12 07:33:14
3,4,20090412073303000,40.000021,116.327407,109,39915.314803,2009-04-12 07:33:19
4,5,20090412073303000,40.000035,116.327281,111,39915.314861,2009-04-12 07:33:24
...,...,...,...,...,...,...,...
9681751,9681752,20071208010931181,39.866483,116.415383,249,39424.298426,2007-12-08 07:09:44
9681752,9681753,20071208010931181,39.866033,116.415683,174,39424.299537,2007-12-08 07:11:20
9681753,9681754,20071208010931181,39.865550,116.415733,187,39424.300486,2007-12-08 07:12:42
9681754,9681755,20071208010931181,39.864883,116.415750,226,39424.301412,2007-12-08 07:14:02


In [8]:
#Task 1: 

sql_query1 = """
    SELECT COUNT(*) AS num_users
    FROM user;
"""

sql_query2 = """
    SELECT COUNT(*) AS num_activities 
    FROM activity;
"""

sql_query3 = """ 
    SELECT COUNT(*) AS num_trackpoints
    FROM track_point;
"""

df = query_to_dataframe(connection_string, sql_query3)
df





Unnamed: 0,num_trackpoints
0,9681756


In [9]:
# Task 2
sql_query = """
    WITH user_tp_count AS (
        SELECT user_id, count(*) AS tp_count
        FROM track_point tp JOIN activity a ON a.id = tp.activity_id
        JOIN user u ON a.user_id = u.id
        GROUP BY user_id
    )
    SELECT MIN(tp_count), MAX(tp_count), AVG(tp_count) FROM user_tp_count;
    """

# Get the data as a DataFrame
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,MIN(tp_count),MAX(tp_count),AVG(tp_count)
0,17,1010325,55963.9075


In [10]:
# Task 3

sql_query = """ 
    SELECT user_id, count(a.id) AS activity_count
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY activity_count DESC
    LIMIT 15;
"""
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,activity_count
0,128,2102
1,153,1793
2,25,715
3,163,704
4,62,691
5,144,563
6,41,399
7,85,364
8,4,346
9,140,345


In [11]:
#Task 4

sql_query = """ 
    SELECT user_id, transportation_mode  
    FROM user u JOIN activity a ON a.user_id = u.id
    WHERE a.transportation_mode = "bus"; 
"""
df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,transportation_mode
0,091,bus
1,175,bus
2,091,bus
3,092,bus
4,010,bus
...,...,...
194,085,bus
195,085,bus
196,085,bus
197,085,bus


In [12]:
#Task 5

#This is including those that have transportation_mode = NONE
sql_query = """ 
    SELECT user_id, COUNT(DISTINCT a.transportation_mode) AS num_of_different_transport_mode
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY num_of_different_transport_mode DESC
    LIMIT 10; 
"""
df = query_to_dataframe(connection_string, sql_query)
df


Unnamed: 0,user_id,num_of_different_transport_mode
0,128,9
1,62,7
2,85,4
3,84,3
4,58,3
5,163,3
6,78,3
7,81,3
8,112,3
9,65,2


In [13]:
#Task 5 ALternative way
sql_query1 = """ 
    SELECT transportation_mode 
    FROM activity
"""
activity_df = query_to_dataframe(connection_string, sql_query1)
trans_mode_list = activity_df["transportation_mode"].unique()

select_statement = ", ".join([f"MAX(CASE WHEN transportation_mode = '{mode}' THEN 1 ELSE 0 END) AS {mode}" for mode in trans_mode_list])

sql_query = f"""
    SELECT
        user_id, 
        {select_statement},
        COUNT(DISTINCT a.transportation_mode) as num_of_different_transport_mode
    FROM user u JOIN activity a ON a.user_id = u.id
    GROUP BY u.id
    ORDER BY num_of_different_transport_mode DESC
    LIMIT 10; 
"""
df = query_to_dataframe(connection_string, sql_query)
df


Unnamed: 0,user_id,None,bike,taxi,walk,car,bus,subway,run,train,airplane,boat,num_of_different_transport_mode
0,128,0,1,1,1,1,1,1,0,1,1,1,9
1,62,0,1,1,1,1,1,0,1,1,0,0,7
2,85,0,0,1,1,0,1,1,0,0,0,0,4
3,84,0,0,0,1,0,1,1,0,0,0,0,3
4,58,0,0,1,1,1,0,0,0,0,0,0,3
5,163,0,1,1,1,0,0,0,0,0,0,0,3
6,78,0,0,1,1,0,0,1,0,0,0,0,3
7,81,0,1,0,1,0,1,0,0,0,0,0,3
8,112,0,1,0,1,0,1,0,0,0,0,0,3
9,65,0,1,0,1,0,0,0,0,0,0,0,2


In [14]:
#Task 6

sql_query = """ 

    SELECT id, COUNT(*) AS activity_count
    FROM activity 
    GROUP BY id
    HAVING COUNT(*) > 1 
    ORDER BY activity_count DESC;
"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,id,activity_count


In [15]:
#Task 7a 

sql_query = """

    SELECT COUNT(DISTINCT user_id) as number_of_users
    FROM activity a 
    WHERE DATE(a.start_date_time) != DATE(a.end_date_time) 
    AND TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time) >= 1

"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,number_of_users
0,5


In [16]:
#Task 7b 

sql_query = """

    SELECT a.transportation_mode, user_id, 
    CONCAT(
        TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time), ' days ',
        HOUR(TIMEDIFF(a.end_date_time, a.start_date_time)), ' hours ',
        MINUTE(TIMEDIFF(a.end_date_time, a.start_date_time)), ' minutes ',
        SECOND(TIMEDIFF(a.end_date_time, a.start_date_time)), ' seconds'
    ) AS Duration
    FROM activity a
    WHERE DATE(a.start_date_time) != DATE(a.end_date_time)
    AND TIMESTAMPDIFF(DAY, a.start_date_time, a.end_date_time) >= 1

"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,transportation_mode,user_id,Duration
0,,99,1 days 28 hours 49 minutes 19 seconds
1,,51,1 days 29 hours 1 minutes 3 seconds
2,,144,1 days 28 hours 28 minutes 34 seconds
3,,144,1 days 26 hours 28 minutes 35 seconds
4,,28,1 days 29 hours 12 minutes 45 seconds
5,,17,1 days 24 hours 32 minutes 16 seconds


In [None]:
# Task 8

sql_query = """

    SELECT COUNT(DISTINCT u.id), TIMESTAMPDIFF(SECOND,  )
"""

df = query_to_dataframe(connection_string, sql_query)
df

In [38]:
#Task 9
#This query finds the top 15 users who have gained most altitude in meters across all their activities.  

sql_query = """ 

WITH alt_diff AS(
    SELECT 
        tp.activity_id, 
        tp.altitude * 0.3048 as current_altitude_meters, LAG(tp.altitude * 0.3048) OVER (PARTITION BY tp.activity_id ORDER BY date_time) AS prev_altitude_meters
    FROM track_point tp
    WHERE tp.altitude != -777 
), 

altitude_gained AS(
    SELECT 
        ad.activity_id, 
        SUM(CASE WHEN ad.current_altitude_meters > ad.prev_altitude_meters THEN ad.current_altitude_meters - ad.prev_altitude_meters ELSE 0 END) AS altitude_meters_gain
    FROM alt_diff ad
    GROUP BY ad.activity_id
)

SELECT a.user_id, SUM(ag.altitude_meters_gain) AS total_altitude_meters_gained
FROM activity a
JOIN altitude_gained ag ON a.id = ag.activity_id
GROUP BY a.user_id
ORDER BY SUM(ag.altitude_meters_gain) DESC 
LIMIT 15;
"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,total_altitude_meters_gained
0,128,650978.4288
1,153,555019.464
2,4,332036.3184
3,41,240758.472
4,3,233663.6424
5,85,217642.1352
6,163,205278.228
7,62,181688.232
8,144,179456.1816
9,30,175641.9144


In [34]:
# Task 10

def calculate_distance(position): 
    distance = 0 
    for i in range(1,len(position)): 
        tp = position[i-1]
        next_tp = position[i]
        distance += haversine((tp[0], tp[1]), (next_tp[0], next_tp[1]), unit=Unit.KILOMETERS)

    return distance    

sql_query = """ 
    SELECT a.user_id, a.transportation_mode, tp.lat, tp.lon, tp.date_time 
    FROM activity a JOIN track_point tp ON a.id = tp.activity_id
    WHERE a.transportation_mode != "None"
    
"""

temp_df = query_to_dataframe(connection_string, sql_query)
temp_df["day"] = temp_df["date_time"].dt.date

result = temp_df.groupby(['user_id', 'day', 'transportation_mode']) \
                 .apply(lambda x: calculate_distance(x[['lat', 'lon']].values)) \
                 .reset_index(name='segment_distance')

highest_distance_per_mode = result.sort_values('segment_distance', ascending=False) \
    .groupby('transportation_mode').first().reset_index()

highest_distance_per_mode


Unnamed: 0,transportation_mode,user_id,day,segment_distance
0,airplane,128,2009-03-06,2527.119758
1,bike,128,2008-06-28,63.120602
2,boat,128,2008-11-22,65.554763
3,bus,128,2009-01-20,207.412969
4,car,128,2009-01-19,1613.73093
5,run,62,2008-09-02,0.033253
6,subway,128,2008-10-31,38.877547
7,taxi,128,2008-09-30,40.223277
8,train,62,2008-09-02,277.257577
9,walk,62,2008-10-05,42.352596


In [4]:
# Task 8

sql_query = """

    SELECT a.user_id, tp.date_time, tp.lat, tp.lon
    FROM activity a JOIN track_point tp ON a.id = tp.activity_id
"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,date_time,lat,lon
0,000,2009-04-12 07:33:03,40.000017,116.327479
1,000,2009-04-12 07:33:09,40.000168,116.327474
2,000,2009-04-12 07:33:14,40.000055,116.327454
3,000,2009-04-12 07:33:19,40.000021,116.327407
4,000,2009-04-12 07:33:24,40.000035,116.327281
...,...,...,...,...
9681751,181,2007-12-08 07:09:44,39.866483,116.415383
9681752,181,2007-12-08 07:11:20,39.866033,116.415683
9681753,181,2007-12-08 07:12:42,39.865550,116.415733
9681754,181,2007-12-08 07:14:02,39.864883,116.415750


In [22]:
def get_coordinates(group):
    min_lat = group.loc[group['lat'].idxmin()]
    max_lat = group.loc[group['lat'].idxmax()]
    min_lon = group.loc[group['lon'].idxmin()]
    max_lon = group.loc[group['lon'].idxmax()]

    return pd.Series({
        'min_lat': (min_lat['lat'], min_lat['lon']),
        'min_lon': (min_lon['lat'], min_lon['lon']),
        'max_lat': (max_lat['lat'], max_lat['lon']),
        'max_lon': (max_lon['lat'], max_lon['lon']),
        
    })



# Group by user_id and aggregate min, max, and combinations
grouped_df = df.groupby('user_id').apply(get_coordinates)
grouped_df['bounding_box'] = list(zip(grouped_df['min_lat'], grouped_df['min_lon'],
                                      grouped_df['min_lat'], grouped_df['max_lon'],
                                      grouped_df['max_lat'], grouped_df['min_lon'],
                                      grouped_df['max_lat'], grouped_df['max_lon']))

grouped_df

Unnamed: 0_level_0,min_lat,min_lon,max_lat,max_lon,bounding_box
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000,"(31.189083, 121.352435)","(40.12386, 116.185354)","(40.21096, 116.495582)","(31.287439, 121.551465)","((31.189083, 121.352435), (40.12386, 116.18535..."
001,"(39.935299, 116.327091)","(39.986319, 116.303946)","(40.018145, 116.307573)","(39.970511, 116.341455)","((39.935299, 116.327091), (39.986319, 116.3039..."
002,"(37.408024, 110.852015)","(37.432482, 110.762591)","(40.052399, 116.295177)","(39.902815, 116.391503)","((37.408024, 110.852015), (37.432482, 110.7625..."
003,"(31.189083, 121.352435)","(39.906268, 116.182847)","(40.21096, 116.495582)","(31.287439, 121.551465)","((31.189083, 121.352435), (39.906268, 116.1828..."
004,"(38.871072, 121.625418)","(40.122016, 116.185872)","(40.210947, 116.495369)","(38.884661, 121.635729)","((38.871072, 121.625418), (40.122016, 116.1858..."
...,...,...,...,...,...
176,"(33.6517, 116.613533333333)","(39.9560166666667, 116.2831)","(40.6757833333333, 116.79015)","(39.7545833333333, 117.0439)","((33.6517, 116.613533333333), (39.956016666666..."
178,"(39.975738, 116.331939)","(39.978017, 116.331233)","(39.978149, 116.331379)","(39.977804, 116.332775)","((39.975738, 116.331939), (39.978017, 116.3312..."
179,"(39.906029, 116.274038)","(39.906552, 116.270286)","(40.093692, 116.308108)","(39.976848, 116.356776)","((39.906029, 116.274038), (39.906552, 116.2702..."
180,"(26.16108, 119.94249)","(39.976189, 116.330222)","(40.08721, 116.591101)","(26.162202, 119.943787)","((26.16108, 119.94249), (39.976189, 116.330222..."


In [18]:
sorted_df = df.sort_values(by='date_time').groupby('user_id').agg(
    first_registered_time=pd.NamedAgg(column='date_time', aggfunc='first'),
    last_registered_time=pd.NamedAgg(column='date_time', aggfunc='last')
)

sorted_df

Unnamed: 0_level_0,first_registered_time,last_registered_time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000,2008-10-23 02:53:04,2009-07-05 07:45:15
001,2008-10-23 05:53:05,2008-12-15 00:31:18
002,2008-10-23 12:45:23,2009-03-21 12:11:34
003,2008-10-23 17:58:54,2009-07-05 07:45:15
004,2008-10-23 17:58:52,2009-07-29 06:16:11
...,...,...
176,2007-11-30 09:33:10,2007-12-09 10:52:57
178,2010-03-12 17:26:08,2010-03-12 17:33:08
179,2008-08-21 10:49:40,2008-11-17 03:04:17
180,2009-04-15 13:44:00,2009-05-30 16:49:05


In [23]:
# Find users with non-overlapping time intervals considering a 30-second window
non_overlapping_users = []
unique_user_ids = sorted_df.index.unique()

for i in range(len(unique_user_ids)):
    for j in range(i + 1, len(unique_user_ids)):
        user1_id = unique_user_ids[i]
        user2_id = unique_user_ids[j]
        user1_interval = sorted_df.loc[user1_id]
        user2_interval = sorted_df.loc[user2_id]

        # Check if the last registered time of one user plus 30 seconds is not within the first registered time of the other user
        if (user1_interval['last_registered_time'] + pd.Timedelta(seconds=30) < user2_interval['first_registered_time']) or \
                (user2_interval['last_registered_time'] + pd.Timedelta(seconds=30) < user1_interval['first_registered_time']):
            non_overlapping_users.append((user1_id, user2_id))

print("Users with non-overlapping time intervals (with a 30-second window):")
for user1, user2 in non_overlapping_users:
    print(f"User {user1} and User {user2}")

Users with non-overlapping time intervals (with a 30-second window):
User 000 and User 020
User 000 and User 021
User 000 and User 045
User 000 and User 046
User 000 and User 047
User 000 and User 048
User 000 and User 051
User 000 and User 052
User 000 and User 053
User 000 and User 056
User 000 and User 057
User 000 and User 059
User 000 and User 060
User 000 and User 061
User 000 and User 063
User 000 and User 064
User 000 and User 065
User 000 and User 067
User 000 and User 069
User 000 and User 070
User 000 and User 071
User 000 and User 072
User 000 and User 073
User 000 and User 074
User 000 and User 075
User 000 and User 076
User 000 and User 077
User 000 and User 078
User 000 and User 079
User 000 and User 080
User 000 and User 081
User 000 and User 086
User 000 and User 087
User 000 and User 088
User 000 and User 089
User 000 and User 090
User 000 and User 091
User 000 and User 092
User 000 and User 093
User 000 and User 094
User 000 and User 095
User 000 and User 097
User 00

In [42]:
from itertools import combinations
from datetime import timedelta

unique_user_ids = df["user_id"].unique()
user_combinations = list(combinations(unique_user_ids, 2))

def filter_by_time(df, other_user_id):
    results = []
    for i, row in df.iterrows(): 
        time_window = (row["date_time"] - timedelta(seconds=30), row["date_time"] + timedelta(seconds=30))
        other_user_rows = df[(df['user_id'] == other_user_id) & (df['date_time'] >= time_window[0]) & (df['date_time'] <= time_window[1])]
        results.extend(other_user_rows.to_dict("records"))
    return pd.DataFrame(results)

filtered_rows = []
for user_1, user_2 in user_combinations:
    user_1_rows = df[df["user_id"] == user_1]
    filtered_rows.append(filter_by_time(user_1_rows, user_2))

filtered_df = pd.concat(filtered_rows, ignore_index=True)

KeyboardInterrupt: 

In [None]:
filtered_df

In [4]:
#Task 11

sql_query = """ 
    WITH track_point_diff AS(
        SELECT tp.activity_id, tp.date_time, 
        TIMESTAMPDIFF(MINUTE, tp.date_time, LEAD(tp.date_time) OVER (PARTITION BY tp.activity_id ORDER BY tp.date_time)) AS minute_diff
        FROM track_point tp
    )

    SELECT a.user_id, COUNT(DISTINCT tpd.activity_id) AS invalid_activities_number 
    FROM track_point_diff tpd JOIN activity a ON tpd.activity_id = a.id 
    WHERE tpd.minute_diff >= 5 
    GROUP BY a.user_id
    ORDER BY COUNT(DISTINCT tpd.activity_id) DESC
    
"""

df = query_to_dataframe(connection_string, sql_query)
df

Unnamed: 0,user_id,invalid_activities_number
0,128,720
1,153,557
2,025,263
3,062,249
4,163,233
...,...,...
166,060,1
167,107,1
168,113,1
169,141,1


In [5]:
#Task 12
""" 
In this query we have chosen to use MAX()-operator for those instances where a user may have same number of activites tagged with different transporation_mode. Since this operator will select the maximum based on 
alphabetic order of the transportation mode string then for instance bus would be chosen over walk etc. 

"""
sql_query = """ 
    WITH filter_user_transportation AS(
        SELECT a.user_id, a.transportation_mode 
        FROM activity a 
        WHERE a.transportation_mode != "None"
        GROUP BY a.transportation_mode, a.user_id
    )

    SELECT u.id, MAX(fut.transportation_mode) AS most_used_transportation_mode
    FROM user u JOIN filter_user_transportation fut ON u.id = fut.user_id 
    GROUP BY u.id 
    ORDER BY u.id 

"""
df = query_to_dataframe(connection_string, sql_query)
df


Unnamed: 0,id,most_used_transportation_mode
0,10,taxi
1,20,walk
2,21,walk
3,52,bus
4,56,bike
5,58,walk
6,60,walk
7,62,walk
8,64,bike
9,65,walk
