In [1]:
from DbConnector import DbConnector
from part2 import Database 
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os

load_dotenv()

user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')

program = DbConnector(USER=user, PASSWORD=password)

Using user:  common
Connected to: 8.0.34-0ubuntu0.22.04.1
You are connected to the database: ('assignment2',)
-----------------------------------------------



### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [None]:

result: list = []

for table_name in ["User", "Activity", "TrackPoint"]:
    query = "SELECT COUNT(*) FROM %s"
    program.cursor.execute(query % table_name)
    result.append(program.cursor.fetchone())

print("Number of Users, Activities and TrackPoints")
print(tabulate([["Users", "Activities", "TrackPoints"], [result[0][0], result[1][0], result[2][0]]], headers="firstrow"))

### Task 3

Find the top 15 users with the highest number of activities.

In [None]:
query = "SELECT user_id FROM Activity GROUP BY user_id ORDER BY COUNT(*) DESC LIMIT 15"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Top 15 users with the highest number of activities:\n"

for i, val in enumerate(rows):
    text += f"Top {i+1}. {val[0]}\n"
print(text)

### Task 4

Find all users who have taken a bus.

In [None]:
# Get the users who have taken the bus
query = """
SELECT DISTINCT user_id
FROM Activity
WHERE transportation_mode = 'bus'
"""
program.cursor.execute(query)
users = program.cursor.fetchall()

table = [["User id"]]
table.extend([user for user in users])
print(tabulate(table, headers="firstrow"))

### Task 6

Find activities that are registered multiple times. You should find the query even
if it gives zero result.

In [None]:
query = """
SELECT a.id 
FROM Activity AS a 
WHERE EXISTS (
    SELECT b.id 
    FROM Activity AS b 
    WHERE a.user_id = b.user_id 
    AND a.transportation_mode = b.transportation_mode 
    AND a.start_date_time = b.start_date_time 
    AND a.end_date_time = b.end_date_time 
    AND a.id != b.id)"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
print("Find activities that are registered multiple times:\n", rows)

### Task 7

a) Find the number of users that have started an activity in one day and ended the activity the next day.

In [None]:
query = """
SELECT COUNT(DISTINCT user_id) 
FROM Activity 
WHERE DATEDIFF(end_date_time, start_date_time) > 0;
"""
program.cursor.execute(query)
num_users = program.cursor.fetchone()

print("Number of users with activity that ends the next day")
print(tabulate([["Number of users"], [num_users[0]]], headers="firstrow"))

b) List the transportation mode, user id and duration for these activities.

In [3]:
query = """
SELECT 
    user_id, 
    transportation_mode, 
    SEC_TO_TIME(TIMESTAMPDIFF(SECOND, start_date_time, end_date_time))
FROM Activity 
WHERE DATEDIFF(end_date_time, start_date_time) > 0
AND transportation_mode IS NOT NULL;
"""
program.cursor.execute(query)
users_info = program.cursor.fetchall()

table = [["User", "Transportation Mode", "Duration"]]
content = [[user[0], user[1], user[2]] for user in users_info]
table.extend(content)

print(tabulate(table, headers="firstrow"))

  User  Transportation Mode    Duration
------  ---------------------  ----------
   020  bike                   10:10:01
   021  walk                   3:57:13
   058  car                    0:36:12
   062  walk                   1:23:44
   085  bus                    0:58:25
   115  car                    1:28:04
   115  car                    1:02:50
   115  car                    1:00:56
   115  car                    1:05:10
   115  car                    1:23:14
   115  car                    0:54:09
   115  car                    1:08:43
   115  car                    1:09:36
   115  car                    1:03:47
   115  car                    1:01:42
   115  car                    0:55:50
   115  car                    1:24:08
   115  car                    1:17:13
   115  car                    1:01:33
   115  car                    1:03:29
   115  car                    0:58:19
   115  car                    7:46:29
   115  car                    1:05:39
   115  car         

### Task 9

Find the top 15 users who have gained the most altitude meters.

In [None]:
query = "SELECT user_id, sum(activity_altitude)*0.304 as altitude_in_meters FROM Activity JOIN (SELECT activity_id, SUM(difference) AS activity_altitude FROM (SELECT activity_id, altitude - LAG(altitude) OVER (PARTITION BY activity_id ORDER BY date_time) AS difference FROM TrackPoint) AS altitude_difference WHERE difference > 0 GROUP BY activity_id) AS difference_table ON Activity.id = difference_table.activity_id GROUP BY user_id ORDER BY altitude_in_meters DESC LIMIT 15"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find the top 15 users who have gained the most altitude meters\n"
for i in rows:
    text += f"User: {i[0]}, Altitude in meters: {i[1]}\n"
print(text)

### Task 10

Find the users that have traveled the longest total distance in one day for each
transportation mode.

In [4]:
from haversine import haversine, Unit

query = """
SELECT a.id, tp.lat, tp.lon, a.transportation_mode, a.user_id
FROM Activity a
INNER JOIN TrackPoint tp ON a.id = tp.activity_id 
WHERE (TIMESTAMPDIFF(SECOND, a.start_date_time, a.end_date_time)) < 60 * 60 * 24
AND a.transportation_mode IS NOT NULL;
"""

program.cursor.execute(query)
result = program.cursor.fetchall()

activity = {}
last_activity_position = {}
transportation = {}
activity_user = {}

for res in result:
    activity_id = res[0]
    current_position = (res[1], res[2])
    last_position = last_activity_position.get(activity_id, current_position)
    last_activity_position[activity_id] = current_position
    distance = haversine(current_position, last_position, unit=Unit.KILOMETERS)
    activity[activity_id] = activity.get(activity_id, 0) + distance
    transportation[activity_id] = res[3]
    activity_user[activity_id] = res[4]

longest_activity = {}
for act in activity.items():
    transportation_mode = transportation[act[0]]
    longest_activity[transportation_mode] = (
        act[0],
        max(act[1], longest_activity.get(transportation_mode, (0, 0))[1]),
    )

users = []
for longest in longest_activity.items():
    activity_id = longest[1][0]
    transportation_mode = longest[0]
    user_id = activity_user[activity_id]
    distance = activity[activity_id]
    users.append((user_id, transportation_mode, distance))

table = [["User", "Transportation mode", "Distance (km)"]]
table.extend(users)

print(tabulate(table, headers="firstrow"))

  User  Transportation mode      Distance (km)
------  ---------------------  ---------------
   175  bus                          1.80771
   163  taxi                        11.3794
   167  walk                         0.455136
   167  bike                         4.0598
   128  car                         18.5745
   062  run                          0.0332532
   128  train                       19.0923
   128  subway                      10.8537
   128  airplane                  4391.42
   128  boat                        65.5548


### Task 12

Find all the users who have registered transportation_mode and their most used transportation_mode

Comment on implementation: Although with this dataset, does not need to join User, however in cases where user does not have labels, but still has some activities in Activity with transportation_mode then a join is necessary. For users that have the same number of activities tagged on multiple transportation mode, we've decided to take the first transportation mode in alphabetical order.

In [None]:
query = "SELECT user_id, transportation_mode FROM ( SELECT user_id, transportation_mode, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY COUNT(*) DESC, transportation_mode ASC) AS rownum FROM Activity JOIN User ON Activity.user_id = User.id WHERE transportation_mode IS NOT NULL AND User.has_labels=TRUE GROUP BY user_id, transportation_mode) AS activity_grouped WHERE rownum=1 ORDER BY user_id"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find all the users who have registered transportation_mode and their most used transportation_mode\n"
for row in rows:
    text += f"User {row[0]}, Transportation mode: {row[1]}\n"
print(tabulate(rows))

### End

In [None]:
# Closing the connection
program.close_connection()