In [108]:
from DbConnector import DbConnector
#from part2 import Database
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os
import numpy as np
from datetime import datetime, timedelta
from haversine import haversine, Unit

load_dotenv()

db = DbConnector()

Using user:  common
Connected to: 8.0.34-0ubuntu0.22.04.1
You are connected to the database: ('assignment2',)
-----------------------------------------------



### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [None]:
result: list = []

for table_name in ["User", "Activity", "TrackPoint"]:
    query = "SELECT COUNT(*) FROM %s"
    db.cursor.execute(query % table_name)
    result.append(db.cursor.fetchone())

print("Number of Users, Activities and TrackPoints after the dataset has been inserted into the database")
print(tabulate([["Users", "Activities", "TrackPoints"], [result[0][0], result[1][0], result[2][0]]], headers="firstrow"))

### Task 2
Find the average, maximum and minimum number of trackpoints per user

##### Average

In [None]:
# Get the average number of trackpoints per user
query = """
SELECT User.id, COALESCE(average, 0) AS average 
    FROM User LEFT JOIN (SELECT user_id, AVG(trackpoints) average 
    FROM (SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
    FROM TrackPoint t JOIN Activity ON t.activity_id = Activity.id 
    JOIN User ON Activity.user_id = User.id GROUP BY Activity.id) 
AS Trackpoints GROUP BY user_id) a ON a.user_id = User.id"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Average trackpoints per user")
print(tabulate(rows, headers=db.cursor.column_names))


##### Maximum

In [None]:
# Get the maximum number of trackpoints per user
query = """SELECT User.id, COALESCE(maximum, 0) AS maximum 
                FROM User LEFT JOIN (SELECT user_id, MAX(trackpoints) AS maximum 
                FROM (SELECT Activity.user_id , COUNT(t.id) AS trackpoints FROM TrackPoint t 
                INNER JOIN Activity ON t.activity_id = Activity.id GROUP BY Activity.id) 
                AS Trackpoints GROUP BY user_id) a ON a.user_id = User.id"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Maximum trackpoints per user")
print(tabulate(rows, headers=db.cursor.column_names))

##### Minimum

In [None]:
# Get the minimum number of trackpoints per user
query = """
        SELECT User.id, COALESCE(minimum, 0) AS minimum FROM User LEFT JOIN (SELECT user_id, MIN(trackpoints) AS minimum
            FROM (SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
            FROM TrackPoint t
            INNER JOIN Activity ON t.activity_id = Activity.id 
            GROUP BY Activity.id) 
            AS Trackpoints 
            GROUP BY user_id) a ON a.user_id = User.id
            """
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Minimum trackpoints per user")
print(tabulate(rows, headers=db.cursor.column_names))

### Task 3

Find the top 15 users with the highest number of activities.

In [None]:
query = """
    SELECT RANK() OVER (
        ORDER BY COUNT(*) DESC
    ) Top, user_id, COUNT(*) as num_of_activities FROM Activity GROUP BY user_id ORDER BY COUNT(*) DESC LIMIT 15
    """
db.cursor.execute(query)
rows = db.cursor.fetchall()
columns = db.cursor.column_names
print("Top 15 users with the highest number of activities")
print(tabulate(rows, headers=columns))

### Task 4

Find all users who have taken a bus.

In [None]:
# Get the users who have taken the bus
query = """
SELECT DISTINCT user_id
FROM Activity
WHERE transportation_mode = 'bus'
"""
db.cursor.execute(query)
users = db.cursor.fetchall()

table = [["User id"]]
table.extend([user for user in users])
print("All users who have taken a bus")
print(tabulate(table, headers="firstrow"))

### Task 5

List the top 10 users by their amount of different transportation modes.

In [None]:
query = """SELECT RANK() OVER (
        ORDER BY COUNT(DISTINCT(transportation_mode)) DESC
        ) AS Top, user_id, COUNT(DISTINCT(transportation_mode)) as DifferentTransportation 
                    FROM Activity GROUP BY user_id ORDER BY DifferentTransportation DESC LIMIT 10;"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Top 10 users by their amount of different transportation modes")
print(tabulate(rows, headers=db.cursor.column_names))

### Task 6

Find activities that are registered multiple times. You should find the query even
if it gives zero result.

Assumption: Registered multiple times, meaning duplicates with the same user_id, transportation_mode, start_date_time, end_date_time.

In [None]:
query = """
    SELECT a.id FROM Activity AS a WHERE EXISTS 
        (SELECT b.id FROM Activity AS b WHERE a.user_id = b.user_id 
        AND a.transportation_mode = b.transportation_mode AND 
        a.start_date_time = b.start_date_time AND a.end_date_time = b.end_date_time 
        AND a.id != b.id)
    """
db.cursor.execute(query)
rows = db.cursor.fetchall()
columns = db.cursor.column_names
print("Activities that are registered multiple times")
print(tabulate(rows, headers=columns))

### Task 7

a) Find the number of users that have started an activity in one day and ended the activity the next day.

In [None]:
query = """
SELECT COUNT(DISTINCT user_id) 
FROM Activity 
WHERE DATEDIFF(end_date_time, start_date_time) > 0;
"""
db.cursor.execute(query)
num_users = db.cursor.fetchone()

print("Number of users with activity that ends the next day")
print(tabulate([["Number of users"], [num_users[0]]], headers="firstrow"))

b) List the transportation mode, user id and duration for these activities.

Assumption: As the question explicitly mentions transportation mode we do not count the rows in the Activity table with transportation mode equals NULL as relevant to this question. Thus, there are less rows in this sub task.

In [None]:
query = """
SELECT 
    user_id, 
    transportation_mode, 
    SEC_TO_TIME(TIMESTAMPDIFF(SECOND, start_date_time, end_date_time))
FROM Activity 
WHERE DATEDIFF(end_date_time, start_date_time) > 0
AND transportation_mode IS NOT NULL;
"""
db.cursor.execute(query)
users_info = db.cursor.fetchall()

table = [["User", "Transportation Mode", "Duration"]]
content = [[user[0], user[1], user[2]] for user in users_info]
table.extend(content)

print(tabulate(table, headers="firstrow"))

### Task 8

Find the number of users which have been close to each other in time and space. Close is defined as the same space (50 meters) and for the same half minute (30 seconds)

In [None]:
# Sorting the trackpoints by date_time to make it fast to find the datepoints within 30 seconds
query = """SELECT a.user_id, date_time, lat,lon FROM TrackPoint t
INNER JOIN Activity a
ON a.id = t.activity_id
ORDER BY date_time ASC"""

# Looping through all the squares
users = []
db.cursor.execute(query)
rows = db.cursor.fetchall()


for i in range(len(rows)-1):
    for j in range(i,len(rows)):
        # Skip the rest of the loop if the time difference is more than 30 seconds and they are sorted
        if rows[j][1] - rows[i][1] > timedelta(seconds=30):
            break
        # Dont do anything if the same user
        if rows[i][0] == rows[j][0]:
            continue
        # Checking if the trackpoints are within 30 seconds of each other
        if rows[j][1] - rows[i][1] <= timedelta(seconds=30):

            # Checking if the trackpoints are within 50 meters of each other
            distance = haversine((rows[i][2], rows[i][3]), (rows[j][2], rows[j][3]), unit=Unit.METERS)

            if distance <= 50:
                # Adding the users to the list
                users.append(rows[i][0])
                users.append(rows[j][0])


# Removes duplicates
users = np.unique(users)

print(f"{len(users)} have been close to each other in time and space")

### Task 9

Find the top 15 users who have gained the most altitude meters.

In [None]:
# 1. Retrieve all the trackpoints which is valid
# 2. Calculate the difference between the current trackpoint's altitude and the previous one
# 3. Calculate the sum of difference if the difference is postive based on the tip given in task for each activity
# 4. Sum each user's altitude positive differences in feet with 0.304 to convert it to meters

query = """SELECT RANK() OVER (
        ORDER BY SUM(activity_altitude)*0.304 DESC
        ) AS Top, user_id, SUM(activity_altitude)*0.304 as altitude_in_meters FROM Activity JOIN 
        (SELECT activity_id, SUM(difference) AS activity_altitude FROM 
        (SELECT activity_id, altitude - LAG(altitude) OVER 
        (PARTITION BY activity_id ORDER BY date_time) AS difference FROM 
        (SELECT activity_id, altitude, date_time FROM TrackPoint WHERE altitude != -777) AS t) 
        AS altitude_difference WHERE difference > 0 GROUP BY activity_id) AS difference_table 
        ON Activity.id = difference_table.activity_id GROUP BY user_id ORDER BY altitude_in_meters DESC 
        LIMIT 15"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Top 15 users who have gained the most altitude meters")
print(tabulate(rows, headers=db.cursor.column_names))

### Task 10

Find the users that have traveled the longest total distance in one day for each
transportation mode.

The first approach, and the one we want to present, search for the longest distance by a user in one day (we interpreted it as one date day, like in task 7 where it did not go over to the next day). This implementation bases on that if a user has multiple activities of the same transportation mode within a day, then the distance for these activities will be summed up.

In [110]:
# Find the users that have traveld the longest total distance in one day for each transportation mode

from haversine import haversine, Unit
 
query = """
SELECT a.id, tp.lat, tp.lon, CAST(tp.date_time AS date) AS day, a.transportation_mode, a.user_id
FROM Activity a
INNER JOIN TrackPoint tp
ON a.id = tp.activity_id
WHERE a.transportation_mode IS NOT NULL
ORDER BY a.user_id, a.transportation_mode, tp.date_time ASC
"""

db.cursor.execute(query)
result = db.cursor.fetchall()

In [111]:
activities = {} # This contains an overview of all the activities for each user for each date

"""
activities = {
    user_id: {
        transportation_mode: {
            date: {
                distance: int,
                position: (lat, lon),
            }
        }
    }
}
"""

for res in result:
    activity_id = res[0]
    lat = res[1]
    lon = res[2]
    date = res[3]
    transportation_mode = res[4]
    user_id = res[5]

    # Checks if user id is in the activities dictionary, if not, add it
    if user_id not in activities:
        activities[user_id] = {}

    # Checks if transportation mode is in the user dictionary, if not, add it
    if transportation_mode not in activities[user_id]:
        activities[user_id][transportation_mode] = {}

    # Checks if date is in the transportation mode dictionary, if not, add it
    if date not in activities[user_id][transportation_mode]:
        activities[user_id][transportation_mode][date] = {
            "distance": 0,
            "position": (lat, lon),
            "activity_id": activity_id
        }

    # Checks if it is the same position as previously or that it is another activity then skip
    if activities[user_id][transportation_mode][date]["position"] == (lat, lon) or \
    activities[user_id][transportation_mode][date]["activity_id"] != activity_id:
        # Keep the current position to calculate the distance for the next row
        activities[user_id][transportation_mode][date]["position"] = (lat, lon)
        # Keep the current activity_id so that we don't calculate the distance between two different activities
        activities[user_id][transportation_mode][date]["activity_id"] = activity_id   
        continue
    # Calculates the distance between the current position and the previous position
    activities[user_id][transportation_mode][date]["distance"] += haversine(
        activities[user_id][transportation_mode][date]["position"], (lat, lon), unit=Unit.KILOMETERS)

    # Keep the current position to calculate the distance for the next row
    activities[user_id][transportation_mode][date]["position"] = (lat, lon)
    # Keep the current activity_id so that we don't calculate the distance between two different activities
    activities[user_id][transportation_mode][date]["activity_id"] = activity_id    
longest_activities = {} # This contains the longest activity for each transportation mode 

# Loop through the user date activities
for user_id in activities:
    for transportation_mode in activities[user_id]:
        for date in activities[user_id][transportation_mode]:

            # Check if there's no dates of users with that activity type
            # Add the current one as the longest distance activity
            if transportation_mode not in longest_activities:
                longest_activities[transportation_mode] = {
                    "distance": activities[user_id][transportation_mode][date]["distance"],
                    "user_id": user_id,
                }
            # Else check if the current one is longer, if so, then replace   
            elif longest_activities[transportation_mode]["distance"] < activities[user_id][transportation_mode][date]["distance"]:
                longest_activities[transportation_mode] = {
                    "distance": activities[user_id][transportation_mode][date]["distance"],
                    "user_id": user_id,
                }

# Create a table with the longest activity for each transportation mode
# to present the result 
users = []
for transportation_mode in longest_activities:
    distance = longest_activities[transportation_mode]["distance"]
    user_id = longest_activities[transportation_mode]["user_id"]
    users.append((user_id, transportation_mode, activity_id, distance))

table = [["User", "Transportation mode", "Distance (km)"]]
table.extend(users)

print(tabulate(table, headers="firstrow"))

     User        Transportation mode    Distance (km)
---  --------  ---------------------  ---------------
128  bus                       20710      207.413
128  taxi                      20710       40.2233
128  bike                      20710       52.5372
108  walk                      20710       22.8074
128  car                       20710      331.418
062  run                       20710        0.0332532
062  train                     20710      277.258
128  subway                    20710       23.2978
128  airplane                  20710     2527.12
128  boat                      20710       65.5548


Another approach we have done before ending up with the previous one is the one when we first interpreted it as that we are tied up to one activity and check if the hours between start and endtime are not over 24 hours. However, afterwards we find this approach quite restricted to one activity and does not really answer the question about travelling the most distance in one day.

Note: When it comes to code, it was written a bit differently (with less for loops), reason being that the "run" transportation mode was suspiciously low, so did not want to use the same approach (or code wise with regards of for loops and dict inside dict inside dict etc.) to ensure that the code was correct (at least one of them).

In [None]:
from haversine import haversine, Unit

query = """
SELECT a.id, tp.lat, tp.lon, a.transportation_mode, a.user_id
FROM Activity a
INNER JOIN TrackPoint tp ON a.id = tp.activity_id 
WHERE (TIMESTAMPDIFF(SECOND, a.start_date_time, a.end_date_time)) < 60 * 60 * 24
AND a.transportation_mode IS NOT NULL
ORDER BY a.id, tp.date_time ASC;
"""

db.cursor.execute(query)
result = db.cursor.fetchall()

# Keeps track of the activities
activity_information = {} # dict: activity_id: [user_id, transportation_mode, distance]
previous_activity_position = {} # Keep track of the last position of each activity

activity_distance = {} # Keep track of the distance of each activity
transportation = {} # Keep track of the transportation mode of each activity
activity_user = {} # Keep track of the user of each activity

# Go through all the activities and calculate the distance between the current and last position
for res in result:
    # Read from query result
    activity_id = res[0]
    current_position = (res[1], res[2])
    transportation_mode = res[3]
    user_id = res[4]

    # Set the previous positions
    previous_position = previous_activity_position.get(activity_id, current_position)
    previous_activity_position[activity_id] = current_position # Update the last position

    # Calculate the distance
    current_activity_information = activity_information.get(activity_id, [0,0,0])
    distance = current_activity_information[2] + haversine(current_position, previous_position, unit=Unit.KILOMETERS)

    # Update the activity information
    activity_information[activity_id] = [user_id, transportation_mode, distance]

# Find the longest activity for each transportation mode
longest_activities = {} # dict: transportation_mode: [distance, user_id, activity_id]
for activity_id, info in activity_information.items():
    user_id, transportation_mode, distance = info
    previous_longest_activity = longest_activities.get(transportation_mode, [0,0,0])

    # Check if activity is longer
    if previous_longest_activity[0] > distance:
        continue

    # Update longest activity
    longest_activities[transportation_mode] = [distance, user_id, activity_id]


# Create a table with the longest activity for each transportation mode
users = []
for transportation_mode, items in longest_activities.items():
    distance, user_id, activity_id = items
    users.append((user_id, transportation_mode, distance))

table = [["User", "Transportation mode", "Distance (km)"]]
table.extend(users)

print(tabulate(table, headers="firstrow"))

### Task 11

Find all users who have invalid activities, and the number of invalid activities per user.

In [None]:

query = """SELECT a.user_id, count(*)
                FROM Activity a
                JOIN TrackPoint t1
                ON a.id = t1.activity_id
                JOIN TrackPoint t2
                ON t1.id = t2.id-1
                AND t1.activity_id = t2.activity_id
                WHERE t2.date_time > t1.date_time + INTERVAL 5 MINUTE
                AND t1.id != t2.id
                GROUP BY a.user_id
                ORDER BY a.user_id ASC"""

db.cursor.execute(query)
rows = db.cursor.fetchall()
print(tabulate(rows, headers=db.cursor.column_names))

### Task 12

Find all the users who have registered transportation_mode and their most used transportation_mode

Comment on implementation: Although with this dataset, does not need to join User, however in cases where user does not have labels, but still has some activities in Activity with transportation_mode then a join is necessary. For users that have the same number of activities tagged on multiple transportation mode, we've decided to take the first transportation mode in alphabetical order.

In [None]:
query = """SELECT user_id, transportation_mode FROM 
        (SELECT user_id, transportation_mode, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY COUNT(*) DESC, transportation_mode ASC) 
        AS rownum FROM Activity JOIN User ON Activity.user_id = User.id WHERE transportation_mode IS NOT NULL 
        AND User.has_labels=TRUE GROUP BY user_id, transportation_mode)
        AS activity_grouped WHERE rownum=1 ORDER BY user_id"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
print("Find all the users who have registered transportation_mode and their most used transportation_mode")
print(tabulate(rows, headers=db.cursor.column_names))

### End

In [None]:
# Closing the connection
db.close_connection()