In [None]:
from DbConnector import DbConnector
from part2 import Database 
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os

load_dotenv()

user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')

program = DbConnector(USER=user, PASSWORD=password)
db = Database()

### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [None]:
# Get the number of Users
users = db.get_table_size("User")
activities = db.get_table_size("Activity")
trackpoints = db.get_table_size("TrackPoint")
print("Number of Users, Activities and TrackPoints")
print(tabulate([["Users", "Activities", "TrackPoints"], [users[0], activities[0], trackpoints[0]]], headers="firstrow"))

### Task2
Find the average, maximum and minimum number of trackpoints per user

##### Average

In [None]:
# Get the average number of trackpoints per user
query = """
SELECT User.id, COALESCE(average, 0) AS average 
FROM User 
LEFT JOIN (
    SELECT user_id, AVG(trackpoints) average 
    FROM (
        SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
        FROM TrackPoint t 
        JOIN Activity ON t.activity_id = Activity.id 
        JOIN User ON Activity.user_id = User.id 
        GROUP BY Activity.id
    ) AS Trackpoints GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names

print("Average trackpoints per user")
print(tabulate(users, headers=columns))


##### Maximum

In [None]:
# Get the maximum number of trackpoints per user
query = """
SELECT User.id, COALESCE(maximum, 0) AS maximum 
FROM User 
LEFT JOIN (
    SELECT user_id, MAX(trackpoints) AS maximum 
    FROM (
        SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
        FROM TrackPoint t 
        INNER JOIN Activity ON t.activity_id = Activity.id 
        GROUP BY Activity.id
    ) AS Trackpoints GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Maximum trackpoints per user")
print(tabulate(users, headers=columns))

##### Minimum

In [None]:
# Get the minimum number of trackpoints per user
query = """
    SELECT User.id, COALESCE(minimum, 0) AS minimum 
    FROM User LEFT JOIN (
        SELECT user_id, MIN(trackpoints) AS minimum
        FROM (
            SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
            FROM TrackPoint t
            INNER JOIN Activity ON t.activity_id = Activity.id 
            GROUP BY Activity.id
        ) AS Trackpoints 
        GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Minimum trackpoints per user")
print(tabulate(users, headers=columns))

### Task 3

Find the top 15 users with the highest number of activities.

In [None]:
query = """
SELECT RANK() OVER (
    ORDER BY COUNT(*) DESC
) Top, user_id, COUNT(*) as num_of_activities 
FROM Activity GROUP BY user_id 
ORDER BY COUNT(*) DESC LIMIT 15
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
columns = db.cursor.column_names
print("Top 15 users with the highest number of activities")
print(tabulate(rows, headers=columns))

### Task 4

Find all users who have taken a bus.

In [None]:
# Get the users who have taken the bus
users, columns = db.get_user_taken_bus()
print("All users who have taken a bus")
print(tabulate(users, headers=columns))

### Task 5

List the top 10 users by their amount of different transportation modes.

In [6]:
query = """
SELECT user_id, COUNT(DISTINCT(transportation_mode)) as DifferentTransportation 
FROM Activity 
GROUP BY user_id 
ORDER BY DifferentTransportation DESC LIMIT 10;"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Top 10 users by their amount of different transportation modes")
print(tabulate(users, headers=columns))

Top 10 users by their amount of different transportation modes
  user_id    DifferentTransportation
---------  -------------------------
      000                   670.871
      001                   824.965
      002                   924.801
      003                   807.387
      004                   761.858
      005                   587.548
      006                   801.125
      007                   758.275
      008                  1376
      009                  1218.32
      010                  1107.5
      011                   451.756
      012                   629.783
      013                  1388.3
      014                   905.936
      015                   806.583
      016                   690.222
      017                   868.245
      018                   540.795
      019                   372.291
      020                   704.638
      021                   475
      022                   900.695
      023                   979.312
      024   

### Task 6

Find activities that are registered multiple times. You should find the query even
if it gives zero result.

In [7]:
query = """
SELECT a.id FROM Activity AS a WHERE EXISTS (
    SELECT b.id FROM Activity AS b 
    WHERE a.user_id = b.user_id 
    AND a.transportation_mode = b.transportation_mode 
    AND a.start_date_time = b.start_date_time 
    AND a.end_date_time = b.end_date_time 
    AND a.id != b.id)
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
print("Find activities that are registered multiple times:\n", rows)

Find activities that are registered multiple times:
 []


### Task 7

a) Find the number of users that have started an activity in one day and ended the activity the next day.

In [None]:
num_users = db.get_num_user_activity_over_a_day()

print("Number of users with activity that ends the next day")
print(tabulate([["Number of users"], [num_users[0]]], headers="firstrow", tablefmt="fancy_grid"))

b) List the transportation mode, user id and duration for these activities.

In [None]:
users_info = db.get_user_activity_over_a_day()

table = [["User", "Transportation Mode", "Duration"]]
content = [[user[0], user[1], user[2]] for user in users_info]
table.extend(content)

print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

### Task 9

Find the top 15 users who have gained the most altitude meters.

In [None]:
query = "SELECT user_id, sum(activity_altitude)*0.304 as altitude_in_meters FROM Activity JOIN (SELECT activity_id, SUM(difference) AS activity_altitude FROM (SELECT activity_id, altitude - LAG(altitude) OVER (PARTITION BY activity_id ORDER BY date_time) AS difference FROM TrackPoint) AS altitude_difference WHERE difference > 0 GROUP BY activity_id) AS difference_table ON Activity.id = difference_table.activity_id GROUP BY user_id ORDER BY altitude_in_meters DESC LIMIT 15"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find the top 15 users who have gained the most altitude meters\n"
for i in rows:
    text += f"User: {i[0]}, Altitude in meters: {i[1]}\n"
print(text)

### Task 10

Find the users that have traveled the longest total distance in one day for each
transportation mode.

In [None]:
users = db.get_user_with_max_distance()

table = [["User", "Transportation mode", "Distance (km)"]]
table.extend(users)

print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

### Task 12

Find all the users who have registered transportation_mode and their most used transportation_mode

Comment on implementation: Although with this dataset, does not need to join User, however in cases where user does not have labels, but still has some activities in Activity with transportation_mode then a join is necessary. For users that have the same number of activities tagged on multiple transportation mode, we've decided to take the first transportation mode in alphabetical order.

In [None]:
query = "SELECT user_id, transportation_mode FROM ( SELECT user_id, transportation_mode, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY COUNT(*) DESC, transportation_mode ASC) AS rownum FROM Activity JOIN User ON Activity.user_id = User.id WHERE transportation_mode IS NOT NULL AND User.has_labels=TRUE GROUP BY user_id, transportation_mode) AS activity_grouped WHERE rownum=1 ORDER BY user_id"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find all the users who have registered transportation_mode and their most used transportation_mode\n"
for row in rows:
    text += f"User {row[0]}, Transportation mode: {row[1]}\n"
print(text)

### End

In [None]:
# Closing the connection
program.close_connection()