In [None]:
from DbConnector import DbConnector
from part2 import Database 
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os

load_dotenv()

user = os.getenv('DB_USER')
password = os.getenv('DB_PASSWORD')

program = DbConnector(USER=user, PASSWORD=password)
db = Database()

### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [None]:
# Get the number of Users
users = db.get_table_size("User")
activities = db.get_table_size("Activity")
trackpoints = db.get_table_size("TrackPoint")
print("Number of Users, Activities and TrackPoints")
print(tabulate([["Users", "Activities", "TrackPoints"], [users[0], activities[0], trackpoints[0]]], headers="firstrow"))

### Task2
Find the average, maximum and minimum number of trackpoints per user

##### Average

In [11]:
# Get the average number of trackpoints per user
query = """
SELECT User.id, COALESCE(average, 0) AS average 
FROM User 
LEFT JOIN (
    SELECT user_id, AVG(trackpoints) average 
    FROM (
        SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
        FROM TrackPoint t 
        JOIN Activity ON t.activity_id = Activity.id 
        JOIN User ON Activity.user_id = User.id 
        GROUP BY Activity.id
    ) AS Trackpoints GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names

print("Average trackpoints per user")
print(tabulate(rows, headers=columns))


Average trackpoints per user
  id    average
----  ---------
 000   670.871
 001   824.965
 002   924.801
 003   807.387
 004   761.858
 005   587.548
 006   801.125
 007   758.275
 008  1376
 009  1218.32
 010  1107.5
 011   451.756
 012   629.783
 013  1388.3
 014   905.936
 015   806.583
 016   690.222
 017   868.245
 018   540.795
 019   372.291
 020   704.638
 021   475
 022   900.695
 023   979.312
 024   860.408
 025   606.295
 026  1034.71
 027  2037
 028  1195.36
 029  1713.17
 030   871.352
 031   489.5
 032  1190.43
 033   735.25
 034   558.678
 035  1449.71
 036   803.75
 037   890.977
 038  1071.51
 039   756.51
 040   953.55
 041   797.416
 042   555.946
 043   795.094
 044   628.492
 045   752.125
 046   703.807
 047    65.8333
 048  1118
 049     0
 050   760.833
 051   231.3
 052   177
 053     0
 054  1093
 055   132.526
 056    39.7333
 057   127
 058    38.5
 059     0
 060     8
 061   140
 062   439.054
 063   709.857
 064   695
 065  1114.94
 066   889.375
 067  

##### Maximum

In [12]:
# Get the maximum number of trackpoints per user
query = """
SELECT User.id, COALESCE(maximum, 0) AS maximum 
FROM User 
LEFT JOIN (
    SELECT user_id, MAX(trackpoints) AS maximum 
    FROM (
        SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
        FROM TrackPoint t 
        INNER JOIN Activity ON t.activity_id = Activity.id 
        GROUP BY Activity.id
    ) AS Trackpoints GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Maximum trackpoints per user")
print(tabulate(rows, headers=columns))

Maximum trackpoints per user
  id    maximum
----  ---------
 000       2359
 001       2472
 002       2438
 003       2485
 004       2482
 005       2058
 006       2478
 007       2228
 008       2499
 009       2396
 010       1964
 011       2306
 012       2277
 013       2486
 014       2499
 015       2411
 016       2360
 017       2471
 018       2245
 019       2276
 020       2201
 021        475
 022       2421
 023       2215
 024       2377
 025       2464
 026       2396
 027       2480
 028       2477
 029       2440
 030       2454
 031       1254
 032       2358
 033       1205
 034       2448
 035       2493
 036       2299
 037       2421
 038       2500
 039       2464
 040       2434
 041       2491
 042       2350
 043       2005
 044       2206
 045       1932
 046       2457
 047        227
 048       1118
 049          0
 050       2484
 051        951
 052        177
 053          0
 054       1527
 055        670
 056        165
 057        510
 058       

##### Minimum

In [13]:
# Get the minimum number of trackpoints per user
query = """
SELECT User.id, COALESCE(minimum, 0) AS minimum 
FROM User LEFT JOIN (
    SELECT user_id, MIN(trackpoints) AS minimum
    FROM (
        SELECT Activity.user_id , COUNT(t.id) AS trackpoints 
        FROM TrackPoint t
        INNER JOIN Activity ON t.activity_id = Activity.id 
        GROUP BY Activity.id
    ) AS Trackpoints 
    GROUP BY user_id
) a ON a.user_id = User.id
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Minimum trackpoints per user")
print(tabulate(rows, headers=columns))

Minimum trackpoints per user
  id    minimum
----  ---------
 000          5
 001         33
 002          4
 003          3
 004          4
 005          5
 006         14
 007          6
 008        165
 009        134
 010        663
 011          3
 012         64
 013         13
 014          3
 015         22
 016          7
 017          4
 018          3
 019         11
 020         22
 021        475
 022         15
 023         31
 024          4
 025          3
 026          9
 027       1760
 028         32
 029          4
 030          9
 031         23
 032         62
 033         39
 034          4
 035        200
 036         13
 037         14
 038          8
 039          7
 040         39
 041          3
 042          4
 043         26
 044          3
 045         71
 046        175
 047          4
 048       1118
 049          0
 050          3
 051          5
 052        177
 053          0
 054        659
 055          5
 056         16
 057          9
 058       

### Task 3

Find the top 15 users with the highest number of activities.

In [None]:
query = """
SELECT RANK() OVER (
    ORDER BY COUNT(*) DESC
) Top, user_id, COUNT(*) as num_of_activities 
FROM Activity GROUP BY user_id 
ORDER BY COUNT(*) DESC LIMIT 15
"""
db.cursor.execute(query)
rows = db.cursor.fetchall()
columns = db.cursor.column_names
print("Top 15 users with the highest number of activities")
print(tabulate(rows, headers=columns))

### Task 4

Find all users who have taken a bus.

In [None]:
# Get the users who have taken the bus
users, columns = db.get_user_taken_bus()
print("All users who have taken a bus")
print(tabulate(users, headers=columns))

### Task 5

List the top 10 users by their amount of different transportation modes.

In [None]:
query = """
SELECT user_id, COUNT(DISTINCT(transportation_mode)) as DifferentTransportation 
FROM Activity 
GROUP BY user_id 
ORDER BY DifferentTransportation DESC LIMIT 10;"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
columns = program.cursor.column_names
print("Top 10 users by their amount of different transportation modes")
print(tabulate(users, headers=columns))

### Task 6

Find activities that are registered multiple times. You should find the query even
if it gives zero result.

In [None]:
query = """
SELECT a.id FROM Activity AS a WHERE EXISTS (
    SELECT b.id FROM Activity AS b 
    WHERE a.user_id = b.user_id 
    AND a.transportation_mode = b.transportation_mode 
    AND a.start_date_time = b.start_date_time 
    AND a.end_date_time = b.end_date_time 
    AND a.id != b.id)
"""
program.cursor.execute(query)
rows = program.cursor.fetchall()
print("Find activities that are registered multiple times:\n", rows)

### Task 7

a) Find the number of users that have started an activity in one day and ended the activity the next day.

In [None]:
num_users = db.get_num_user_activity_over_a_day()

print("Number of users with activity that ends the next day")
print(tabulate([["Number of users"], [num_users[0]]], headers="firstrow", tablefmt="fancy_grid"))

b) List the transportation mode, user id and duration for these activities.

In [None]:
users_info = db.get_user_activity_over_a_day()

table = [["User", "Transportation Mode", "Duration"]]
content = [[user[0], user[1], user[2]] for user in users_info]
table.extend(content)

print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

### Task 8
Find the number of users which have been close to each other in time and space. Close is defined as the same space (50 meters) and for the same half minute (30 seconds)

In [16]:
import numpy as np

#Finding min,max of lon and lat
min_max_lat_lon_query = """
SELECT MIN(lat), MAX(lat), MIN(lon), MAX(lat)
FROM TrackPoint
"""

program.cursor.execute(min_max_lat_lon_query)
min_max = program.cursor.fetchall()

min_lat = min_max[0][0] - 0.1
max_lat = min_max[0][1] + 0.1
min_lon = min_max[0][2] - 0.1
max_lon = min_max[0][3] + 0.1


lat = np.linspace(min_lat,max_lat, 5)
lon = np.linspace(min_lon,max_lon, 5)



    # Divinding trackpoints into smaller areas
query = """
WITH limitedTrackPoints AS (
    SELECT * 
    FROM TrackPoint 
    WHERE lat > %s 
    AND lat < %s 
    AND lon > %s 
    AND lon < %s)
SELECT DISTINCT *
FROM Activity a WHERE EXISTS ( 
    SELECT * FROM limitedTrackPoints t1 WHERE EXISTS(
        SELECT * FROM limitedTrackPoints t2 WHERE
        t2.lon < t1.lon + 0.0040
        AND t2.lat < t1.lat + 0.0040
        AND t1.date_time BETWEEN t2.date_time - INTERVAL '30' SECOND AND t2.date_time + INTERVAL '30' SECOND)
) GROUP BY a.user_id
"""


users = []
for i in range(len(lat)-1):
    for j in range(len(lon)-1):
        program.cursor.execute(query, (lat[i], lat[i + 1], lon[j], lon[j + 1]))
        rows = program.cursor.fetchall()

            # Checks if there is content in output
        if len(rows) > 0:
            # Adds userid to list
            for row in rows:
                users.append(row[1])
    # Removes duplicates
users = np.unique(users)

# Build a table
columns = program.cursor.column_names
print("Users who have been in the same area within 30 seconds")
print(tabulate(users, headers=columns))

ProgrammingError: 1055 (42000): Expression #1 of SELECT list is not in GROUP BY clause and contains nonaggregated column 'assignment2.a.id' which is not functionally dependent on columns in GROUP BY clause; this is incompatible with sql_mode=only_full_group_by

### Task 9

Find the top 15 users who have gained the most altitude meters.

In [None]:
query = "SELECT user_id, sum(activity_altitude)*0.304 as altitude_in_meters FROM Activity JOIN (SELECT activity_id, SUM(difference) AS activity_altitude FROM (SELECT activity_id, altitude - LAG(altitude) OVER (PARTITION BY activity_id ORDER BY date_time) AS difference FROM TrackPoint) AS altitude_difference WHERE difference > 0 GROUP BY activity_id) AS difference_table ON Activity.id = difference_table.activity_id GROUP BY user_id ORDER BY altitude_in_meters DESC LIMIT 15"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find the top 15 users who have gained the most altitude meters\n"
for i in rows:
    text += f"User: {i[0]}, Altitude in meters: {i[1]}\n"
print(text)

### Task 10

Find the users that have traveled the longest total distance in one day for each
transportation mode.

In [None]:
users = db.get_user_with_max_distance()

table = [["User", "Transportation mode", "Distance (km)"]]
table.extend(users)

print(tabulate(table, headers="firstrow", tablefmt="fancy_grid"))

### Task 12

Find all the users who have registered transportation_mode and their most used transportation_mode

Comment on implementation: Although with this dataset, does not need to join User, however in cases where user does not have labels, but still has some activities in Activity with transportation_mode then a join is necessary. For users that have the same number of activities tagged on multiple transportation mode, we've decided to take the first transportation mode in alphabetical order.

In [None]:
query = "SELECT user_id, transportation_mode FROM ( SELECT user_id, transportation_mode, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY COUNT(*) DESC, transportation_mode ASC) AS rownum FROM Activity JOIN User ON Activity.user_id = User.id WHERE transportation_mode IS NOT NULL AND User.has_labels=TRUE GROUP BY user_id, transportation_mode) AS activity_grouped WHERE rownum=1 ORDER BY user_id"
program.cursor.execute(query)
rows = program.cursor.fetchall()
text = "Find all the users who have registered transportation_mode and their most used transportation_mode\n"
for row in rows:
    text += f"User {row[0]}, Transportation mode: {row[1]}\n"
print(text)

### End

In [None]:
# Closing the connection
program.close_connection()