In [6]:
from DbConnector import DbConnector
#from part2 import Database
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os
import numpy as np
from datetime import datetime, timedelta
from haversine import haversine, Unit
from pprint import pprint

load_dotenv()

connector = DbConnector()

You are connected to the database: assignment3
-----------------------------------------------



### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [7]:
user_collection = connector.db['User']
user_count = user_collection.count_documents({})

activities_collection = connector.db['Activity']
activities_count = activities_collection.count_documents({})

trackpoints_collection = connector.db['TrackPoint']
trackpoints_count = trackpoints_collection.count_documents({})

result = [user_count, activities_count, trackpoints_count]

print("Number of Users, Activities and TrackPoints after the dataset has been inserted into the database")
print(tabulate([["Users", "Activities", "TrackPoints"], result], headers="firstrow"))

Number of Users, Activities and TrackPoints after the dataset has been inserted into the database
  Users    Activities    TrackPoints
-------  ------------  -------------
    182          7877        5355109


### Task 2
Find the average number of activities per user.

### Task 3

Find the top 20 users with the highest number of activities.

In [12]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        {"$group": {"_id":  "$user_id", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {
            "$setWindowFields": {
                "sortBy": {"count": -1},
                "output": {"Top": {"$rank": {}}},
            }
        },
        {"$limit": 20},
    ]
)

print(tabulate(rows, headers="keys"))

{'_id': '025', 'count': 715, 'Top': 1}
{'_id': '128', 'count': 519, 'Top': 2}
{'_id': '062', 'count': 406, 'Top': 3}
{'_id': '041', 'count': 399, 'Top': 4}
{'_id': '004', 'count': 346, 'Top': 5}
{'_id': '140', 'count': 345, 'Top': 6}
{'_id': '017', 'count': 265, 'Top': 7}
{'_id': '003', 'count': 261, 'Top': 8}
{'_id': '014', 'count': 236, 'Top': 9}
{'_id': '030', 'count': 210, 'Top': 10}
{'_id': '011', 'count': 201, 'Top': 11}
{'_id': '039', 'count': 198, 'Top': 12}
{'_id': '034', 'count': 180, 'Top': 13}
{'_id': '000', 'count': 155, 'Top': 14}
{'_id': '002', 'count': 146, 'Top': 15}
{'_id': '142', 'count': 138, 'Top': 16}
{'_id': '037', 'count': 129, 'Top': 17}
{'_id': '013', 'count': 119, 'Top': 18}
{'_id': '042', 'count': 110, 'Top': 19}
{'_id': '020', 'count': 94, 'Top': 20}



### Task 4
Find all users who have taken a taxi. 

### Task 5
Find all types of transportation modes and count how many activities that are tagged with these transportation mode labels. Do not count the rows where the mode is null.

In [11]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        { "$match": { "transportation_mode": { "$ne": None} } },
        { "$group": { "_id": "$transportation_mode", "count": { "$sum": 1 } } },
    ]
)

print(tabulate(rows, headers="keys"))

_id         count
--------  -------
train           2
airplane        3
bike          263
taxi           37
car           419
bus           199
subway        133
boat            1
run             1
walk          480


### Task 6
a) Find the year with the most activities.

We assume that an activity belongs to a year based on the start date time such that even though the end date time ends in another year, it still belongs to the previous year.

In [12]:
collection = connector.db["Activity"]

rows = collection.aggregate([
    { "$match": { "start_date_time": { "$ne": None} } },
    { "$project": { "year": { "$year": "$start_date_time" } } },
    { "$group": { "_id": "$year", "count": { "$sum": 1 } } },
    { "$sort": { "count": -1 } },
    { "$limit": 1 }]
)

print(tabulate(rows, headers="keys"))

  _id    count
-----  -------
 2009     3975


b) Is this also the year with most recorded hours?

In [13]:
collection = connector.db["Activity"]

rows = collection.aggregate([
    { "$match": { "start_date_time": { "$ne": None} } },
    { "$match": { "end_date_time": { "$ne": None} } },
    { "$addFields": {
        "duration_hours": {
            "$divide": [
                { "$subtract": [ "$end_date_time", "$start_date_time" ] },
                1000 * 60 * 60 
            ]
        },
        "year": { "$year": "$start_date_time" }
    }},
    { "$group": {
        "_id": { "year": "$year", },
        "total_duration": { "$sum": "$duration_hours" }
    },
    },
    { "$sort": { "total_duration": -1 } },
    { "$limit": 1 }]
)

print(tabulate(rows, headers="keys"))

_id               total_duration
--------------  ----------------
{'year': 2009}           8918.61


### Task 7
Find the total distance (in km) walked in 2008, by user with id=112.

In [14]:
collection = connector.db["TrackPoint"]

rows = collection.aggregate([
    { "$match": { "date_time": { "$gte": datetime(2008, 1, 1), "$lt": datetime(2009, 1, 1) } } },
    { "$lookup": {
        "from": "Activity", 
        "let": { "activity_id": "$activity_id" },
        "pipeline": [
            { "$match":
                { "$expr":
                    { "$eq": [ "$_id",  "$$activity_id" ] }
                }
            },
            { "$project": { "_id": 1, "user_id": 1, "transportation_mode": 1 } }
        ],
        "as": "activity"
    } },
    { "$unwind": "$activity" },
    { "$match": { "activity.user_id": "112" } },
    { "$match": { "activity.transportation_mode": "walk" } },
    { "$project": { 
        "lat": 1,
        "lon": 1,
        "activity_id": 1,
    } },
])

documents = list(rows)


Becaue MongoDB do not have the best geospatial support, We could have used the some [geospatial-queries](https://www.mongodb.com/docs/manual/geospatial-queries/). However, it is not optimal for this use case (because we have to calculate the distance and not just check if a point is inside a polygon), and solving it in python may be faster.

In [None]:
activities = {}

"""
activities: dict = {
    activity_id: dict = {
        distance: float,
        lat: float,
        lon: float,
    }
}
"""

for doc in documents:
    activity_id = doc["activity_id"]
    lat = float(doc["lat"])
    lon = float(doc["lon"])
    if activity_id not in activities:
        activities[activity_id] = {
            "distance": 0,
            "lat": lat,
            "lon": lon, 
        }
    else:
        activities[activity_id]["distance"] += haversine(
            (activities[activity_id]["lat"], activities[activity_id]["lon"]),
            (lat, lon),
            unit=Unit.KILOMETERS
        )
        activities[activity_id]["lat"] = lat
        activities[activity_id]["lon"] = lon

total_distance = 0

for activity_id in activities:
    total_distance += activities[activity_id]["distance"]

print(tabulate([["Total distance walked in 2008", total_distance]], headers="firstrow"))

### Task 8

 Find the top 20 users who have gained the most altitude meters

In [8]:
collection = connector.db["TrackPoint"]

rows = collection.aggregate(
    [
        {"$match": {"altitude": {"$ne": -777}}},
        {
            "$setWindowFields": {
                "partitionBy": "$activity_id",
                "sortBy": {"date_time": 1},
                "output": {
                    "previous_altitude": {"$shift": {"output": "$altitude", "by": -1}}
                },
            }
        },
        {
            "$project": {
                "activity_id": "$activity_id",
                "difference": {"$subtract": ["$altitude", "$previous_altitude"]},
            }
        },
        {"$match": {"difference": {"$gt": 0}}},
        {
            "$lookup": {
                "from": "Activity",
                "localField": "activity_id",
                "foreignField": "_id",
                "as": "activity",
            }
        },
        {
            "$group": {
                "_id": {
                    "user_id": "$activity.user_id",
                },
                "sum": {"$sum": "$difference"},
            }
        },
        {
            "$project": {
                "altitude_in_meters": {"$multiply": ["$sum", 0.304]},
            }
        },
        {"$sort": {"altitude_in_meters": -1}},
        {
            "$setWindowFields": {
                "sortBy": {"altitude_in_meters": -1},
                "output": {"Top": {"$rank": {}}},
            }
        },
        {"$limit": 20},
    ]
)

print(tabulate(rows, headers="keys"))

{'_id': 1, 'activity_id': 1, 'alt': '492'}
{'_id': 2, 'activity_id': 1, 'alt': '492'}
{'_id': 3, 'activity_id': 1, 'alt': '492'}
{'_id': 4, 'activity_id': 1, 'alt': '492'}
{'_id': 5, 'activity_id': 1, 'alt': '492'}
{'_id': 6, 'activity_id': 1, 'alt': '493'}
{'_id': 7, 'activity_id': 1, 'alt': '493'}
{'_id': 8, 'activity_id': 1, 'alt': '496'}
{'_id': 9, 'activity_id': 1, 'alt': '500'}
{'_id': 10, 'activity_id': 1, 'alt': '505'}
{'_id': 11, 'activity_id': 1, 'alt': '510'}
{'_id': 12, 'activity_id': 1, 'alt': '515'}
{'_id': 13, 'activity_id': 1, 'alt': '520'}
{'_id': 14, 'activity_id': 1, 'alt': '525'}
{'_id': 15, 'activity_id': 1, 'alt': '531'}
{'_id': 16, 'activity_id': 1, 'alt': '536'}
{'_id': 17, 'activity_id': 1, 'alt': '541'}
{'_id': 18, 'activity_id': 1, 'alt': '546'}
{'_id': 19, 'activity_id': 1, 'alt': '551'}
{'_id': 20, 'activity_id': 1, 'alt': '556'}


### Task 9
Find all users who have invalid activities, and the number of invalid activities per user

### Task 10
Find the users who have tracked an activity in the Forbidden City of Beijing.

### Task 11
Find all users who have registered transportation_mode and their most used transportation_mode. 

In [None]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        {"$match": {"transportation_mode": {"$ne": None}}},
        {
            "$group": {
                "_id": {
                    "user_id": "$user_id",
                    "transportation_mode": "$transportation_mode",
                },
                "count": {"$sum": 1},
            }
        },
        {
            "$lookup": {
                "from": "User",
                "let": {
                    "a_id": "$_id",
                    "a_has_labels": "$has_labels",
                },
                "pipeline": [{"$match": {"$expr": {"$eq": ["$$a_id", "$user_id"]}}}],
                "as": "saves_transportation",
            }
        },
        {"$match": {"saves_transportation": {"$ne": {"$a_has_labels": False}}}},
        {"$sort": {"_id.user_id": 1, "count": -1}},
        {
            "$group": {
                "_id": "$_id.user_id",
                "most_used_transportation_mode": {"$first": "$_id.transportation_mode"},
            }
        },
        {"$sort": {"_id": 1}},
    ]
)

print(tabulate(rows, headers="keys"))