In [1]:
from DbConnector import DbConnector
#from part2 import Database
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os
import numpy as np
from datetime import datetime, timedelta
from haversine import haversine, Unit
from pprint import pprint

load_dotenv()

connector = DbConnector()

You are connected to the database: assignment3
-----------------------------------------------



### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)

In [49]:
user_collection = connector.db['User']
user_count = user_collection.count_documents({})

activities_collection = connector.db['Activity']
activities_count = activities_collection.count_documents({})

trackpoints_collection = connector.db['TrackPoint']
trackpoints_count = trackpoints_collection.count_documents({})

result = [user_count, activities_count, trackpoints_count]

print("Number of Users, Activities and TrackPoints after the dataset has been inserted into the database")
print(tabulate([["Users", "Activities", "TrackPoints"], result], headers="firstrow"))

Number of Users, Activities and TrackPoints after the dataset has been inserted into the database
  Users    Activities    TrackPoints
-------  ------------  -------------
    182          7877        5355109


### Task 2
Find the average number of activities per user.

{'average avtivites': 51.82236842105263}


### Task 3

Find the top 20 users with the highest number of activities.

In [17]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        {"$group": {"_id": "$user_id", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}},
        {
            "$setWindowFields": {
                "sortBy": {"count": -1},
                "output": {"Top": {"$rank": {}}},
            }
        },
        {"$project": {"_id": 0, "Top": "$Top", "user_id": "$_id", "count": "$count"}},
        {"$limit": 20},
    ]
)

print(tabulate(rows, headers="keys"))

  Top    user_id    count
-----  ---------  -------
    1        025      715
    2        128      519
    3        062      406
    4        041      399
    5        004      346
    6        140      345
    7        017      265
    8        003      261
    9        014      236
   10        030      210
   11        011      201
   12        039      198
   13        034      180
   14        000      155
   15        002      146
   16        142      138
   17        037      129
   18        013      119
   19        042      110
   20        020       94


### Task 4
Find all users who have taken a taxi. 

In [18]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        {"$match": {"transportation_mode" : "taxi"}},
        {"$group": {"_id":  "$user_id"}},
        {"$sort": {"_id": 1}},

    ]
)

for row in rows:
    print(row)

{'_id': '010'}
{'_id': '058'}
{'_id': '062'}
{'_id': '078'}
{'_id': '080'}
{'_id': '085'}
{'_id': '098'}
{'_id': '111'}
{'_id': '128'}
{'_id': '163'}


### Task 5
Find all types of transportation modes and count how many activities that are tagged with these transportation mode labels. Do not count the rows where the mode is null.

In [21]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        { "$match": { "transportation_mode": { "$ne": None} } },
        { "$group": { "_id": "$transportation_mode", "count": { "$sum": 1 } } },
    ]
)

print(tabulate(rows, headers="keys"))

_id         count
--------  -------
walk          480
bike          263
airplane        3
car           419
train           2
subway        133
taxi           37
run             1
bus           199
boat            1


### Task 6
a) Find the year with the most activities.

We assume that an activity belongs to a year based on the start date time such that even though the end date time ends in another year, it still belongs to the previous year.

In [22]:
collection = connector.db["Activity"]

rows = collection.aggregate([
    { "$match": { "start_date_time": { "$ne": None} } },
    { "$project": { "year": { "$year": "$start_date_time" } } },
    { "$group": { "_id": "$year", "count": { "$sum": 1 } } },
    { "$sort": { "count": -1 } },
    { "$limit": 1 }]
)

print(tabulate(rows, headers="keys"))

  _id    count
-----  -------
 2009     3975


b) Is this also the year with most recorded hours?

In [25]:
collection = connector.db["Activity"]

rows = collection.aggregate([
    { "$match": { "start_date_time": { "$ne": None} } },
    { "$match": { "end_date_time": { "$ne": None} } },
    { "$addFields": {
        "duration_hours": {
            "$divide": [
                { "$subtract": [ "$end_date_time", "$start_date_time" ] },
                1000 * 60 * 60 
            ]
        },
        "year": { "$year": "$start_date_time" }
    }},
    { "$group": {
        "_id": { "year": "$year", },
        "total_duration": { "$sum": "$duration_hours" }
    },
    },
    { "$sort": { "total_duration": -1 } },
    { "$limit": 1 }]
)

print(tabulate(rows, headers="keys"))

_id               total_duration
--------------  ----------------
{'year': 2009}           8918.61


### Task 7
Find the total distance (in km) walked in 2008, by user with id=112.

In [27]:
collection = connector.db["TrackPoint"]

rows = collection.aggregate([
    { "$match": { "date_time": { "$gte": datetime(2008, 1, 1), "$lt": datetime(2009, 1, 1) } } },
    { "$lookup": {
        "from": "Activity", 
        "let": { "activity_id": "$activity_id" },
        "pipeline": [
            { "$match":
                { "$expr":
                    { "$eq": [ "$_id",  "$$activity_id" ] }
                }
            },
            { "$project": { "_id": 1, "user_id": 1, "transportation_mode": 1 } }
        ],
        "as": "activity"
    } },
    { "$unwind": "$activity" },
    { "$match": { "activity.user_id": "112" } },
    { "$match": { "activity.transportation_mode": "walk" } },
    { "$project": { 
        "lat": 1,
        "lon": 1,
        "activity_id": 1,
    } },
])

documents = list(rows)


Becaue MongoDB do not have the best geospatial support, We could have used the some [geospatial-queries](https://www.mongodb.com/docs/manual/geospatial-queries/). However, it is not optimal for this use case (because we have to calculate the distance and not just check if a point is inside a polygon), and solving it in python may be faster.

In [29]:
activities = {}

"""
activities: dict = {
    activity_id: dict = {
        distance: float,
        lat: float,
        lon: float,
    }
}
"""

for doc in documents:
    activity_id = doc["activity_id"]
    lat = float(doc["lat"])
    lon = float(doc["lon"])
    if activity_id not in activities:
        activities[activity_id] = {
            "distance": 0,
            "lat": lat,
            "lon": lon, 
        }
    else:
        activities[activity_id]["distance"] += haversine(
            (activities[activity_id]["lat"], activities[activity_id]["lon"]),
            (lat, lon),
            unit=Unit.KILOMETERS
        )
        activities[activity_id]["lat"] = lat
        activities[activity_id]["lon"] = lon

total_distance = 0

for activity_id in activities:
    total_distance += activities[activity_id]["distance"]

print(tabulate([["Total distance walked in 2008", total_distance]], headers="firstrow"))

Total distance walked in 2008    115.47465961508007
-------------------------------  --------------------


### Task 8

 Find the top 20 users who have gained the most altitude meters

In [16]:
collection = connector.db["TrackPoint"]

rows = collection.aggregate(
    [
        {"$match": {"altitude": {"$ne": -777}}},
        {
            "$setWindowFields": {
                "partitionBy": "$activity_id",
                "sortBy": {"date_time": 1},
                "output": {
                    "previous_altitude": {"$shift": {"output": "$altitude", "by": -1}}
                },
            }
        },
        {
            "$project": {
                "activity_id": "$activity_id",
                "difference": {"$subtract": ["$altitude", "$previous_altitude"]},
            }
        },
        {"$match": {"difference": {"$gt": 0}}},
        {
            "$lookup": {
                "from": "Activity",
                "localField": "activity_id",
                "foreignField": "_id",
                "as": "activity",
            }
        },
        {
            "$group": {
                "_id":"$activity.user_id",
                "sum": {"$sum": "$difference"},
            }
        },
        {
            "$project": {
                "altitude_in_meters": {"$multiply": ["$sum", 0.304]},
            }
        },
        {"$sort": {"altitude_in_meters": -1}},
        {
            "$setWindowFields": {
                "sortBy": {"altitude_in_meters": -1},
                "output": {"Top": {"$rank": {}}},
            }
        },
        {"$limit": 20},
        {"$project": {
            "_id": 0,
            "Top": 1,
            "user_id": "$_id",
            "altitude_in_meters": 1
        }}
    ]
)

print(tabulate(rows, headers="keys"))

  altitude_in_meters    Top  user_id
--------------------  -----  ---------
            331165        1  ['004']
            240127        2  ['041']
            233050        3  ['003']
            175219        4  ['030']
            152571        5  ['128']
            146319        6  ['039']
            121186        7  ['000']
            114761        8  ['002']
            108862        9  ['025']
             98960.5     10  ['037']
             94589.9     11  ['140']
             74559       12  ['062']
             62402.1     13  ['017']
             61278.5     14  ['034']
             61159       15  ['042']
             60636.1     16  ['007']
             60342.5     17  ['022']
             59991.1     18  ['014']
             53876.4     19  ['028']
             50455.5     20  ['013']


### Task 9
Find all users who have invalid activities, and the number of invalid activities per user

### Task 10
Find the users who have tracked an activity in the Forbidden City of Beijing.

In [34]:
collection = connector.db["TrackPoint"]

rows = collection.aggregate(
    [
        {
         "$project": {
                "activity_id": "$activity_id",
                "distance": {
                    "$sqrt": {
                      "$add": [
                        { "$pow": [{ "$subtract": [{ "$toDouble": "$lat"}, 39.917610]}, 2 ]},
                        { "$pow": [{ "$subtract": [{ "$toDouble": "$lon"}, 116.397028]}, 2 ]}
                        ]
                        }
                    }
                }
        },

        { "$match": { "distance": { "$lt": 0.0045 } }},
        {
            "$group": {
                "_id": "$activity_id",
            }
        }

    ]
)

ids = [row["_id"] for row in rows]

collection = connector.db["Activity"]
activities = collection.find({"_id": {"$in": ids}})

users = {}
for activity in activities:
    id = activity["user_id"]
    if id not in users:
        users[id] = {"user_id": id}

for user in users:
    print(users[user])

{'user_id': '004'}
{'user_id': '018'}
{'user_id': '019'}
{'user_id': '062'}
{'user_id': '067'}
{'user_id': '131'}
{'user_id': '140'}
{'user_id': '168'}


### Task 11
Find all users who have registered transportation_mode and their most used transportation_mode. 

In [33]:
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        {"$match": {"transportation_mode": {"$ne": None}}},
        {
            "$group": {
                "_id": {
                    "user_id": "$user_id",
                    "transportation_mode": "$transportation_mode",
                },
                "count": {"$sum": 1},
            }
        },
        {
            "$lookup": {
                "from": "User",
                "let": {
                    "a_id": "$_id",
                    "a_has_labels": "$has_labels",
                },
                "pipeline": [{"$match": {"$expr": {"$eq": ["$$a_id", "$user_id"]}}}],
                "as": "saves_transportation",
            }
        },
        {"$match": {"saves_transportation": {"$ne": {"$a_has_labels": False}}}},
        {"$sort": {"_id.user_id": 1, "count": -1, "_id.transportation_mode": 1}},
        {
            "$group": {
                "_id": "$_id.user_id",
                "most_used_transportation_mode": {"$first": "$_id.transportation_mode"},
            }
        },
        {"$sort": {"_id": 1}},
    ]
)

print(tabulate(rows, headers="keys"))

  _id  most_used_transportation_mode
-----  -------------------------------
  010  taxi
  020  bike
  021  walk
  052  bus
  056  bike
  058  car
  060  walk
  062  bus
  064  bike
  065  bike
  067  walk
  069  bike
  073  walk
  075  walk
  076  car
  078  walk
  080  bike
  081  bike
  082  walk
  084  walk
  085  walk
  086  car
  087  walk
  089  car
  091  bus
  092  bus
  097  bike
  098  taxi
  101  car
  102  bike
  107  walk
  108  walk
  111  taxi
  112  walk
  115  car
  117  walk
  125  bike
  126  bike
  128  car
  136  walk
  138  bike
  139  bike
  144  walk
  153  walk
  161  walk
  163  bike
  167  bike
  175  bus
