In [None]:
from DbConnector import DbConnector
#from part2 import Database
from tabulate import tabulate

# Set up the program
from dotenv import load_dotenv
import os
import numpy as np
from datetime import datetime, timedelta
from haversine import haversine, Unit
from pprint import pprint

# Load variables from a .env file into the process environment
# (presumably DbConnector reads its connection settings from them — TODO confirm).
load_dotenv()

# Shared database handle; every task cell below accesses collections through `connector.db`.
connector = DbConnector()

You are connected to the database: assignment3
-----------------------------------------------



### Task 1

How many users, activities and trackpoints are there in the dataset (after it is inserted into the database)?

In [2]:
# Task 1: count the documents stored in each of the three collections.
database = connector.db

user_collection = database['User']
activities_collection = database['Activity']
trackpoints_collection = database['TrackPoint']

# An empty filter ({}) matches every document in the collection.
user_count = user_collection.count_documents({})
activities_count = activities_collection.count_documents({})
trackpoints_count = trackpoints_collection.count_documents({})

header = ["Users", "Activities", "TrackPoints"]
result = [user_count, activities_count, trackpoints_count]

print("Number of Users, Activities and TrackPoints after the dataset has been inserted into the database")
# headers="firstrow" makes tabulate use `header` as the column titles.
print(tabulate([header, result], headers="firstrow"))

Number of Users, Activities and TrackPoints after the dataset has been inserted into the database
  Users    Activities    TrackPoints
-------  ------------  -------------
    182          7877        5355109


### Task 2
Find the average number of activities per user.

In [4]:
# Task 2: average number of activities per user.
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        # One document per user_id found in Activity, holding that user's
        # activity count. Users without any activity do not appear here,
        # so they are not part of the average.
        {"$group": {"_id": "$user_id", "count": {"$sum": 1}}},
        # Collapse all per-user counts into a single document with their mean.
        {"$group": {"_id": None, "average activities": {"$avg": "$count"}}},
        # Drop the constant grouping key so only the average is returned.
        # (The projection used to also exclude "average_activities", a name
        # that never matched the "average activities" field produced above.)
        {"$project": {"_id": 0}},
    ]
)

for row in rows:
    print(row)






{'average activities': 51.82236842105263}


### Task 3

Find the top 20 users with the highest number of activities.

In [None]:
# Task 3: the 20 users with the most activities.
collection = connector.db["Activity"]

# One document per user with that user's number of activities.
count_per_user = {"$group": {"_id": "$user_id", "count": {"$sum": 1}}}

# Most active users first.
most_active_first = {"$sort": {"count": -1}}

# Add a "Top" field holding each user's rank by activity count.
add_rank = {
    "$setWindowFields": {
        "sortBy": {"count": -1},
        "output": {"Top": {"$rank": {}}},
    }
}

keep_first_20 = {"$limit": 20}

rows = collection.aggregate(
    [count_per_user, most_active_first, add_rank, keep_first_20]
)

print(tabulate(rows, headers="keys"))

### Task 4
Find all users who have taken a taxi. 

In [4]:
# Task 4: ids of the users that have at least one activity labelled "taxi".
collection = connector.db["Activity"]

taxi_users_pipeline = [
    # Keep only the taxi activities ...
    {"$match": {"transportation_mode": "taxi"}},
    # ... reduce them to one document per user ...
    {"$group": {"_id": "$user_id"}},
    # ... and list the users in ascending id order.
    {"$sort": {"_id": 1}},
]

rows = collection.aggregate(taxi_users_pipeline)

print(tabulate(rows, headers="keys"))

  _id
-----
  010
  058
  062
  078
  080
  085
  098
  111
  128
  163


### Task 5
Find all types of transportation modes and count how many activities are tagged with each of these transportation mode labels. Do not count the rows where the mode is null.

In [None]:
# Task 5: number of activities per transportation mode, ignoring unlabelled activities.
collection = connector.db["Activity"]

# Activities whose transportation_mode is null are left out of the count.
only_labelled = {"$match": {"transportation_mode": {"$ne": None}}}

# One document per transportation mode with the number of activities using it.
count_per_mode = {"$group": {"_id": "$transportation_mode", "count": {"$sum": 1}}}

rows = collection.aggregate([only_labelled, count_per_mode])

print(tabulate(rows, headers="keys"))

### Task 6
a) Find the year with the most activities.

We assume that an activity belongs to the year of its start date time: even if the activity ends in the following year, it is still counted in the year in which it started.

In [None]:
# Task 6a: the year in which the most activities started.
collection = connector.db["Activity"]

busiest_year_pipeline = [
    # Activities without a start time cannot be assigned to a year.
    {"$match": {"start_date_time": {"$ne": None}}},
    # Reduce every activity to the year of its start time.
    {"$project": {"year": {"$year": "$start_date_time"}}},
    # Count the activities of each year.
    {"$group": {"_id": "$year", "count": {"$sum": 1}}},
    # Keep only the year with the highest count.
    {"$sort": {"count": -1}},
    {"$limit": 1},
]

rows = collection.aggregate(busiest_year_pipeline)

print(tabulate(rows, headers="keys"))

b) Is this also the year with most recorded hours?

In [None]:
# Task 6b: the year with the most recorded hours.
collection = connector.db["Activity"]

# Subtracting two dates in MongoDB yields milliseconds.
MS_PER_HOUR = 1000 * 60 * 60

# Both timestamps are needed to compute a duration.
has_both_timestamps = {
    "$match": {
        "start_date_time": {"$ne": None},
        "end_date_time": {"$ne": None},
    }
}

# Attach each activity's duration in hours and the year it started in.
add_duration_and_year = {
    "$addFields": {
        "duration_hours": {
            "$divide": [
                {"$subtract": ["$end_date_time", "$start_date_time"]},
                MS_PER_HOUR,
            ]
        },
        "year": {"$year": "$start_date_time"},
    }
}

# Sum the recorded hours of each year.
hours_per_year = {
    "$group": {
        "_id": {"year": "$year"},
        "total_duration": {"$sum": "$duration_hours"},
    }
}

rows = collection.aggregate(
    [
        has_both_timestamps,
        add_duration_and_year,
        hours_per_year,
        # Keep only the year with the largest total.
        {"$sort": {"total_duration": -1}},
        {"$limit": 1},
    ]
)

print(tabulate(rows, headers="keys"))

### Task 7
Find the total distance (in km) walked in 2008, by user with id=112.

In [None]:
# Task 7, step 1: fetch the trackpoints user 112 recorded while walking in 2008.
#
# The matching activities are resolved first, so that only their trackpoints
# are read from TrackPoint. Joining every 2008 trackpoint against Activity
# with $lookup selects the same trackpoints, but performs one join per
# trackpoint of every user.
walk_activity_ids = [
    activity["_id"]
    for activity in connector.db["Activity"].find(
        {"user_id": "112", "transportation_mode": "walk"},
        {"_id": 1},
    )
]

collection = connector.db["TrackPoint"]

rows = collection.aggregate([
    {"$match": {
        "activity_id": {"$in": walk_activity_ids},
        "date_time": {"$gte": datetime(2008, 1, 1), "$lt": datetime(2009, 1, 1)},
    }},
    # The distance is later summed over consecutive trackpoints, so the
    # points of each activity must arrive in chronological order; without
    # an explicit sort the cursor order is unspecified.
    {"$sort": {"activity_id": 1, "date_time": 1, "_id": 1}},
    {"$project": {
        "lat": 1,
        "lon": 1,
        "activity_id": 1,
    }},
])

# Materialise the cursor: the next cell iterates over `documents`.
documents = list(rows)


MongoDB offers some [geospatial queries](https://www.mongodb.com/docs/manual/geospatial-queries/) that we could have used. However, they are not optimal for this use case (we have to calculate the distance travelled, not just check whether a point is inside a polygon), so solving it in Python may be faster.

In [None]:
# Task 7, step 2: sum the great-circle distance between consecutive
# trackpoints of each activity. `documents` is the list built by the
# previous cell; the result is only meaningful if it lists the trackpoints
# of each activity in chronological order.
#
# activities maps
#     activity_id -> {
#         "distance": kilometres accumulated so far,
#         "lat": latitude of the last trackpoint seen,
#         "lon": longitude of the last trackpoint seen,
#     }
activities = {}

for doc in documents:
    activity_id = doc["activity_id"]
    lat = float(doc["lat"])
    lon = float(doc["lon"])

    previous = activities.get(activity_id)
    if previous is None:
        # First trackpoint of this activity: nothing to measure yet.
        activities[activity_id] = {"distance": 0, "lat": lat, "lon": lon}
        continue

    previous["distance"] += haversine(
        (previous["lat"], previous["lon"]),
        (lat, lon),
        unit=Unit.KILOMETERS,
    )
    previous["lat"] = lat
    previous["lon"] = lon

total_distance = sum(entry["distance"] for entry in activities.values())

# Pass the label as an explicit header: with headers="firstrow" the only row
# (label and value) was consumed as the header, leaving a table without data.
print(tabulate([[total_distance]], headers=["Total distance walked in 2008"]))

### Task 8

 Find the top 20 users who have gained the most altitude meters

In [None]:
# Task 8: the 20 users who have gained the most altitude.
collection = connector.db["TrackPoint"]

# Factor applied to the summed altitude differences to express them in
# meters. Presumably altitudes are stored in feet (1 ft = 0.3048 m) — TODO
# confirm against the data set description.
FEET_TO_METERS = 0.3048

rows = collection.aggregate(
    [
        # -777 marks a trackpoint without a valid altitude.
        {"$match": {"altitude": {"$ne": -777}}},
        # Give every trackpoint the altitude of the previous trackpoint of
        # the same activity (null for the first trackpoint of an activity).
        {
            "$setWindowFields": {
                "partitionBy": "$activity_id",
                "sortBy": {"date_time": 1},
                "output": {
                    "previous_altitude": {"$shift": {"output": "$altitude", "by": -1}}
                },
            }
        },
        {
            "$project": {
                "activity_id": "$activity_id",
                "difference": {"$subtract": ["$altitude", "$previous_altitude"]},
            }
        },
        # Only climbs count as gained altitude; a null difference (first
        # trackpoint of an activity) is dropped here as well.
        {"$match": {"difference": {"$gt": 0}}},
        # Sum per activity first, so the join below runs once per activity
        # instead of once per trackpoint.
        {"$group": {"_id": "$activity_id", "gain": {"$sum": "$difference"}}},
        {
            "$lookup": {
                "from": "Activity",
                "localField": "_id",
                "foreignField": "_id",
                "as": "activity",
            }
        },
        # $lookup yields an array; unwind it so the user id below is a plain
        # value rather than a one-element array.
        {"$unwind": "$activity"},
        {"$group": {"_id": "$activity.user_id", "sum": {"$sum": "$gain"}}},
        {
            "$project": {
                "altitude_in_meters": {"$multiply": ["$sum", FEET_TO_METERS]},
            }
        },
        {
            "$setWindowFields": {
                "sortBy": {"altitude_in_meters": -1},
                "output": {"Top": {"$rank": {}}},
            }
        },
        # Sort right before $limit so the top 20 does not depend on the
        # order in which the window stage happens to emit its documents.
        {"$sort": {"altitude_in_meters": -1}},
        {"$limit": 20},
    ],
    # The per-activity window sorts the whole TrackPoint collection, which
    # can exceed the in-memory limit of a pipeline stage.
    allowDiskUse=True,
)

print(tabulate(rows, headers="keys"))

### Task 9
Find all users who have invalid activities, and the number of invalid activities per user

In [2]:
# Task 9: users with invalid activities and the number of invalid activities
# per user. An activity is treated as invalid when two consecutive
# trackpoints of it are more than MAX_GAP_MS apart.
collection = connector.db["TrackPoint"]

MAX_GAP_MS = 5 * 60 * 1000  # 300000 ms

rows = collection.aggregate(
    [
        # Pair every trackpoint with the one preceding it in _id order.
        # NOTE(review): this assumes _id order follows recording order within
        # an activity — confirm against how the trackpoints were inserted.
        {
            "$setWindowFields": {
                "sortBy": {"_id": 1},
                "output": {
                    "previous_activity_id": {"$shift": {"output": "$activity_id", "by": -1}},
                    "previous_date_time": {"$shift": {"output": "$date_time", "by": -1}},
                },
            }
        },
        {
            "$project": {
                "_id": "$_id",
                "activity_id": "$activity_id",
                "previous_activity_id": "$previous_activity_id",
                # Subtracting two dates yields milliseconds.
                "difference": {
                    "$subtract": [
                        {"$toDate": "$date_time"},
                        {"$toDate": "$previous_date_time"},
                    ]
                },
            }
        },
        # Only compare trackpoints that belong to the same activity.
        {"$match": {"$expr": {"$eq": ["$activity_id", "$previous_activity_id"]}}},
        {"$match": {"difference": {"$gt": MAX_GAP_MS}}},
        # One document per invalid activity.
        {"$group": {"_id": "$activity_id"}},
    ]
)

ids = [row["_id"] for row in rows]

collection = connector.db["Activity"]
# Only the owner of each invalid activity is needed; sorting keeps the
# printed table in a stable order.
activities = collection.find({"_id": {"$in": ids}}, {"user_id": 1}).sort("user_id", 1)

users = {}
for activity in activities:
    # Named user_id rather than `id`, which would shadow the builtin for
    # every cell executed afterwards.
    user_id = activity["user_id"]
    entry = users.setdefault(user_id, {"id": user_id, "invalid_activities": 0})
    entry["invalid_activities"] += 1

print(tabulate(users.values(), headers="keys"))


  id    invalid_activities
----  --------------------
 000                   101
 001                    45
 002                    98
 003                   179
 004                   219
 005                    44
 006                    17
 007                    30
 008                    16
 009                    31
 011                    32
 012                    43
 013                    29
 014                   118
 015                    46
 016                    20
 017                   129
 018                    27
 019                    31
 020                    11
 021                     1
 022                    55
 023                    11
 024                    27
 025                   263
 026                    18
 027                     2
 028                    36
 029                    25
 030                   112
 031                     3
 032                    12
 033                     2
 034                    88
 035                    23
 

### Task 10
Find the users who have tracked an activity in the Forbidden City of Beijing.

In [None]:
# Task 10: users who have tracked an activity in the Forbidden City of Beijing.
collection = connector.db["TrackPoint"]

# Centre of the Forbidden City and the search radius around it. The radius is
# expressed in degrees because the distance below is a plain Euclidean
# distance on (lat, lon), i.e. an approximation rather than a distance in meters.
FORBIDDEN_CITY_LAT = 39.917610
FORBIDDEN_CITY_LON = 116.397028
RADIUS_DEGREES = 0.0045

# Calculates the euclidean distance between the user's coordinates and the centre of the Forbidden City of Beijing
rows = collection.aggregate(
    [
        {
            "$project": {
                "activity_id": "$activity_id",
                "distance": {
                    "$sqrt": {
                        "$add": [
                            {"$pow": [{"$subtract": [{"$toDouble": "$lat"}, FORBIDDEN_CITY_LAT]}, 2]},
                            {"$pow": [{"$subtract": [{"$toDouble": "$lon"}, FORBIDDEN_CITY_LON]}, 2]},
                        ]
                    }
                },
            }
        },
        {"$match": {"distance": {"$lt": RADIUS_DEGREES}}},
        # One document per activity with at least one trackpoint inside the radius.
        {"$group": {"_id": "$activity_id"}},
    ]
)

ids = [row["_id"] for row in rows]

collection = connector.db["Activity"]
# Only the owner of each matching activity is needed.
activities = collection.find({"_id": {"$in": ids}}, {"user_id": 1})

users = {}
for activity in activities:
    # Named user_id rather than `id`, which would shadow the builtin for
    # every cell executed afterwards.
    user_id = activity["user_id"]
    if user_id not in users:
        users[user_id] = {"user_id": user_id}

for user in users.values():
    print(user)

### Task 11
Find all users who have registered transportation_mode and their most used transportation_mode. 

In [None]:
# Task 11: every user who has registered a transportation_mode, together
# with the mode that user has used most often.
collection = connector.db["Activity"]

rows = collection.aggregate(
    [
        # Only labelled activities take part, so every user that reaches the
        # later stages has registered at least one transportation mode. No
        # join against the User collection is needed to establish that.
        {"$match": {"transportation_mode": {"$ne": None}}},
        # Number of activities per (user, transportation mode) pair.
        {
            "$group": {
                "_id": {
                    "user_id": "$user_id",
                    "transportation_mode": "$transportation_mode",
                },
                "count": {"$sum": 1},
            }
        },
        # Within each user, put the most used mode first. The mode name is
        # the final sort key so that ties are resolved the same way on every run.
        {"$sort": {"_id.user_id": 1, "count": -1, "_id.transportation_mode": 1}},
        # $first picks the top entry of each user thanks to the sort above.
        {
            "$group": {
                "_id": "$_id.user_id",
                "most_used_transportation_mode": {"$first": "$_id.transportation_mode"},
            }
        },
        {"$sort": {"_id": 1}},
    ]
)

print(tabulate(rows, headers="keys"))