### Initialize

In [2]:
%pip install pymongo
import os
import pymongo
from datetime import datetime
from program import Program

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\henriklarsen\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [10]:
# Instantiate the project-local Program helper (imported from program.py above).
# Every later cell goes through this object (program.db, insert_doc, query, ...).
# NOTE(review): the constructor presumably opens the MongoDB connection - confirm in program.py.
program = Program()

You are connected to the database: assignment3
-----------------------------------------------



In [14]:
# Start from a clean slate: remove the three collections this notebook populates.
for coll_name in ('User', 'Activity', 'TrackPoint'):
    program.drop_coll(coll_name)

### Create Collections

In [15]:
# (Re)create the three collections, in the same order as they were dropped.
for coll_name in ('User', 'Activity', 'TrackPoint'):
    program.create_coll(coll_name)

Created collection:  Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'User')
Created collection:  Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'Activity')
Created collection:  Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'TrackPoint')


### Insert Documents

In [16]:
# Populate the User, Activity and TrackPoint collections from the dataset on disk.
#
# Layout read by this cell:
#   ./dataset/labeled_ids.txt                 one user id per line (users with labels)
#   ./dataset/Data/<user_id>/labels.txt       tab-separated: start, end, transportation mode
#   ./dataset/Data/<user_id>/Trajectory/*     one file per activity, 6 header lines + trackpoints
#
# NOTE(review): program.insert_doc / insert_many_docs / query / delete_many_docs /
# field_query_unique are defined in program.py; they are presumably thin wrappers
# around the matching pymongo calls - confirm there.

# Timestamp layout shared by labels.txt (after "/" is replaced by "-") and the trajectory files
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
# Number of header lines at the top of every trajectory file
HEADER_LINES = 6
# Trajectory files with more trackpoints than this are skipped entirely
MAX_TRACKPOINTS = 2500

# Ids of the users that save their transportation mode
with open("./dataset/labeled_ids.txt", "r", encoding="UTF-8") as labeled_ids:
    user_label = [line.strip() for line in labeled_ids]

# _id values are assigned manually from these running counters
trackpoint_counter = 1
activity_counter = 1

# Loop through all the folders containing data for each user.
# Each folder corresponds to one user, so every folder yields one User document.
# The listing is sorted so the assigned _id values do not depend on the order
# in which the operating system happens to return directory entries.
directory_path = "./dataset/Data"
folders = sorted(os.listdir(directory_path))
for user_id in folders:
    # Computed once per user; decides both the User document and the file handling below
    has_labels = user_id in user_label

    user = {"_id": user_id, "has_labels": has_labels}
    program.insert_doc("User", user)

    if has_labels:
        activities = []

        # Insert the user's labeled activities
        with open(
            directory_path + "/" + user_id + "/labels.txt", "r", encoding="UTF-8"
        ) as labels:
            labels.readline()  # skip the header row
            for line in labels:
                # Process each line based on the labels.txt format:
                # start<TAB>end<TAB>transportation mode, dates written with "/"
                fields = line.strip().split("\t")
                start_date_time = datetime.strptime(fields[0].replace("/", "-"), DATE_FORMAT)
                end_date_time = datetime.strptime(fields[1].replace("/", "-"), DATE_FORMAT)
                transportation_mode = fields[2]

                activity = {
                    "_id": activity_counter,
                    "user_id": user_id,
                    "transportation_mode": transportation_mode,
                    "start_date_time": start_date_time,
                    "end_date_time": end_date_time,
                }

                activities.append(activity)
                activity_counter += 1

        # Insert all labeled activities as one batch.
        # A labels.txt that only contains the header yields no documents;
        # pymongo's insert_many rejects an empty list, so skip the call then.
        if activities:
            program.insert_many_docs("Activity", activities)

    # Retrieve files in the current user's Trajectory folder.
    # Each file corresponds to an activity.
    trajectory_path = directory_path + "/" + user_id + "/Trajectory/"
    files = sorted(os.listdir(trajectory_path))

    for file in files:
        # Read the file
        with open(trajectory_path + file, "r", encoding="UTF-8") as f:
            lines = f.readlines()

        # A file holding nothing but the header has no trackpoints, hence no
        # start/end time to derive and nothing to insert.
        if len(lines) <= HEADER_LINES:
            continue

        # Start time: timestamp of the first trackpoint
        # End time: timestamp of the last trackpoint
        # (columns 5 and 6 of a trackpoint row hold the date and the time)
        first_row = lines[HEADER_LINES].strip().split(",")
        last_row = lines[-1].strip().split(",")
        start_date_time = datetime.strptime(first_row[5] + " " + first_row[6], DATE_FORMAT)
        end_date_time = datetime.strptime(last_row[5] + " " + last_row[6], DATE_FORMAT)

        matching_activity_ids = []
        if has_labels:
            # For a labeled user, look up the activities from labels.txt whose start
            # and end time match this file exactly. labels.txt can contain several
            # entries with the same start and end time but different transportation
            # modes, so more than one activity may match.
            rows = program.query(
                "Activity",
                {
                    "user_id": user_id,
                    "start_date_time": start_date_time,
                    "end_date_time": end_date_time,
                },
            )
            matching_activity_ids = [row['_id'] for row in rows]

            # If there is no match, this file of trackpoints is skipped:
            # a labeled user is not allowed to have an activity with
            # transportation_mode = None, so no new Activity can be created for it.
            if not matching_activity_ids:
                continue

        # If the file includes more than MAX_TRACKPOINTS trackpoints (headers excluded)
        if len(lines) > MAX_TRACKPOINTS + HEADER_LINES:
            # For a labeled user, delete the activities this file belongs to:
            # they will not receive any trackpoints and are therefore not relevant.
            if has_labels:
                program.delete_many_docs('Activity', {
                    '_id': {
                        # "$in" is the MongoDB operator; any other key here would be
                        # treated as a sub-document equality match and delete nothing.
                        '$in': matching_activity_ids
                    }
                })
            # Thereafter, skip this file
            continue

        if has_labels:
            # Attach the trackpoints to the first matching activity returned by the
            # query (no explicit sort is applied). Any other match stays without
            # trackpoints and is removed by the cleanup at the end of this cell.
            activity_id = matching_activity_ids[0]
        else:
            # The user does not save transportation mode, so no activity exists yet:
            # add a new one with transportation_mode = None (NULL)
            activity = {
                "_id": activity_counter,
                "user_id": user_id,
                "transportation_mode": None,
                "start_date_time": start_date_time,
                "end_date_time": end_date_time,
            }
            program.insert_doc('Activity', activity)
            activity_id = activity_counter
            activity_counter += 1

        # Process each trackpoint row of the file, skipping the header lines
        trackpoints = []
        for line in lines[HEADER_LINES:]:
            fields = line.strip().split(",")
            # NOTE(review): lat, lon, altitude and date_days are stored exactly as
            # read, i.e. as strings. Convert with float() here if numeric queries
            # on these fields are needed.
            trackpoint = {
                '_id': trackpoint_counter,
                'activity_id': activity_id,
                'lat': fields[0],
                'lon': fields[1],
                'altitude': fields[3],
                'date_days': fields[4],
                'date_time': datetime.strptime(fields[5] + " " + fields[6], DATE_FORMAT),
            }
            trackpoints.append(trackpoint)
            trackpoint_counter += 1

        program.insert_many_docs('TrackPoint', trackpoints)

# Finally, clean up the activities which do not have any trackpoints:
# labels.txt can list activities for which there is no corresponding file
# in the user's Trajectory folder.
activity_ids = program.field_query_unique('TrackPoint', 'activity_id')
program.delete_many_docs('Activity', {
    '_id': {
        "$nin": activity_ids
    }
})

[1199]
[1516]
[1554]
[1580]
[2706]
[2735]
[2739]
[2740]
[2744]
[2748]
[2758]
[2759]
[2783]
[2800]
[2801]
[2802]
[2803]
[2804]
[2805]
[2806]
[2807]
[2809]
[2810]
[2811]
[2812]
[2813]
[2814]
[2815]
[2816]
[2817]
[2818]
[2819]
[2820]
[2821]
[2822]
[2823]
[2825]
[2826]
[2828]
[2829]
[2830]
[2831]
[2832]
[2833]
[2842]
[2843]
[2844]
[2845]
[2846]
[2847]
[2848]
[2849]
[2850]
[2851]
[2852]
[2854]
[2855]
[2856]
[2858, 2857]
[2859]
[2860]
[2861]
[2862]
[2865]
[2866]
[2869]
[2870]
[2871]
[2872]
[2873]
[2875]
[2877]
[2878]
[2879]
[2882]
[2883]
[2884]
[2885]
[2886]
[2888]
[2889]
[2890]
[2891]
[2892]
[2893]
[2894]
[2897]
[2898]
[2899]
[2900]
[2903]
[2905]
[2906]
[2907]
[2908]
[2909]
[2910]
[2911]
[2946]
[5643]
[6009]
[6010]
[6011]
[6014]
[6016]
[6022]
[6023]
[6024]
[6028]
[6031]
[6032]
[6033]
[6034]
[6035]
[6036]
[6064]
[6065]
[6067]
[6068]
[6069]
[6070]
[6098]
[6123]
[6124]
[6126]
[6128]
[6129]
[6131]
[6133]
[6134]
[6135]
[6137]
[6138]
[6143]
[6144]
[6145]
[6146]
[6147]
[6148]
[6149]
[6150]
[6151]


'program.delete_many_docs(\'Activity\', {\n    \'_id\': {\n        "$nin": activity_ids\n    }\n})'

In [19]:
# Sanity check: fetch every User document and show how many were inserted.
collection = program.db['User']
documents = collection.find()
users = [document for document in documents]
len(users)

182

In [18]:
# Sanity check: fetch every Activity document that survived the cleanup and count them.
collection = program.db['Activity']
documents = collection.find({})
activities = [document for document in documents]
len(activities)

7877

In [20]:
# Sanity check: fetch every TrackPoint document and count them.
# Note that this materialises the whole collection as a Python list in memory.
collection = program.db['TrackPoint']
documents = collection.find({})
trackpoints = [document for document in documents]
len(trackpoints)

5355109