### Initialize

In [123]:
import os
import pymongo
from program import Program

In [124]:
# Create the project-local Program helper; every database operation in the
# following cells (drop/create collections, inserts, queries, deletes) goes
# through this object.
# NOTE(review): presumably the constructor opens the MongoDB connection (the
# cell output reports the connected database) - confirm in program.py.
program = Program()

You are connected to the database: assignment3
-----------------------------------------------



In [125]:
# Start from a clean slate: drop the three collections this notebook populates,
# in the same order as they are (re)created below.
for coll_name in ('User', 'Activity', 'TrackPoint'):
    program.drop_coll(coll_name)

### Create Collections

In [126]:
# Create the three collections used by the import, one call per collection.
for coll_name in ('User', 'Activity', 'TrackPoint'):
    program.create_coll(coll_name)

Created collection:  Collection(Database(MongoClient(host=['tdt4225-35.idi.ntnu.no:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'User')
Created collection:  Collection(Database(MongoClient(host=['tdt4225-35.idi.ntnu.no:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'Activity')
Created collection:  Collection(Database(MongoClient(host=['tdt4225-35.idi.ntnu.no:27017'], document_class=dict, tz_aware=False, connect=True), 'assignment3'), 'TrackPoint')


### Insert Documents

In [127]:
# Import the dataset into the User, Activity and TrackPoint collections.
#
# Overview:
#   1. Read the ids of users that record a transportation mode (labeled users).
#   2. For every user folder: insert the user, and for labeled users insert
#      every activity listed in labels.txt.
#   3. For every trajectory file of the user: link it to an activity (an existing
#      labeled one, or a new unlabeled one) and insert its trackpoints.
#   4. Finally delete every activity that ended up without trackpoints.

# Files with more trackpoints than this are skipped entirely.
MAX_TRACKPOINTS = 2500
# Number of header lines at the top of every trajectory file.
PLT_HEADER_LINES = 6

# Ids of the users that record a transportation mode, one id per line.
user_label = []
with open("./dataset/labeled_ids.txt", "r", encoding="UTF-8") as labeled_ids:
    for line in labeled_ids:
        user_label.append(line.strip())
# Set copy for constant-time membership tests inside the loops below.
labeled_users = set(user_label)

# Document ids are generated here, starting at 1.
trackpoint_counter = 1
activity_counter = 1

# Loop through all the folders containing data for each user.
# Each folder corresponds to a user, so every folder name becomes a User document.
# os.listdir returns entries in arbitrary order; sorting keeps the generated
# activity/trackpoint ids reproducible between runs.
directory_path = "./dataset/Data"
folders = sorted(os.listdir(directory_path))
for user_id in folders:
    # has_labels is True when the user saves transportation mode.
    has_labels = user_id in labeled_users
    user = {"_id": user_id, "has_labels": has_labels}
    program.insert_doc("User", user)

    if has_labels:
        activities = []

        # Collect the user's labeled activities from labels.txt
        # (tab separated: start time, end time, transportation mode).
        with open(
            os.path.join(directory_path, user_id, "labels.txt"), "r", encoding="UTF-8"
        ) as labels:
            labels.readline()  # skip the header line
            for line in labels:
                fields = line.strip().split("\t")
                if len(fields) < 3:
                    # Blank or malformed line: nothing to import.
                    continue

                # labels.txt writes dates with "/", the trajectory files with "-";
                # normalise so the two can be matched by string equality later.
                start_time = fields[0].replace("/", "-")
                end_time = fields[1].replace("/", "-")
                transportation_mode = fields[2]

                activity = {
                    "_id": activity_counter,
                    "user_id": user_id,
                    "transportation_mode": transportation_mode,
                    "start_time": start_time,
                    "end_time": end_time,
                }

                activities.append(activity)
                activity_counter += 1

        # Insert all labeled activities as one batch; skipped when labels.txt has
        # no data rows, because a batch insert needs at least one document.
        if activities:
            program.insert_many_docs("Activity", activities)

    # Retrieve files in the current user's Trajectory folder.
    # Each file corresponds to one activity.
    trajectory_path = os.path.join(directory_path, user_id, "Trajectory")
    files = sorted(os.listdir(trajectory_path))

    for file in files:
        with open(os.path.join(trajectory_path, file), "r", encoding="UTF-8") as f:
            lines = f.readlines()

        # A file containing only header lines has no trackpoints to import.
        if len(lines) <= PLT_HEADER_LINES:
            continue

        # Start time: date + time of the first trackpoint.
        # End time: date + time of the last trackpoint.
        # The lines are stripped first so that the time field does not keep its
        # trailing newline, neither in the query nor in the stored activity.
        first_fields = lines[PLT_HEADER_LINES].strip().split(",")
        last_fields = lines[-1].strip().split(",")
        start_time = first_fields[5] + " " + first_fields[6]
        end_time = last_fields[5] + " " + last_fields[6]

        matching_ids = []
        if has_labels:
            # For labeled users the file must correspond to an activity from
            # labels.txt with exactly the same start and end time.
            rows = program.query(
                "Activity",
                {
                    "user_id": user_id,
                    "start_time": start_time,
                    "end_time": end_time,
                },
            )
            matching_ids = [row["_id"] for row in rows]

            # If there is no match, this file is skipped: a labeled user may not
            # have an activity without transportation mode, so no new Activity
            # document can be created for it.
            if not matching_ids:
                continue

        # Skip files with more than MAX_TRACKPOINTS trackpoints.
        if len(lines) > MAX_TRACKPOINTS + PLT_HEADER_LINES:
            if has_labels:
                # The matched activities will never receive trackpoints, so
                # remove them. "$in" is the MongoDB operator that matches any
                # _id contained in the list.
                program.delete_many_docs("Activity", {"_id": {"$in": matching_ids}})
            continue

        if has_labels:
            # If several activities share the same start and end time, the first
            # one returned by the query is used.
            activity_id = matching_ids[0]
        else:
            # The user does not save transportation mode, so no activity exists
            # yet: add one with transportation_mode = None (NULL).
            activity = {
                "_id": activity_counter,
                "user_id": user_id,
                "transportation_mode": None,
                "start_time": start_time,
                "end_time": end_time,
            }
            program.insert_doc("Activity", activity)
            activity_id = activity_counter
            activity_counter += 1

        # Process each data line of the file, skipping the header lines.
        # Comma separated fields used: 0 = lat, 1 = lon, 3 = altitude,
        # 4 = date as number of days, 5 = date, 6 = time (field 2 is not stored).
        # Values are stored as the strings read from the file.
        trackpoints = []
        for line in lines[PLT_HEADER_LINES:]:
            fields = line.strip().split(",")

            trackpoint = {
                "_id": trackpoint_counter,
                "activity_id": activity_id,
                "lat": fields[0],
                "lon": fields[1],
                "altitude": fields[3],
                "date_days": fields[4],
                "date_time": fields[5] + " " + fields[6],
            }
            trackpoints.append(trackpoint)
            trackpoint_counter += 1

        program.insert_many_docs("TrackPoint", trackpoints)

# Then clean up the activities which do not have any trackpoints: labels.txt can
# list activities that have no corresponding file in the user's Trajectory folder.
activity_ids = program.field_query_unique("TrackPoint", "activity_id")
program.delete_many_docs("Activity", {"_id": {"$nin": activity_ids}})

In [128]:
# Sanity check after the import: number of documents in the User collection.
# find() without a filter returns every document; they are materialised into a
# list and the list length is shown as the cell output.
collection = program.db['User']
documents = collection.find()
users = list(documents)
len(users)

182

In [129]:
# Sanity check after the import: number of documents left in the Activity
# collection (i.e. after the clean-up of activities without trackpoints).
# Note that this rebinds the name `activities`, which the import cell also uses.
collection = program.db['Activity']
documents = collection.find({})
activities = list(documents)
len(activities)

7877

In [130]:
# Sanity check after the import: number of documents in the TrackPoint collection.
# NOTE(review): this loads every trackpoint document into memory only to count
# them (several million according to the cell output). If program.db is a
# pymongo Database, collection.count_documents({}) would give the same number
# without materialising the documents - confirm nothing later needs the list.
# Note that this rebinds the name `trackpoints`, which the import cell also uses.
collection = program.db['TrackPoint']
documents = collection.find({})
trackpoints = list(documents)
len(trackpoints)

5355109