In [4]:
import pandas as pd
import os
from concurrent.futures import ThreadPoolExecutor

In [6]:
data_path = "C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/"
data_path2 = "C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data"

### TrackPoint Table

In [7]:
MAX_ALLOWED_ROWS = 2506  # Maximum allowed rows in a file

def process_file(file_path, user, activity):
    data = []
    with open(file_path) as f:
        # Check the number of rows in the file
        line_count = sum(1 for _ in f)
        if line_count > MAX_ALLOWED_ROWS:
            print(f"Skipping {file_path} - File is too long")
            return data

        # Reset the file pointer to read the file again
        f.seek(0)
        for _ in range(6):
            next(f)
        for line in f:
            lat, lon, _, altitude, date_days, date_string, time_string = line.split(',')
            data.append({
                'user': user,
                'activity': activity,
                'lat': lat,
                'lon': lon,
                'altitude': altitude,
                'date_days': date_days,
                'date_string': date_string,
                'time_string': time_string
            })
    return data

In [8]:
def get_trajectories(data_path):
    data_frames = []
    user_folders = os.listdir(data_path)
    user_folders.sort()

    with ThreadPoolExecutor() as executor:
        futures = []
        for user in user_folders:
            activity_dir_path = os.path.join(data_path, user).replace("\\", "/")
            if not os.path.isdir(activity_dir_path):
                continue

            for trajectory_dir in os.listdir(activity_dir_path):
                trajectory_dir_path = os.path.join(activity_dir_path, trajectory_dir).replace("\\", "/")

                if not os.path.isdir(trajectory_dir_path):
                    continue

                for activity_file in os.listdir(trajectory_dir_path):
                    if not activity_file.endswith('.plt'):
                        continue

                    file_path = os.path.join(trajectory_dir_path, activity_file).replace("\\", "/")
                    futures.append(executor.submit(process_file, file_path, user, activity_file[:-4]))

        for future in futures:
            data_frames.extend(future.result())

    final_df = pd.DataFrame(data_frames)
    return final_df


In [9]:
track_df = get_trajectories(data_path2)
track_df

Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data/000/Trajectory/20090405051938.plt - File is too long
Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data/000/Trajectory/20090403011657.plt - File is too long
Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data/000/Trajectory/20090419005226.plt - File is too long
Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data/000/Trajectory/20090413213935.plt - File is too long
Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU/5år/Store, distribuerte datamengder/Store_distribuerte_datamengder/dataset/Data/000/Trajectory/20090425005731.plt - File is too long
Skipping C:/Users/aminp/OneDrive - NTNU/Dokumenter/NTNU

Unnamed: 0,user,activity,lat,lon,altitude,date_days,date_string,time_string
0,000,20081023025304,39.984702,116.318417,492,39744.1201851852,2008-10-23,02:53:04\n
1,000,20081023025304,39.984683,116.31845,492,39744.1202546296,2008-10-23,02:53:10\n
2,000,20081023025304,39.984686,116.318417,492,39744.1203125,2008-10-23,02:53:15\n
3,000,20081023025304,39.984688,116.318385,492,39744.1203703704,2008-10-23,02:53:20\n
4,000,20081023025304,39.984655,116.318263,492,39744.1204282407,2008-10-23,02:53:25\n
...,...,...,...,...,...,...,...,...
9681751,181,20080314025755,40.9148666666667,111.7105,3802.49343832021,39521.1527314815,2008-03-14,03:39:56\n
9681752,181,20080314025755,40.9142666666667,111.710333333333,3795.93175853018,39521.1536689815,2008-03-14,03:41:17\n
9681753,181,20080314025755,40.9124666666667,111.710666666667,3795.93175853018,39521.1548842593,2008-03-14,03:43:02\n
9681754,181,20080314025755,40.9115166666667,111.711316666667,3779.52755905512,39521.1551851852,2008-03-14,03:43:28\n


In [10]:
#Transformations and fixing types of columns 
def transform_data(df):
    df['user'] = df['user'].astype(str).str.zfill(3)
    df["activity"] = df['activity'].astype(str)
    df["lat"] = df['lat'].astype(float)
    df["lon"] = df['lon'].astype(float)
    df["altitude"] = df['altitude'].astype(float)
    df["date_days"] = df['date_days'].astype(float)
    df["time_string"] = df["time_string"].str.replace('\n', '')
    df["date_time"] = df["date_string"] + " " + df["time_string"]
    df['date_time'] = pd.to_datetime(df['date_time'], format='%Y-%m-%d %H:%M:%S')
    results_df = df.drop(['date_string', 'time_string'], axis=1)
    return results_df

In [11]:
track_df = transform_data(track_df)
track_df

Unnamed: 0,user,activity,lat,lon,altitude,date_days,date_time
0,000,20081023025304,39.984702,116.318417,492.000000,39744.120185,2008-10-23 02:53:04
1,000,20081023025304,39.984683,116.318450,492.000000,39744.120255,2008-10-23 02:53:10
2,000,20081023025304,39.984686,116.318417,492.000000,39744.120313,2008-10-23 02:53:15
3,000,20081023025304,39.984688,116.318385,492.000000,39744.120370,2008-10-23 02:53:20
4,000,20081023025304,39.984655,116.318263,492.000000,39744.120428,2008-10-23 02:53:25
...,...,...,...,...,...,...,...
9681751,181,20080314025755,40.914867,111.710500,3802.493438,39521.152731,2008-03-14 03:39:56
9681752,181,20080314025755,40.914267,111.710333,3795.931759,39521.153669,2008-03-14 03:41:17
9681753,181,20080314025755,40.912467,111.710667,3795.931759,39521.154884,2008-03-14 03:43:02
9681754,181,20080314025755,40.911517,111.711317,3779.527559,39521.155185,2008-03-14 03:43:28


In [41]:
#Check if any users has any activities with more than 2506 rows 

def check_rows_count(df):
    results = df.groupby(['user', 'activity']).size().reset_index(name='counts').query('counts > 2500')
    return results
    

In [42]:
check = check_rows_count(track_df)
check

Unnamed: 0,user,activity,counts


### User Table

In [43]:
#Need to find all users that got removed i.e had more than 2500 rows 

def find_removed_users(df): 
    all_users = [str(i).zfill(3) for i in range(183)]
    unique_users = set(df["user"].astype(str))
    removed_users = [user for user in all_users if user not in unique_users]
    return removed_users

In [44]:
removed_users = find_removed_users(track_df)
removed_users

['049', '120', '137', '143', '148', '149', '156', '160', '177', '182']

In [39]:
#Function to get all user_id and check if they have label

def get_user_id(data_path, df): 
    directories_data = os.listdir(f'{data_path}/Data')
    labeled_ids_path = f'{data_path}/labeled_ids.txt'
    
    with open(labeled_ids_path, "r") as file:
        lines = file.readlines()
    ids = [line.strip().split()[0] for line in lines]

    df_id = pd.DataFrame({"id": directories_data})
    labeled_ids_df = pd.DataFrame({"LabelID":ids})

    merged_df = pd.merge(df_id, labeled_ids_df, left_on="id", right_on="LabelID", how="left")
    merged_df["has_labels"] = merged_df["LabelID"].notna()
    merged_df.drop("LabelID", axis=1, inplace=True)

    removed_users = find_removed_users(df)
    
    # Remove users that were identified as missing (more than 2500 rows)
    merged_df = merged_df[~merged_df["id"].isin(removed_users)]

    return merged_df



In [40]:
get_user_id(data_path, track_df)

Unnamed: 0,id,has_labels
0,000,False
1,001,False
2,002,False
3,003,False
4,004,False
...,...,...
176,176,False
178,178,False
179,179,True
180,180,False


### Check for unique activities

Great to know if for all users their activities file_name is unique, so it could be used as a key later when inserting to the database 

In [34]:
#Checking if all activities is unique for each user

def check_unique_activities(data_path):
    lists_by_directory = {}

    # Format the directory name to have leading zeros if needed
    for i in range(182):
        directory_name = f"{i:03d}"

        files_in_directory = os.listdir(os.path.join(data_path, directory_name, "Trajectory"))
        lists_by_directory[directory_name] = files_in_directory

    # Check if all values in each list are unique
    results = {}
    for directory_name, file_list in lists_by_directory.items():
        is_unique = len(file_list) == len(set(file_list))
        results[directory_name] = is_unique
    
    return results


In [35]:

unique_activities_results = check_unique_activities(data_path2)

for directory_name, is_unique in unique_activities_results.items():
    print(f"Directory {directory_name}: All values are unique - {is_unique}")


Directory 000: All values are unique - True
Directory 001: All values are unique - True
Directory 002: All values are unique - True
Directory 003: All values are unique - True
Directory 004: All values are unique - True
Directory 005: All values are unique - True
Directory 006: All values are unique - True
Directory 007: All values are unique - True
Directory 008: All values are unique - True
Directory 009: All values are unique - True
Directory 010: All values are unique - True
Directory 011: All values are unique - True
Directory 012: All values are unique - True
Directory 013: All values are unique - True
Directory 014: All values are unique - True
Directory 015: All values are unique - True
Directory 016: All values are unique - True
Directory 017: All values are unique - True
Directory 018: All values are unique - True
Directory 019: All values are unique - True
Directory 020: All values are unique - True
Directory 021: All values are unique - True
Directory 022: All values are un

### Label table

Extracting the data for those users that have label and storing it as a dataframe. This dataframe will be used to check if the user have a valid transportation mode when comparing with the trajectories later

In [44]:
#Function for getting the content in labels.txt for those user which have has_label = True
def get_labels(path): 
    labels_df = pd.DataFrame(columns=["User","Start_time", "End_time", "Transportation_mode"])
    user_df = get_user_id(data_path)
    folders_with_labels = user_df[user_df["has_labels"]]
    
    for index, row in folders_with_labels.iterrows(): 
        folder_id = row["id"]
    
        labels_file_path = os.path.join(path, folder_id, "labels.txt").replace("\\", "/")
        

        if os.path.isfile(labels_file_path): 
            labels_data = pd.read_csv(labels_file_path, sep = "\t", names=['Start_time', 'End_time', 'Transportation_mode'], skiprows=1)
            labels_data["User"] = folder_id
            labels_df = pd.concat([labels_df, labels_data], ignore_index=True)
            

    #Formating from string to datetime object so comparisons can be made. Those invalid rows have NaT (Not a Time)
    labels_df['Start_time'] = pd.to_datetime(labels_df['Start_time'], format="%Y/%m/%d %H:%M:%S", errors='coerce')
    labels_df['End_time'] = pd.to_datetime(labels_df['End_time'], format="%Y/%m/%d %H:%M:%S", errors='coerce')

    return labels_df


In [45]:
label_df = get_labels(data_path2)
label_df


Unnamed: 0,User,Start_time,End_time,Transportation_mode
0,010,2007-06-26 11:32:29,2007-06-26 11:40:29,bus
1,010,2008-03-28 14:52:54,2008-03-28 15:59:59,train
2,010,2008-03-28 16:00:00,2008-03-28 22:02:00,train
3,010,2008-03-29 01:27:50,2008-03-29 15:59:59,train
4,010,2008-03-29 16:00:00,2008-03-30 15:59:59,train
...,...,...,...,...
14713,179,2008-11-17 06:59:58,2008-11-17 07:06:16,bus
14714,179,2008-11-17 07:06:16,2008-11-17 07:14:32,walk
14715,179,2008-11-29 01:58:05,2008-11-29 02:01:39,bus
14716,179,2008-11-29 02:01:39,2008-11-29 02:07:57,walk


In [46]:
#Function that checks weather or not some rows have invalid formating on start_time and end_time: 
def find_invalid_time_rows(df): 
    mask = df["Start_time"].isna() | df["End_time"].isna()
    rows_with_nat = df[mask]
    return rows_with_nat

In [47]:
invalid_time_rows = find_invalid_time_rows(label_df)
invalid_time_rows


Unnamed: 0,User,Start_time,End_time,Transportation_mode


In [48]:
def find_duplicate_rows(df, columns_to_check):
    duplicate_rows = df.duplicated(subset=columns_to_check, keep=False)
    duplicate_rows_df = df[duplicate_rows]
    return duplicate_rows_df


In [49]:
#IMPORTANT! There is on user that have tracked an activity at the exact same time but have tracked two different transportation mode
columns_to_check = ["User", "Start_time", "End_time"]
duplicate_rows_df = find_duplicate_rows(label_df, columns_to_check)
duplicate_rows_df

Unnamed: 0,User,Start_time,End_time,Transportation_mode
585,20,2011-11-02 02:04:29,2011-11-02 02:09:31,walk
586,20,2011-11-02 02:04:29,2011-11-02 02:09:31,bike


### Activity Table
Generating the activity table. Need to use both the Track point table and the dataframe with the labels to check for valid start/end times

In [50]:
#This is a function that finds the start_time and end_time for each activity grouped by user 
#based on the trajectories in the given activity

def find_start_end(trajectory_df): 
    grouped = trajectory_df.groupby(["user", "activity"])

    users = []
    activities = []
    start_times = []
    end_times = []

    for (user, activity), grouped_df in grouped: 
        earliest_time = grouped_df["date_time"].min()
        latest_time = grouped_df["date_time"].max()

        users.append(user)
        activities.append(activity)
        start_times.append(earliest_time)
        end_times.append(latest_time)

    result_df = pd.DataFrame({
        'User': users,
        'Activity': activities,
        'Start_time': start_times,
        'End_time': end_times
    })

    return result_df


In [51]:
start_end_df = find_start_end(track_df)
start_end_df

Unnamed: 0,User,Activity,Start_time,End_time
0,000,20081023025304,2008-10-23 02:53:04,2008-10-23 11:11:12
1,000,20081024020959,2008-10-24 02:09:59,2008-10-24 02:47:06
2,000,20081026134407,2008-10-26 13:44:07,2008-10-26 15:04:07
3,000,20081027115449,2008-10-27 11:54:49,2008-10-27 12:05:54
4,000,20081028003826,2008-10-28 00:38:26,2008-10-28 05:03:42
...,...,...,...,...
16043,181,20080128042855,2008-01-28 04:28:55,2008-01-28 07:14:35
16044,181,20080206025637,2008-02-06 02:56:37,2008-02-06 04:03:12
16045,181,20080216080442,2008-02-16 08:04:42,2008-02-16 09:00:06
16046,181,20080217010131,2008-02-17 01:01:31,2008-02-17 10:45:35


In [52]:
#Function that merges two given dataframes on the specified columns and method
def merge_dataframes(df1, df2, on_columns, how): 
    merged_df = pd.merge(df1, df2, on = on_columns, how = how)
    return merged_df

In [53]:
#Want to merge the dataframe with those user that have label with trajectories for each user and activity only
#if it have exact match on start and end time. 

columns = ["User", "Start_time", "End_time"]
activity_df = merge_dataframes(start_end_df, label_df, columns, how = "left" )
activity_df

Unnamed: 0,User,Activity,Start_time,End_time,Transportation_mode
0,000,20081023025304,2008-10-23 02:53:04,2008-10-23 11:11:12,
1,000,20081024020959,2008-10-24 02:09:59,2008-10-24 02:47:06,
2,000,20081026134407,2008-10-26 13:44:07,2008-10-26 15:04:07,
3,000,20081027115449,2008-10-27 11:54:49,2008-10-27 12:05:54,
4,000,20081028003826,2008-10-28 00:38:26,2008-10-28 05:03:42,
...,...,...,...,...,...
16044,181,20080128042855,2008-01-28 04:28:55,2008-01-28 07:14:35,
16045,181,20080206025637,2008-02-06 02:56:37,2008-02-06 04:03:12,
16046,181,20080216080442,2008-02-16 08:04:42,2008-02-16 09:00:06,
16047,181,20080217010131,2008-02-17 01:01:31,2008-02-17 10:45:35,


In [40]:
activity_df["Transportation_mode"].notna().sum()


1539