In [None]:
import pandas as pd
import os


In [None]:

# Load the trajectory table from "trajectory.csv" (path is relative to the working directory).
track_df = pd.read_csv("trajectory.csv")

# Show the loaded frame as the cell output.
track_df

In [None]:
# Normalise column types and derive a combined timestamp column.
track_df = track_df.assign(
    # User ids become zero-padded strings of at least three characters ("7" -> "007").
    user=lambda frame: frame['user'].astype(str).str.zfill(3),
    activity=lambda frame: frame['activity'].astype(str),
    # Remove newline characters from the time strings.
    time_string=lambda frame: frame["time_string"].str.replace('\n', ''),
    # `assign` applies its keyword arguments in order, so this lambda already
    # sees the cleaned `time_string` column.
    date_time=lambda frame: pd.to_datetime(
        frame["date_string"] + " " + frame["time_string"],
        format='%Y-%m-%d %H:%M:%S',
    ),
)
track_df

In [None]:

# Start from the directory the notebook is running in.
current_directory = os.getcwd()

# Climb two directory levels above the working directory (absolute path).
two_levels_up = os.path.join(current_directory, '..', '..')
parent_directory = os.path.abspath(two_levels_up)

# `data_path` is the dataset root; `data_path2` is its `Data` sub-folder.
data_path = os.path.join(parent_directory, 'dataset')
data_path2 = os.path.join(data_path, 'Data')

print(data_path, data_path2, sep="\n")



In [None]:
#Function to get all user_id and check if they have label

def get_user_id(data_path):
    """List the entries of ``<data_path>/Data`` and flag those named in ``labeled_ids.txt``.

    Parameters
    ----------
    data_path : str
        Folder that contains both the ``Data`` directory and the
        ``labeled_ids.txt`` file.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the ``Data`` directory, sorted by name, with the
        columns ``id`` (the entry name) and ``has_labels`` (``True`` when the
        name is the first whitespace-separated token of some line in
        ``labeled_ids.txt``).

    Raises
    ------
    OSError
        If the ``Data`` directory or ``labeled_ids.txt`` cannot be read.
    """
    # Sorted so the row order does not depend on the file system's listing order.
    user_ids = sorted(os.listdir(os.path.join(data_path, 'Data')))
    labeled_ids_path = os.path.join(data_path, 'labeled_ids.txt')

    # A set keeps repeated ids from duplicating rows in the result.
    labeled_ids = set()
    with open(labeled_ids_path, "r") as file:
        for line in file:
            tokens = line.split()
            # Blank lines (e.g. a trailing newline) carry no id and are skipped.
            if tokens:
                labeled_ids.add(tokens[0])

    user_df = pd.DataFrame({"id": user_ids})
    user_df["has_labels"] = user_df["id"].isin(labeled_ids)

    return user_df



In [None]:
# Print the user listing returned by get_user_id for the dataset root.
print(get_user_id(data_path))

In [None]:
#Checking if all trajectories is unique for each user

# Number of user folders scanned; they are addressed as "000", "001", ... below.
NUM_USERS = 182

# Maps each user folder name to the file names found in its "Trajectory" sub-folder.
lists_by_directory = {}

for i in range(NUM_USERS):
    directory_name = f"{i:03d}"  # zero-padded folder name, e.g. 7 -> "007"
    trajectory_dir = os.path.join(data_path2, directory_name, "Trajectory")
    lists_by_directory[directory_name] = os.listdir(trajectory_dir)

# Report, per user folder, whether its file names are all distinct.
# NOTE(review): each list comes from a single os.listdir call, so names within
# one folder are presumably always distinct - confirm whether the intended
# check was across users instead.
for directory_name, file_list in lists_by_directory.items():
    is_unique = len(file_list) == len(set(file_list))
    # The flag is True when the names ARE unique, so the message says "unique".
    print(f"Directory {directory_name}: Values are unique - {is_unique}")





In [None]:
#Function for getting the content of labels.txt for the users that have has_labels = True
def get_labels(path):
    """Collect the ``labels.txt`` rows of every labelled user into one DataFrame.

    Parameters
    ----------
    path : str
        Directory holding one sub-folder per user id; a user's labels are read
        from ``<path>/<id>/labels.txt``.

    Returns
    -------
    pandas.DataFrame
        Columns ``User``, ``Start_time``, ``End_time`` and
        ``Transportation_mode``, one row per label line, with a fresh
        0-based index. The frame is empty (same columns) when no labels file
        is found.

    Notes
    -----
    * The set of labelled users comes from ``get_user_id(data_path)``, i.e. it
      depends on the notebook-level ``data_path`` variable, not on ``path``.
    * The first line of each ``labels.txt`` is treated as a header and replaced
      by the column names above (assumes every file starts with a header
      line - TODO confirm against the dataset).
    """
    columns = ["User", "Start_time", "End_time", "Transportation_mode"]

    user_df = get_user_id(data_path)
    labeled_ids = user_df.loc[user_df["has_labels"], "id"]

    # Collect one frame per user and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies it on every iteration.
    frames = []
    for folder_id in labeled_ids:
        labels_file_path = os.path.join(path, folder_id, "labels.txt")

        # A user can be flagged as labelled without having a labels file here.
        if not os.path.isfile(labels_file_path):
            continue

        # header=0 together with names= swaps the file's own header line for
        # our column names instead of keeping it as a data row.
        labels_data = pd.read_csv(
            labels_file_path,
            sep="\t",
            header=0,
            names=columns[1:],
        )
        labels_data.insert(0, "User", folder_id)
        frames.append(labels_data)

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)


In [None]:
# Read the labels of all labelled users from the `Data` folder and show the result.
label_df = get_labels(data_path2)
label_df


In [None]:
#Finding the rows whose start/end time does not match the expected format

def _rows_failing_format(frame, column, time_format="%Y/%m/%d %H:%M:%S"):
    """Return the index labels of rows whose `column` value raises ValueError when parsed with `time_format`."""
    failing = []
    for row_label, raw_value in frame[column].items():
        try:
            pd.to_datetime(raw_value, format=time_format)
        except ValueError:
            failing.append(row_label)
    return failing


incorrect_start_times = _rows_failing_format(label_df, 'Start_time')
incorrect_end_times = _rows_failing_format(label_df, 'End_time')

print(incorrect_start_times)
print(incorrect_end_times)

In [None]:
#These are the rows whose Start_time failed the format check
#(rows listed only in incorrect_end_times are not selected here)
mask = label_df.index.isin(incorrect_start_times)
wrong_df = label_df[mask]
wrong_df

In [None]:
#Convert the time strings to datetime objects so they can be compared.
#Values that do not match the format become NaT (Not a Time).
label_time_format = "%Y/%m/%d %H:%M:%S"
for time_column in ('Start_time', 'End_time'):
    label_df[time_column] = pd.to_datetime(
        label_df[time_column],
        format=label_time_format,
        errors='coerce',
    )
label_df.info()

In [None]:
#Count the rows that repeat an earlier (User, Start_time, End_time) combination
label_df.duplicated(subset=['User', 'Start_time', 'End_time']).sum()


In [None]:
# Empty frame that only fixes the activity column layout.
activity_columns = ["id", "User", "transportation_mode", "start_date_time", "end_date_time"]
activity_df = pd.DataFrame(columns=activity_columns)
activity_df


In [None]:

def find_start_end(track_df):
    """Compute the first and last timestamp of every (user, activity) pair.

    Parameters
    ----------
    track_df : pandas.DataFrame
        Must contain the columns ``user``, ``activity`` and ``date_time``.

    Returns
    -------
    pandas.DataFrame
        One row per (user, activity) group, sorted by the group keys, with the
        columns ``User``, ``Activity``, ``Start_time`` (minimum ``date_time``
        of the group) and ``End_time`` (maximum ``date_time`` of the group).
    """
    # A single vectorised aggregation replaces the Python-level loop over groups.
    result_df = (
        track_df
        .groupby(['user', 'activity'])['date_time']
        .agg(Start_time='min', End_time='max')
        .reset_index()
        .rename(columns={'user': 'User', 'activity': 'Activity'})
    )

    return result_df

# Per (user, activity) start and end timestamps of the trajectory data.
start_end_df = find_start_end(track_df)
start_end_df


In [None]:
# NOTE(review): this bare expression is not the cell's last statement, so under
# Jupyter's default display settings it produces no output.
label_df

#check for duplicates in label_df of user, start_time and end_time
# NOTE(review): same expression as the earlier duplicate check on label_df.
label_df.duplicated(subset=['User', 'Start_time', 'End_time']).sum()


In [None]:
#Outer-merge start_end_df with label_df on User, Start_time and End_time.
#Rows without a match on the other side are kept, with NaN in the columns
#that come from the other frame (e.g. Transportation_mode, Activity).
merged_df = pd.merge(start_end_df, label_df, on=["User", "Start_time", "End_time"], how="outer")
merged_df

In [None]:
# Keep only the rows that have an Activity value.
merged_df = merged_df.dropna(subset=["Activity"])

# Independent copy of the filtered frame.
activity_df = merged_df.copy()

In [None]:
# Rows of merged_df that carry a transportation mode.
has_transport_mode = merged_df["Transportation_mode"].notna()
with_transport = merged_df.loc[has_transport_mode]
with_transport

In [None]:
# Show the activity frame as the cell output.
activity_df