In [6]:
import os
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

from constants import *


In [7]:
def user_list_to_df(
    user_id: str,
    user_data_list: List[str],
    split: str = "train",
    segment_len: Optional[int] = None,
) -> pd.DataFrame:
    """Build a one-row-per-command DataFrame for a single user.

    Parameters
    ----------
    user_id:
        Identifier written to the ``user`` column of every row.
    user_data_list:
        Commands in their original order; each becomes one row.
    split:
        Label written to the ``split`` column. Defaults to ``"train"``.
    segment_len:
        Number of consecutive rows that share one ``segment_id``. When
        omitted, the ``SEGMENT_LEN`` constant is used.

    Returns
    -------
    pd.DataFrame
        Columns ``cmd`` (categorical), ``user``, ``split`` and
        ``segment_id`` (0-based). If the row count is not a multiple of the
        segment length, the trailing rows form a final, shorter segment.

    Raises
    ------
    ValueError
        If the segment length is not positive.
    """
    if segment_len is None:
        segment_len = SEGMENT_LEN
    if segment_len <= 0:
        raise ValueError(f"segment_len must be positive, got {segment_len}")

    df_user = pd.DataFrame({"cmd": user_data_list})
    df_user["user"] = user_id
    df_user["split"] = split
    # Floor-dividing the row position always yields exactly one id per row,
    # so a row count that is not a multiple of the segment length no longer
    # triggers a length-mismatch error on assignment.
    df_user["segment_id"] = np.arange(len(df_user)) // segment_len
    df_user["cmd"] = df_user["cmd"].astype("category")
    return df_user


def load_user_data(user_id: str, file_path: os.PathLike) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read one user's command file and split it into train and test frames.

    The file is read as text, one command per line, with surrounding
    whitespace stripped. The first ``TRAIN_HEADER_COUNT`` lines form the
    training portion; every remaining line forms the test portion.

    Parameters
    ----------
    user_id:
        Identifier passed through to ``user_list_to_df`` for both frames.
    file_path:
        Path of the user's command file.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(train_segments, test_segments)`` as built by ``user_list_to_df``,
        with the ``split`` column set to ``"train"`` and ``"test"``
        respectively.
    """
    with open(file_path, "r") as fp:
        # Iterate the handle directly instead of readlines() + a second pass.
        user_data = [line.strip() for line in fp]

    # The leading TRAIN_HEADER_COUNT commands are training data; the rest is test.
    train_user_data = user_data[:TRAIN_HEADER_COUNT]
    test_user_data = user_data[TRAIN_HEADER_COUNT:]

    train_segments = user_list_to_df(user_id, train_user_data)
    test_segments = user_list_to_df(user_id, test_user_data)
    # Both frames come back labelled "train" from user_list_to_df; relabel the
    # held-out rows so the split column is accurate.
    test_segments["split"] = "test"

    return train_segments, test_segments

In [9]:
# Load every user's data and accumulate the commands seen anywhere.

global_cmds = set()

for user_count in range(40):
    user_id = f"User{user_count}"
    user_file_path = os.path.join("data", user_id)
    df_user0_train, df_user0_test = load_user_data(user_id, user_file_path)

    # Distinct commands in this user's train and test frames.
    user_cmd_set_train = set(df_user0_train["cmd"].unique())
    user_cmd_set_test = set(df_user0_test["cmd"].unique())

    # Merge both frames' commands, then fold them into the running global set.
    user_cmd_set = user_cmd_set_test | user_cmd_set_train
    global_cmds = global_cmds | user_cmd_set

In [14]:
# Persist the command vocabulary as a sorted, newline-separated text file.
# Note: `global_cmds` is rebound to the sorted list here. Sorting happens
# before the file is opened so a failing sort cannot leave a truncated file.
global_cmds = sorted(global_cmds)
with open("global_cmds.txt", "w") as fp:
    # write(), not writelines(): the payload is one string, and writelines()
    # would iterate over it character by character.
    fp.write("\n".join(global_cmds))