In [28]:
import os
import numpy as np
import pandas as pd
from typing import Tuple, List, Dict
from constants import *


In [2]:
def user_list_to_df(user_id: str, user_data_list: List[str]):
    df_user = pd.DataFrame({
        "cmd": user_data_list
    })
    df_user["user"] = user_id
    df_user["split"] = "train"
    df_user["segment_id"] = np.repeat(range(0, int(len(user_data_list) / SEGMENT_LEN)), SEGMENT_LEN)
    df_user["cmd"] = df_user["cmd"].astype("category")
    return df_user


def load_user_data(user_id: str, file_path: os.PathLike) -> Tuple[pd.DataFrame, pd.DataFrame]:
    with open(file_path, "r") as fp:
        user_data = fp.readlines()
        user_data = [s.strip() for s in user_data]

    # get training data:
    train_user_data = user_data[:TRAIN_HEADER_COUNT]
    test_user_data = user_data[TRAIN_HEADER_COUNT:]

    # convert to dataframes
    train_segments = user_list_to_df(user_id, train_user_data)
    test_segments = user_list_to_df(user_id, test_user_data)

    return train_segments, test_segments

In [8]:
''' load all user data '''

global_cmds = set()


data_list = []

for user_count in range(40):
    user_id = f"User{user_count}"
    user_file_path = os.path.join("data", user_id)

    df_user0_train, df_user0_test = load_user_data(user_id, user_file_path)

    data_list.append(df_user0_train)
    data_list.append(df_user0_test)
    ''' all commands '''

    user_cmd_set_train = set(df_user0_train["cmd"].unique())
    user_cmd_set_test = set(df_user0_test["cmd"].unique())

    user_cmd_set = user_cmd_set_test.union(user_cmd_set_train)

    global_cmds = global_cmds.union(user_cmd_set)

In [4]:
with open("global_cmds.txt", "w") as fp:
    global_cmds = list(global_cmds)
    global_cmds.sort()
    fp.writelines("\n".join(global_cmds))

In [15]:
# TODO - calculate command TFIDF scores

all_df = pd.concat(data_list, axis=0)
all_segments_df = all_df.groupby(["user","segment_id"]).transform(lambda x: ' '.join(x)).drop_duplicates()
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = all_segments_df["cmd"].tolist()


vectorizer = TfidfVectorizer()



In [16]:
X = vectorizer.fit_transform(corpus)

In [17]:
X

<3836x747 sparse matrix of type '<class 'numpy.float64'>'
	with 117287 stored elements in Compressed Sparse Row format>

In [18]:
tfidf_tokens = vectorizer.get_feature_names_out()

In [27]:
tfidf_tokens.tolist()

['20',
 '4dwm',
 '5650',
 '5836',
 '7105',
 '7956',
 '8117',
 '8708',
 '9term',
 'aa',
 'aacdec',
 'acc',
 'acroread',
 'add',
 'addrinfo',
 'agen',
 'agrep',
 'aiffplay',
 'ama',
 'ama_volu',
 'apanel',
 'appdefpa',
 'ar',
 'arch',
 'archie',
 'arp',
 'as',
 'as1',
 'ascii',
 'augment_',
 'aupanel',
 'auplay',
 'aus',
 'autoconf',
 'awk',
 'backtalk',
 'backup',
 'basename',
 'bash',
 'bb_rep',
 'bb_rep_f',
 'bb_rep_n',
 'bb_rep_t',
 'bc',
 'bdftopcf',
 'be',
 'bibtex',
 'bindkey',
 'binhex',
 'bison',
 'blossom4',
 'bo_rep',
 'bo_rep_c',
 'bo_rep_f',
 'bo_rep_t',
 'bo_table',
 'bo_top',
 'bo_type',
 'btbuild',
 'btcreat',
 'byte_rev',
 'cal',
 'calendar',
 'call_fil',
 'calldd',
 'calprog',
 'cancel',
 'capture',
 'cat',
 'catalog',
 'catdoc',
 'cc',
 'cc1',
 'cdc',
 'cdec',
 'cfe',
 'cgiparse',
 'chat',
 'chec',
 'chkconfi',
 'chmod',
 'chown',
 'ci',
 'cled',
 'cled_jct',
 'clock',
 'cmex',
 'cmp',
 'co',
 'col',
 'comm',
 'comma',
 'compress',
 'concorde',
 'config',
 'configur',


In [21]:
vectorizer.transform([corpus[0]])

<1x747 sparse matrix of type '<class 'numpy.float64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [22]:
import pickle

In [24]:
with open("tfidf_vectorizer.pkl", "wb") as fp:
    pickle.dump(vectorizer, fp)

In [25]:
X.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07485462, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])