# Living-Off-The-Land Command Detection Using Active Learning

```
@Conference{Ongun2021,
  author     = {Ongun, Talha and Stokes, Jack W. and Or, Jonathan Bar and Tian, Ke and Tajaddodianfar, Farid and Neil, Joshua and Seifert, Christian and Oprea, Alina and Platt, John C.},
  booktitle  = {Proceedings of the 24th International Symposium on Research in Attacks, Intrusions and Defenses},
  title      = {Living-Off-The-Land Command Detection Using Active Learning},
  year       = {2021},
  address    = {New York, NY, USA},
  month      = {10},
  pages      = {442–455},
  publisher  = {Association for Computing Machinery},
  series     = {RAID '21},
  abstract   = {In recent years, enterprises have been targeted by advanced adversaries who leverage creative ways to infiltrate their systems and move laterally to gain access to critical data. One increasingly common evasive method is to hide the malicious activity behind a benign program by using tools that are already installed on user computers. These programs are usually part of the operating system distribution or another user-installed binary, therefore this type of attack is called “Living-Off-The-Land”. Detecting these attacks is challenging, as adversaries may not create malicious files on the victim computers and anti-virus scans fail to detect them. We propose the design of an Active Learning framework called LOLAL for detecting Living-Off-the-Land attacks that iteratively selects a set of uncertain and anomalous samples for labeling by a human analyst. LOLAL is specifically designed to work well when a limited number of labeled samples are available for training machine learning models to detect attacks. We investigate methods to represent command-line text using word-embedding techniques, and design ensemble boosting classifiers to distinguish malicious and benign samples based on the embedding representation. We leverage a large, anonymized dataset collected by an endpoint security product and demonstrate that our ensemble classifiers achieve an average F1 score of 96% at classifying different attack classes. We show that our active learning method consistently improves the classifier performance, as more training data is labeled, and converges in less than 30 iterations when starting with a small number of labeled instances.},
  day        = {7},
  doi        = {10.1145/3471621.3471858},
  isbn       = {9781450390583},
  keywords   = {Active learning for security, Advanced Persistent Threats, Contextual text embeddings, Threat detection},
  location   = {San Sebastian, Spain},
  pagetotal  = {14},
  priority   = {prio1},
  ranking    = {rank5},
  readstatus = {read},
  relevance  = {relevant},
  url        = {https://doi.org/10.1145/3471621.3471858},
}
```

Token creation 

In [2]:
Dados = ["git commit --am _STRING_","echo at http:_PATH_ my email is _EMAIL_ and my website http:_PATH_ go to ftp:_PATH_", 
"ping _IP_ && ping google.com",
"rm _PATH_ | rm _PATH_  |  rm _PATH_ & cp _PATH_ _PATH_",
"chmod +x xampp-linux-x64-5.6.33-0-installer.run | bash x.sh"]

In [11]:
lista = []
for i in Dados:
    print(i.split())
    lista.append(i.split())

['git', 'commit', '--am', '_STRING_']
['echo', 'at', 'http:_PATH_', 'my', 'email', 'is', '_EMAIL_', 'and', 'my', 'website', 'http:_PATH_', 'go', 'to', 'ftp:_PATH_']
['ping', '_IP_', '&&', 'ping', 'google.com']
['rm', '_PATH_', '|', 'rm', '_PATH_', '|', 'rm', '_PATH_', '&', 'cp', '_PATH_', '_PATH_']
['chmod', '+x', 'xampp-linux-x64-5.6.33-0-installer.run', '|', 'bash', 'x.sh']


In [14]:
print(lista)

[['git', 'commit', '--am', '_STRING_'], ['echo', 'at', 'http:_PATH_', 'my', 'email', 'is', '_EMAIL_', 'and', 'my', 'website', 'http:_PATH_', 'go', 'to', 'ftp:_PATH_'], ['ping', '_IP_', '&&', 'ping', 'google.com'], ['rm', '_PATH_', '|', 'rm', '_PATH_', '|', 'rm', '_PATH_', '&', 'cp', '_PATH_', '_PATH_'], ['chmod', '+x', 'xampp-linux-x64-5.6.33-0-installer.run', '|', 'bash', 'x.sh']]


Submission of Tokens to vectorization models

In [23]:
from gensim.models import Word2Vec, FastText

# Sua lista
lista = [['git', 'commit', '--am', '_STRING_'], ['echo', 'at', 'http:_PATH_', 'my', 'email', 'is', '_EMAIL_', 'and', 'my', 'website', 'http:_PATH_', 'go', 'to', 'ftp:_PATH_'], ['ping', '_IP_', '&&', 'ping', 'google.com'], ['rm', '_PATH_', '|', 'rm', '_PATH_', '|', 'rm', '_PATH_', '&', 'cp', '_PATH_', '_PATH_'], ['chmod', '+x', 'xampp-linux-x64-5.6.33-0-installer.run', '|', 'bash', 'x.sh']]

# Treina o modelo Word2Vec
word2vec_model = Word2Vec(sentences=lista, vector_size=100, window=5, min_count=1, sg=0)

# Treina o modelo FastText
fasttext_model = FastText(sentences=lista, vector_size=100, window=5, min_count=1, sg=0)

# Exemplo de uso dos modelos treinados
vector_w2v = word2vec_model.wv['git']
vector_ft = fasttext_model.wv['git']

#print(vector_w2v)
#print(vector_ft)

Once the model is created ...

In [16]:
from sklearn.ensemble import RandomForestClassifier
from gensim.models import Word2Vec
import numpy as np

# Labels correspondentes a cada entrada
labels = [1, 0, 1, 0, 1]  # Supondo que você tem os rótulos para cada entrada

# Obtém os vetores de palavras do Word2Vec
word_vectors = word2vec_model.wv

# Obtém vetores de palavras para cada token na lista de tokens
entry_vectors = []
entry_labels = []
for i, tokens in enumerate(lista_de_tokens):
    for token in tokens:
        if token in word_vectors:
            entry_vectors.append(word_vectors[token])
            entry_labels.append(labels[i])

# Treina o modelo RandomForest com os vetores das palavras
random_forest_model = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest_model.fit(entry_vectors, entry_labels)

# Obtém o número de entradas na árvore para cada token com label 1
num_samples_per_leaf = random_forest_model.apply(entry_vectors)

# Cria um dicionário para armazenar a soma das probabilidades das folhas para cada token com label 1
token_leaf_probabilities_sum = {}

# Associa cada token com a soma das probabilidades das folhas correspondentes (apenas para tokens com label 1)
for i, tokens in enumerate(lista_de_tokens):
    if labels[i] == 1:  # Considera apenas tokens com label 1
        for j, token in enumerate(tokens):
            if token not in token_leaf_probabilities_sum:
                token_leaf_probabilities_sum[token] = []
            token_leaf_probabilities_sum[token].append(random_forest_model.predict_proba([entry_vectors[i]])[0][1] / num_samples_per_leaf[i, j])

# Calcula a média das probabilidades das folhas para cada token com label 1
mean_leaf_probabilities_label_1_normalized = {token: np.mean(probs) for token, probs in token_leaf_probabilities_sum.items()}

# Exibe a média das probabilidades das folhas para cada token com label 1, normalizada pelo número de entradas na árvore
for token, mean_prob in mean_leaf_probabilities_label_1_normalized.items():
    print(f"Token: {token}, Probabilidade Média da Leaf para Label 1 (Normalized): {mean_prob}")


Token: git, Mean Leaf Probability for Label 1 (Normalized): 0.1401111111111111
Token: commit, Mean Leaf Probability for Label 1 (Normalized): 0.8406666666666667
Token: --am, Mean Leaf Probability for Label 1 (Normalized): 0.09340740740740741
Token: _STRING_, Mean Leaf Probability for Label 1 (Normalized): 0.10508333333333333
Token: ping, Mean Leaf Probability for Label 1 (Normalized): 0.12881944444444446
Token: _IP_, Mean Leaf Probability for Label 1 (Normalized): 0.1766666666666667
Token: &&, Mean Leaf Probability for Label 1 (Normalized): 0.09814814814814815
Token: google.com, Mean Leaf Probability for Label 1 (Normalized): 0.0803030303030303
Token: chmod, Mean Leaf Probability for Label 1 (Normalized): 0.06
Token: +x, Mean Leaf Probability for Label 1 (Normalized): 0.012
Token: xampp-linux-x64-5.6.33-0-installer.run, Mean Leaf Probability for Label 1 (Normalized): 0.017142857142857144
Token: |, Mean Leaf Probability for Label 1 (Normalized): 0.02
Token: bash, Mean Leaf Probability f

In [21]:
new_dataset = []

for i, tokens in enumerate(lista_de_tokens):
    word_vecs = [word_vectors[token] for token in tokens if token in word_vectors.key_to_index]
    
    if len(word_vecs) > 0:
        word_vecs = np.array(word_vecs)
        
        if len(word_vecs) >= 3:
            min_values = np.min(word_vecs, axis=0)[:3]
            max_values = np.max(word_vecs, axis=0)[:3]
            avg_values = np.mean(word_vecs, axis=0)[:3]
        else:
            min_values = np.zeros(3)
            max_values = np.zeros(3)
            avg_values = np.zeros(3)
        
        token_scores = [mean_leaf_probabilities_label_1_normalized[token] for token in tokens if token in mean_leaf_probabilities_label_1_normalized]
        token_scores.sort(reverse=True)
        max_scores = token_scores[:3]
        
        num_tokens = len(tokens)
        
        rare_count = sum(1 for token in tokens if token not in word_vectors.key_to_index or word_vectors.get_vecattr(token, 'count') <= 1)
        
        label = labels[i]
        
        entry_data = np.array((min_values[0] if len(min_values) > 0 else 0, min_values[1] if len(min_values) > 1 else 0, min_values[2] if len(min_values) > 2 else 0,
                                max_values[0] if len(max_values) > 0 else 0, max_values[1] if len(max_values) > 1 else 0, max_values[2] if len(max_values) > 2 else 0,
                                avg_values[0] if len(avg_values) > 0 else 0, avg_values[1] if len(avg_values) > 1 else 0, avg_values[2] if len(avg_values) > 2 else 0,
                                max_scores[0] if len(max_scores) > 0 else 0, max_scores[1] if len(max_scores) > 1 else 0, max_scores[2] if len(max_scores) > 2 else 0,
                                num_tokens, rare_count, label),
                              dtype=[('min_val_0', np.float64),
                                     ('min_val_1', np.float64),
                                     ('min_val_2', np.float64),
                                     ('max_val_0', np.float64),
                                     ('max_val_1', np.float64),
                                     ('max_val_2', np.float64),
                                     ('avg_val_0', np.float64),
                                     ('avg_val_1', np.float64),
                                     ('avg_val_2', np.float64),
                                     ('max_score_0', np.float64),
                                     ('max_score_1', np.float64),
                                     ('max_score_2', np.float64),
                                     ('num_tokens', np.int64),
                                     ('rare_count', np.int64),
                                     ('label', np.int64)])
        
        new_dataset.append(entry_data)

new_dataset = np.array(new_dataset)


In [22]:
new_dataset

array([(-0.00950012, -0.00932806, -0.00777076, 0.00769665, 0.00956222, 0.00216339, -0.00331968,  0.00339692, -0.00158609, 0.84066667, 0.14011111, 0.10508333,  4,  4, 1),
       (-0.00957855, -0.00980292, -0.00717673, 0.00977506, 0.00894312, 0.00944711,  0.00038211, -0.00025323,  0.00093848, 0.        , 0.        , 0.        , 14, 10, 0),
       (-0.00823815, -0.00590276, -0.00019391, 0.00133212, 0.00930552, 0.00998768, -0.00357826,  0.00469329,  0.0038395 , 0.17666667, 0.12881944, 0.12881944,  5,  3, 1),
       (-0.00861682, -0.00128281, -0.00680692, 0.00964721, 0.00732885, 0.00519266, -0.00196912,  0.00203336,  0.0026725 , 0.02      , 0.02      , 0.        , 12,  2, 0),
       (-0.00696548, -0.00666834, -0.00943299, 0.00835129, 0.00573516, 0.00183118,  0.0006781 ,  0.00076893, -0.00538804, 0.06      , 0.02      , 0.01714286,  6,  5, 1)],
      dtype=[('min_val_0', '<f8'), ('min_val_1', '<f8'), ('min_val_2', '<f8'), ('max_val_0', '<f8'), ('max_val_1', '<f8'), ('max_val_2', '<f8'), ('av