In [21]:
import pandas as pd
import base64
import re

In [22]:
# Function to decode a Base64-encoded string into plaintext
def decode_base64(word):
    """Decode a Base64-encoded value into text.

    Args:
        word: Base64 text (``str``) or a bytes-like object.

    Returns:
        The decoded payload as a ``str`` (decoded with the default UTF-8
        codec), or ``None`` when the input is not valid Base64, does not
        decode to valid UTF-8, or is not a string/bytes-like object at all.
    """
    try:
        return base64.b64decode(word).decode()
    except (ValueError, TypeError):
        # binascii.Error (bad padding) and UnicodeDecodeError are both
        # ValueError subclasses; TypeError covers non-string input such as
        # None. Anything else (e.g. KeyboardInterrupt) must propagate.
        return None

In [28]:
# Read the feature vocabulary from "features.txt": each line of the file
# becomes one entry (a word or phrase) of the module-level `vocabulary` list.
with open("../data/features.txt", "r") as vocabulary_file:
    vocabulary_text = vocabulary_file.read()
    vocabulary = vocabulary_text.splitlines()

In [29]:
# Function to split a given string into tokens based on the vocabulary
def split_by_vocabulary(string, vocab=None):
    """Extract every vocabulary entry occurring in ``string``.

    Args:
        string: Text to scan.
        vocab: Optional iterable of literal words/phrases to look for.
            Defaults to the module-level ``vocabulary`` list.

    Returns:
        A list with the matched entries in order of appearance
        (non-overlapping, scanned left to right). The list is empty when the
        vocabulary has no usable entries or nothing matches.
    """
    entries = vocabulary if vocab is None else vocab
    # Drop empty entries: an empty alternative matches at every position and
    # would flood the result with "" tokens.
    alternatives = [re.escape(entry) for entry in entries if entry]
    if not alternatives:
        return []
    # Alternatives are tried in vocabulary order, so when two entries both
    # match at the same position the one listed first is the one returned.
    pattern = '|'.join(alternatives)
    return re.findall(pattern, string)

In [30]:
# Function to process and decode Base64 commands from an SSH session log
def _find_echo_payload(tokens):
    """Return the first non-option argument after the first ``echo`` token, or None."""
    for position, token in enumerate(tokens):
        if token != "echo":
            continue
        for candidate in tokens[position + 1:]:
            # Skip echo flags such as -n / -e; the standard Base64 alphabet
            # has no "-", so a payload never starts with one.
            if candidate.startswith("-"):
                continue
            return candidate.strip("\"")
        # "echo" was the last token (or only flags followed it).
        return None
    return None


def decode_session(full_session):
    """Replace ``echo <payload> | base64 --decode`` chunks with their decoded text.

    The session is split on ``;``. A chunk containing both ``base64 --decode``
    and ``echo`` has the argument of its first ``echo`` decoded via
    ``decode_base64``. On success the chunk is replaced by the decoded lines
    joined with ``"; "`` (empty lines and a leading ``#!/bin/bash`` line are
    dropped). Every other chunk, including chunks whose payload is missing or
    cannot be decoded, is kept as-is apart from surrounding whitespace.

    Args:
        full_session: The session's command text as a single string.

    Returns:
        The processed chunks joined with ``"; "``.

    Side effects:
        Increments the module-level ``base64_decoded_counter`` once per
        successfully decoded chunk, starting from 0 if it is not defined yet.
    """
    global base64_decoded_counter
    processed_chunks = []

    for session_chunk in full_session.split(";"):
        replacement = None

        if "base64 --decode" in session_chunk and "echo" in session_chunk:
            payload = _find_echo_payload(session_chunk.split())
            decoded = decode_base64(payload) if payload else None
            if decoded:
                decoded_lines = decoded.split("\n")
                # Remove the shebang if it is the first decoded line.
                if decoded_lines[0].strip() == "#!/bin/bash":
                    decoded_lines[0] = ""
                replacement = "; ".join(filter(None, decoded_lines)).strip()
                # Read through globals() so a kernel where the counter cell
                # has not run yet does not fail with NameError.
                base64_decoded_counter = globals().get("base64_decoded_counter", 0) + 1

        if replacement is None:
            # Not a decodable Base64 chunk: keep the original text.
            processed_chunks.append(session_chunk.strip())
        else:
            processed_chunks.append(replacement)

    return "; ".join(processed_chunks)


In [31]:
# Read the raw dataset from its Parquet file, then continue on an independent
# deep copy so the frame as loaded from disk stays available unmodified.
df_original = pd.read_parquet('../data/raw/ssh_attacks.parquet')
df_decoded = df_original.copy(deep=True)

In [34]:
# Convert the 'first_timestamp' column to a datetime format
df_decoded['first_timestamp'] = pd.to_datetime(df_decoded['first_timestamp'])

# Reset the module-level counter that decode_session increments for every
# successfully decoded Base64 chunk (no `global` statement is needed at
# module level).
base64_decoded_counter = 0

# Decode from the untouched raw column rather than from df_decoded itself, so
# re-running this cell never feeds already-decoded text back into the decoder.
df_decoded["full_session"] = df_original["full_session"].apply(decode_session)

# Build the tokenized frame: same columns as df_decoded, with 'full_session'
# replaced by the list of vocabulary tokens found in the decoded text.
df_decoded_splitted = df_decoded.copy()
df_decoded_splitted["full_session"] = df_decoded["full_session"].apply(split_by_vocabulary)

# Save the tokenized dataset to the specified processed folder
df_decoded_splitted.to_parquet("../data/processed/ssh_attacks_decoded.parquet")

In [35]:
# Show the tokenized frame as the cell's result.
df_decoded_splitted

Unnamed: 0,session_id,full_session,first_timestamp,Set_Fingerprint
0,0,"[enable, system, sh, sh, cat, proc, mounts, bi...",2019-06-04 09:45:11.151186+00:00,"[Defense Evasion, Discovery]"
1,1,"[enable, system, sh, sh, cat, proc, mounts, bi...",2019-06-04 09:45:50.396610+00:00,"[Defense Evasion, Discovery]"
2,2,"[enable, system, sh, sh, cat, proc, mounts, bi...",2019-06-04 09:54:41.863315+00:00,"[Defense Evasion, Discovery]"
3,3,"[enable, system, sh, sh, cat, proc, mounts, bi...",2019-06-04 10:22:14.623875+00:00,"[Defense Evasion, Discovery]"
4,4,"[enable, system, sh, sh, cat, proc, mounts, bi...",2019-06-04 10:37:19.725874+00:00,"[Defense Evasion, Discovery]"
...,...,...,...,...
233030,233042,"[cat, proc, cp, grep, name, echo, passwd, bash...",2020-02-29 23:47:28.217237+00:00,"[Discovery, Persistence]"
233031,233043,"[cat, proc, cp, grep, name, echo, passwd, bash...",2020-02-29 23:49:01.009046+00:00,"[Discovery, Persistence]"
233032,233044,"[cat, proc, cp, grep, name, echo, ss, passwd, ...",2020-02-29 23:56:18.827281+00:00,"[Discovery, Persistence]"
233033,233045,"[cat, proc, cp, grep, name, echo, passwd, bash...",2020-02-29 23:56:56.263104+00:00,"[Discovery, Persistence]"
