In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.regularizers import l2

In [None]:
# Generate Data
# Load the parsed malicious and benign logs and attach the binary target
# (1 = malicious, 0 = benign).
df_malicious = pd.read_csv('../DataSources/ParsedLogs/MALICIOUS_ALL.csv')
df_mal_cols = df_malicious['Image']
df_malicious["Label"] = 1
print(f"Malicious Shape: {df_malicious.shape}")

df_benign = pd.read_csv('../DataSources/ParsedLogs/BENIGN_ALL.csv')
df_benign["Label"] = 0
print(f"Benign Shape: {df_benign.shape}")

# Downsample Benign Data
# Balance the classes. The draw is seeded so a Restart & Run All reproduces the
# same dataset, and it is capped at the number of benign rows so sampling
# cannot fail when the benign set is the smaller one.
n_benign = min(len(df_malicious), len(df_benign))
df_benign = df_benign.sample(n=n_benign, random_state=42)

# Concatenate Data
# ignore_index gives the combined frame a unique 0..n-1 index instead of the
# overlapping row labels inherited from the two source frames.
df = pd.concat([df_malicious, df_benign], ignore_index=True)
print(f"Concatenated Shape: {df.shape}")

# Encode every text column as integer category codes (missing values get -1).
# NOTE(review): every object column is used as a feature; if any of them
# identifies the source file or host it may leak the label — confirm.
object_cols = df.select_dtypes(include=["object"]).columns.tolist()
for col in object_cols:
    df[col] = df[col].astype('category').cat.codes

# Separate the target from the features, then standardize the features.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before any
# train/test split, so held-out rows influence the scaling statistics.
# Consider fitting it on the training portion only.
y = df['Label'].values
X = df.drop(columns=["Label"])
scaler = StandardScaler()
X = scaler.fit_transform(X)


In [None]:
# Preprocess Code
# Hold out 20% of the rows for evaluation; the split is seeded for repeatability.
keras = tf.keras  # alias used when building the optimizer in the model cell
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Model Code
# Architecture: two L2-regularized 1D convolutions -> global average pooling ->
# a single sigmoid unit producing the malicious-class probability.
conv_options = dict(filters=32, kernel_size=2, activation="relu")

model = tf.keras.models.Sequential()
model.add(layers.Conv1D(kernel_regularizer=l2(0.01), input_shape=(X_train.shape[1], 1), **conv_options))
model.add(layers.Conv1D(kernel_regularizer=l2(0.01), **conv_options))
model.add(layers.GlobalAveragePooling1D())
model.add(layers.Dense(1, activation="sigmoid"))

# Training configuration: binary cross-entropy optimized with Adam.
loss = tf.keras.losses.BinaryCrossentropy()
metrics = ["accuracy"]
optim = keras.optimizers.Adam(learning_rate=0.001)

model.compile(optimizer=optim, loss=loss, metrics=metrics)
model.fit(X_train, y_train, epochs=10, shuffle=True)

In [None]:
# Evaluate Model
# Score the held-out split; the result unpacks into the loss and the
# accuracy metric the model was compiled with.
evaluation = model.evaluate(X_test, y_test, verbose=2)
loss, accuracy = evaluation
print("Accuracy", accuracy)

In [None]:
# Save Model
# Persist the trained network to disk in HDF5 format.
model_path = 'models/model.h5'
model.save(model_path)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

class estimator:
    """Minimal scikit-learn-style classifier wrapper around a binary model.

    The wrapped ``model`` must expose ``predict(X, verbose=...)`` returning an
    array of probabilities. The wrapper turns those probabilities into hard
    0/1 class predictions.
    """

    # Class-level defaults; both are overridden per instance in __init__.
    _estimator_type = ""
    classes_ = []

    def __init__(self, model, classes):
        """Store the model and its class labels.

        Args:
            model: object with a ``predict(X, verbose=...)`` method.
            classes: class labels, in the order matching predicted indices.
        """
        self.model = model
        self._estimator_type = "classifier"
        # The class declares the scikit-learn attribute name `classes_`, so
        # populate it here; otherwise it would stay the empty class-level list.
        self.classes_ = classes
        # Kept so existing readers of `.classes` continue to work.
        self.classes = classes

    def predict(self, X):
        """Return hard 0/1 predictions for ``X``.

        Adding 0.5 and truncating to int rounds probabilities in [0, 1] to the
        nearest class, with exactly 0.5 mapping to class 1. The output keeps
        the shape of the model's probability array.
        """
        y_prob = self.model.predict(X, verbose=False)
        y_pred = (y_prob + 0.5).astype(int)
        return y_pred

def plot_confusion_matrix(y_test = [], predictions = [], labels = []):
    """Draw a confusion matrix for already-computed predictions.

    The matrix is normalized over the true labels (``normalize="true"``) and
    cells are printed with two decimals.

    Args:
        y_test: ground-truth class labels.
        predictions: predicted class labels, aligned with ``y_test``.
        labels: display names for the classes, in class order.

    The function only needs the labels it is given: it no longer builds an
    unused wrapper around a global ``model``, so it works without one.
    """
    # The list defaults are never mutated here, so sharing them between calls
    # is harmless.
    fig, ax = plt.subplots(figsize=(5,5))
    ConfusionMatrixDisplay.from_predictions(
        y_true=y_test,
        y_pred=predictions,
        display_labels=labels,
        normalize="true",
        xticks_rotation="vertical",
        cmap="Blues",
        colorbar=False,
        values_format=".2f",
        ax=ax,
    )

    plt.show()
    
# Turn the predicted probabilities for the held-out split into hard 0/1
# classes (adding 0.5 and truncating), then report the confusion matrix both
# as a plot and as raw counts.
labels = ["Benign", "Malicious"]
test_probabilities = model.predict(X_test)
predictions = (test_probabilities + 0.5).astype(int)
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(y_test, predictions=predictions, labels=labels)
print(cm)

In [None]:
def save_human_readable_csv(df, filename, original_df_malicious, original_df_benign):
    """
    Write the original (cleartext) log rows to a CSV file with a readable Label column.

    The output is built from the two original frames, malicious rows first,
    with ``Label`` set to ``'Malicious'`` / ``'Benign'`` respectively. No
    argument is modified.

    Args:
        df: the numerically encoded frame. Accepted for backward compatibility
            only; it is not read or modified, because integer category codes
            cannot be turned back into text and the cleartext comes from the
            original frames instead.
        filename: destination CSV path.
        original_df_malicious: cleartext malicious rows.
        original_df_benign: cleartext benign rows.
    """
    # assign() returns new frames, so the caller's originals keep their own
    # Label values.
    df_human_readable = pd.concat([
        original_df_malicious.assign(Label='Malicious'),
        original_df_benign.assign(Label='Benign'),
    ])

    # Save to CSV
    df_human_readable.to_csv(filename, index=False)
    print(f"Data saved to {filename}")

# Export the cleartext rows, labelled Malicious/Benign, next to the notebook.
save_human_readable_csv(
    df=df,
    filename='human_readable_data.csv',
    original_df_malicious=df_malicious,
    original_df_benign=df_benign,
)

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the Model and Data
# Go through `tf` (imported in this cell) so loading does not depend on a
# `keras` name left behind by an earlier cell.
model = tf.keras.models.load_model('models/model.h5')

# Generate Data
df_original = pd.read_csv('../DataSources/ParsedLogs/GCFA-4688.csv')
# df_original = pd.read_csv('../DataSources/ParsedLogs/GCFA-Sysmon.csv')
df_cols = df_original[['Image', 'CommandLine', 'ParentProcess']].copy()

# Encode each text feature as integers.
# NOTE(review): the encoders and the scaler below are fitted on this inference
# file, not on the training data, so the numeric values fed to the model may
# not correspond to what it saw during training. The model also expects the
# same number of features it was trained with. Confirm both against the
# training cell.
label_encoders = {}
for col in df_cols.columns:
    le = LabelEncoder()
    df_cols[col] = le.fit_transform(df_cols[col])  # Convert to numeric
    label_encoders[col] = le  # Store encoders in case needed later

scaler = StandardScaler()
X = scaler.fit_transform(df_cols)
# Add a trailing channel axis: (rows, features) -> (rows, features, 1).
X = np.expand_dims(X, axis=-1)

# Make predictions
predictions = model.predict(X)
predicted_labels = (predictions > 0.5).astype(int)

# Add predictions to the original data
# ravel() flattens the per-row prediction array to 1-D so it is assigned as
# one plain column.
df_original['Label'] = predicted_labels.ravel()
df_original['Label'] = df_original['Label'].map({0: 'Benign', 1: 'Malicious'})

# Save the predictions to a CSV file with original logs
df_original.to_csv('GCFA-Prediction.csv', index=False)



[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 929us/step
