In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import skl2onnx

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from pprint import pprint
from collections import Counter
from datetime import datetime

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [None]:
# Read the dataset from its Parquet file into a DataFrame.
input_file = 'filtered_dataset.parquet'
df = pd.read_parquet(path=input_file)

# Print a concise summary of the loaded frame.
df.info()


In [48]:
# Build one-hot features from the most frequent entries of 'import_directory_list'.
# Assumes every value in that column is an iterable of hashable import names — TODO confirm.
TOP_IMPORT_COUNT = 200  # how many of the most common imports become features

# Step 1: Count occurrences of each item in the lists
all_imports = [item for sublist in df['import_directory_list'] for item in sublist]
item_counts = Counter(all_imports)

# Step 2: Keep the most common items. The list preserves frequency order;
# the set gives O(1) membership tests in the per-row filter below.
top_items = [item for item, _ in item_counts.most_common(TOP_IMPORT_COUNT)]
top_item_set = set(top_items)

# Step 3: Restrict each row's import list to the retained items
df['filtered_imports'] = df['import_directory_list'].apply(
    lambda imports: [item for item in imports if item in top_item_set]
)

# Step 4: One-Hot Encode the filtered lists
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(df['filtered_imports'])

# Step 5: Create a DataFrame with meaningful column names.
# Reuse df's index: concat(axis=1) aligns on index labels, so a fresh 0..n-1
# index would misalign rows (and inject NaN) whenever df's index is not the
# default RangeIndex.
one_hot_df = pd.DataFrame(
    one_hot_encoded,
    columns=[f'import_{item}' for item in mlb.classes_],
    index=df.index,
)

# Step 6: Concatenate with the original DataFrame.
# Columns with the same names left over from a previous run of this cell are
# dropped first, so re-running does not create duplicate 'import_*' columns.
final_df = pd.concat(
    [df.drop(columns=one_hot_df.columns, errors='ignore'), one_hot_df],
    axis=1,
)
df = final_df


In [None]:
# Feature selection: columns listed here are removed before modelling.
# Per the original notes, 'file_name' may not be helpful and
# 'dll_characteristics' has missing values.
drop_columns = [
    'file_name',
    'timestamp',
    'table_pointer',
    'address_of_entry_point',
    'base_of_code',
    'image_base',
    'checksum_validation',
    'dll_characteristics',
    'import_directory',
    'architecture',
    'machine_type',
    'file_characteristics',
    'magic_number',
    'subsystem',
    'import_directory_list',
    'filtered_imports',
]

df = df.drop(drop_columns, axis='columns')
df.info()


In [None]:
# Additional feature columns that are switched off ("muted") for this run.
mute_feature_columns = [
    'section_alignment',
    'os_version',
    'subsystem_version',
]

df = df.drop(mute_feature_columns, axis='columns')
df.info()


In [None]:

# Train a random-forest classifier on the prepared frame and report its
# F1 score and classification report on a held-out 30% test split.

# Step 1: Convert the 'disposition' column into numerical labels
# (0 for 'goodware', 1 for 'malware').
# Values that are already 0/1 pass through unchanged, so re-running this cell
# does not turn the column into NaN the way a plain dict .map() would.
label_map = {'goodware': 0, 'malware': 1}
encoded_labels = df['disposition'].map(lambda value: label_map.get(value, value))

# Fail loudly on anything that is neither a known label nor an encoded 0/1,
# instead of letting NaN targets reach the classifier.
unexpected_labels = encoded_labels[~encoded_labels.isin(list(label_map.values()))]
if not unexpected_labels.empty:
    raise ValueError(
        f"Unexpected values in 'disposition': {unexpected_labels.unique().tolist()}"
    )
df['disposition'] = encoded_labels.astype('int64')

# Step 2: Define X (features: every column except the target) and y (target)
X = df.drop(columns=['disposition'])
y = df['disposition']

# Step 3: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 4: Train the RandomForestClassifier.
# n_jobs=-1 builds the 1000 trees on all available cores.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# Step 5: Make predictions
y_pred = clf.predict(X_test)

# Step 6: Evaluate the model (F1-score and classification report)
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

# Classification report for precision, recall, f1-score
print("Classification Report:")
print(classification_report(y_test, y_pred))



In [None]:
# Report classification metrics, then rank and plot the fitted forest's
# feature importances.

# Classification report for precision, recall, f1-score
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Analyze feature importance (sorted from most to least important)
feature_importances = pd.Series(clf.feature_importances_, index=X.columns)
feature_importances = feature_importances.sort_values(ascending=False)

# Plot the most important features. One constant drives both the slice and the
# title so the figure label always matches the number of bars drawn.
TOP_N_FEATURES = 40
top_features = feature_importances.head(TOP_N_FEATURES)

plt.figure(figsize=(20, 12))
sns.barplot(
    x=top_features.values,
    y=[str(label).upper() for label in top_features.index],
)
plt.title(f'Top {len(top_features)} Feature Importances')
plt.xlabel('Importance Score')
plt.ylabel('Features')
plt.show()

pprint(feature_importances)


In [None]:
# Persist the trained model `clf` to a joblib file whose name carries
# today's date (YYYYMMDD).
today_date = f"{datetime.today():%Y%m%d}"
print(f"Date {today_date}")

model_path = f'malware_random_forest_{today_date}.joblib'
joblib.dump(value=clf, filename=model_path)

print(f"Model saved to {model_path}")

In [54]:
# Reload the saved classifier and export it to an ONNX file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
clf = joblib.load(model_path)

# Define the input as a float tensor of shape [batch, n_features].
# The width is read from the fitted model instead of being hard-coded, so the
# exported graph accepts the same number of features the model was trained on.
n_features = int(clf.n_features_in_)
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Convert the model to ONNX format
onnx_model = convert_sklearn(clf, initial_types=initial_type)

# Save the model to an .onnx file
onnx_path = f'malware_random_forest_{today_date}.onnx'
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())