In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Synthetic, heavily imbalanced malware-detection dataset.
#
# NOTE: both classes are sampled from the same uniform [0, 1) distribution,
# so the features carry no class signal; only the label proportions differ.
# The order of the RNG calls below (seed -> benign -> malware -> permutation)
# determines the generated data and must be kept to reproduce the results.
# ---------------------------------------------------------------------------
np.random.seed(42)

n_samples = 1000
malware_fraction = 0.05  # 5% of the samples are labelled as malware
n_features = 10

n_malware = int(n_samples * malware_fraction)
n_benign = n_samples - n_malware

# One row per sample, `n_features` uniform random columns per row.
benign = np.random.rand(n_benign, n_features)
malware = np.random.rand(n_malware, n_features)

# Stack benign rows first, then malware rows; labels follow the same order
# (0 = benign, 1 = malware).
X = np.vstack([benign, malware])
y = np.repeat([0, 1], [n_benign, n_malware])

# Apply one shared random permutation so rows and labels stay aligned.
shuffle_idx = np.random.permutation(n_samples)
X = X[shuffle_idx]
y = y[shuffle_idx]

# Hold out 20% for testing; stratify so the class ratio is kept in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Fit a decision tree on the training part and predict the held-out part.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

# Per-class precision / recall / F1 on the held-out part.
report = classification_report(
    y_test,
    y_pred,
    target_names=["Benign", "Malware"],
)
print("\n--- Imbalanced Data Model Evaluation ---")
print(report)



--- Imbalanced Data Model Evaluation ---
              precision    recall  f1-score   support

      Benign       0.96      0.93      0.94       190
     Malware       0.13      0.20      0.16        10

    accuracy                           0.90       200
   macro avg       0.55      0.57      0.55       200
weighted avg       0.92      0.90      0.90       200

