In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
def load_dataset(path="malware_dataset.csv", n_samples=1000, seed=42):
    """Load the malware dataset from ``path``, creating a mock one if absent.

    When ``path`` does not exist, a synthetic dataset is generated, written to
    ``path`` as CSV (without the index) and returned. When it exists, the CSV
    is read and returned as-is.

    Parameters
    ----------
    path : str
        Location of the CSV file to read (or to create when missing).
    n_samples : int
        Number of rows in the generated mock dataset. Ignored when ``path``
        already exists.
    seed : int or None
        Seed for the mock-data generator so a fresh run reproduces the same
        dataset. Pass ``None`` for non-deterministic data. Ignored when
        ``path`` already exists.

    Returns
    -------
    pandas.DataFrame
        The loaded or generated dataset. Generated data has the columns
        ``file_size``, ``entropy``, ``has_net_import``,
        ``suspicious_sections`` and ``label`` (0 = Benign, 1 = Malware).
    """
    if os.path.exists(path):
        df = pd.read_csv(path)
    else:
        print("Dataset not found, generating mock dataset...")
        # A local generator keeps the mock data reproducible without touching
        # (or depending on) NumPy's global random state.
        rng = np.random.default_rng(seed)
        df = pd.DataFrame({
            # Upper bounds are exclusive, matching np.random.randint.
            'file_size': rng.integers(10000, 500000, n_samples),
            'entropy': rng.uniform(2.0, 8.0, n_samples),
            'has_net_import': rng.integers(0, 2, n_samples),
            'suspicious_sections': rng.integers(0, 5, n_samples),
            'label': rng.integers(0, 2, n_samples),  # 0 = Benign, 1 = Malware
        })
        # Persist so later runs load the same file instead of regenerating.
        df.to_csv(path, index=False)

    print(f"Dataset shape: {df.shape}")
    return df

In [None]:
def preprocess(df):
    """Separate features from the target and standardize the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``label`` column; every other column is treated as a
        feature and passed to the scaler.

    Returns
    -------
    tuple
        ``(features_scaled, target, feature_scaler)``: the output of
        ``StandardScaler.fit_transform`` on the feature columns, the
        ``label`` column, and the fitted scaler so the same transformation
        can be reapplied later.

    Notes
    -----
    The scaler is fitted on every row of ``df``. If ``df`` still contains
    rows that will later be held out for evaluation, their statistics
    influence the scaling.
    """
    features = df.drop(columns='label')
    target = df['label']

    feature_scaler = StandardScaler()
    features_scaled = feature_scaler.fit_transform(features)
    return features_scaled, target, feature_scaler


In [None]:
def train_model(X_train, y_train, n_estimators=100, random_state=42):
    """Fit a random-forest classifier on the training data.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to ``RandomForestClassifier.fit``.
    y_train : array-like
        Training labels, passed unchanged to ``RandomForestClassifier.fit``.
    n_estimators : int
        Number of trees in the forest. Defaults to 100.
    random_state : int or None
        Seed controlling the forest's randomness. The default of 42 keeps
        training reproducible across runs; pass ``None`` for a
        non-deterministic fit.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    model.fit(X_train, y_train)
    return model