In [None]:
!git clone https://github.com/2alf/aiav.git
!pip install pandas numpy scikit-learn joblib sklearn pefile


import os
import math
import pefile
import joblib
import argparse
import requests
import numpy as np
import pickle as pk
import pandas as pd
from io import StringIO
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# Train

In [None]:
# Location of the training CSV inside the cloned aiav repository; used as the
# default input of load_data(), which reads it with '|' as the separator.
media_path = 'aiav/database/data.csv'

In [None]:
def load_data(filename=media_path):
    """Read the '|'-separated dataset and split it into features and labels.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(X, y)`` tuple of arrays: ``X`` holds every column except
        ``Name``, ``md5`` and ``legitimate``; ``y`` holds the ``legitimate``
        column.
    """
    frame = pd.read_csv(filename, sep='|')
    # Identifier columns and the target itself are not model inputs.
    feature_frame = frame.drop(columns=['Name', 'md5', 'legitimate'])
    labels = frame['legitimate'].values
    return feature_frame.values, labels

def feature_selection(X, y):
    """Reduce ``X`` to the columns kept by an ExtraTrees-based selector.

    Args:
        X: Feature matrix.
        y: Target labels used to fit the tree ensemble.

    Returns:
        ``X`` transformed by a ``SelectFromModel`` wrapped around the fitted
        ``ExtraTreesClassifier``.
    """
    forest = ExtraTreesClassifier()
    forest.fit(X, y)
    # prefit=True: the selector reuses the already-fitted forest as-is.
    selector = SelectFromModel(forest, prefit=True)
    return selector.transform(X)

def train_models(X_train, X_test, y_train, y_test):
    """Fit several classifiers and return the one scoring best on the test split.

    Every candidate is fitted on ``(X_train, y_train)`` and evaluated with its
    own ``score`` method on ``(X_test, y_test)``. Each score is printed as a
    percentage, followed by a line naming the winner.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        The fitted estimator with the highest test score. On a tie the
        candidate listed first wins.
    """
    # Imported locally: the notebook's import cell never brings the ``tree``
    # module into scope, so ``tree.DecisionTreeClassifier`` raised NameError.
    from sklearn.tree import DecisionTreeClassifier

    algorithms = {
        "DecisionTree": DecisionTreeClassifier(max_depth=10),
        # NOTE(review): this entry is labelled "RandomForest" but builds an
        # ExtraTreesClassifier. Behavior is kept as-is; confirm which of the
        # two was intended.
        "RandomForest": ExtraTreesClassifier(n_estimators=50),
        "GradientBoosting": GradientBoostingClassifier(n_estimators=50),
        "AdaBoost": AdaBoostClassifier(n_estimators=100),
        "GNB": GaussianNB(),
    }

    results = {}
    print("Now testing algorithms")
    for algo, clf in algorithms.items():
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(f"{algo}: {score * 100:.2f}%")
        results[algo] = score

    winner = max(results, key=results.get)
    print(f'\nWinner algorithm is {winner} with a {results[winner] * 100:.2f}% success')
    return algorithms[winner]

def save_model(algorithm, features, output_dir='classifier/'):
    """Persist the trained estimator and its feature list to ``output_dir``.

    Args:
        algorithm: Estimator to store with joblib as ``classifier.pkl``.
        features: Object to pickle as ``features.pkl``.
        output_dir: Target directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    model_path = os.path.join(output_dir, 'classifier.pkl')
    features_path = os.path.join(output_dir, 'features.pkl')

    print('Saving algorithm and feature list in classifier directory...')
    joblib.dump(algorithm, model_path)
    with open(features_path, 'wb') as handle:
        pk.dump(features, handle)
    print('Saved')

def evaluate_performance(y_test, predictions):
    """Print false positive and false negative rates for ``predictions``.

    Both rates are read from the confusion matrix of ``y_test`` against
    ``predictions``: the off-diagonal cell of each of the first two rows,
    divided by that row's total and shown as a percentage.

    Args:
        y_test: True labels.
        predictions: Predicted labels, aligned with ``y_test``.
    """
    matrix = confusion_matrix(y_test, predictions)

    # Row 0: share of first-class samples predicted as the second class.
    fp_share = matrix[0][1] / float(sum(matrix[0]))
    # Row 1: share of second-class samples predicted as the first class.
    fn_share = matrix[1][0] / float(sum(matrix[1]))

    fp_percent = fp_share * 100
    fn_percent = fn_share * 100
    print(f"False positive rate: {fp_percent:.2f}%")
    print(f"False negative rate: {fn_percent:.2f}%")

if __name__ == "__main__":
    # Training pipeline: load data, select features, compare classifiers,
    # save the winner together with its feature names, then report error rates.
    X, y = load_data()

    print(f'Total features per row: {X.shape[1]}')

    # Selection is done inline (the same steps as feature_selection) because
    # the fitted selector is needed afterwards to name the kept columns;
    # feature_selection() only returns the reduced matrix.
    selector = SelectFromModel(ExtraTreesClassifier().fit(X, y), prefit=True)
    X_new = selector.transform(X)
    nb_features = X_new.shape[1]

    # `features` was previously passed to save_model without ever being
    # defined (NameError). Build it from the CSV header: dropping the same
    # three columns as load_data keeps the names aligned with X's columns,
    # and the selector's boolean support mask picks the retained ones.
    column_names = pd.read_csv(media_path, sep='|', nrows=0).columns.drop(
        ['Name', 'md5', 'legitimate'])
    features = list(column_names[selector.get_support()])

    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

    print(f'Identified important features: {nb_features}')

    # NOTE: despite its plural name, `algorithms` holds the single winning
    # estimator returned by train_models; the name is kept in case later
    # notebook cells refer to it.
    algorithms = train_models(X_train, X_test, y_train, y_test)
    save_model(algorithms, features)

    predictions = algorithms.predict(X_test)
    evaluate_performance(y_test, predictions)

Total features per row: 54
Identified important features: 13
Now testing algorithms
DecisionTree: 99.11%
RandomForest: 99.45%
GradientBoosting: 98.81%
AdaBoost: 98.60%
GNB: 69.93%

Winner algorithm is RandomForest with a 99.45% success
Saving algorithm and feature list in classifier directory...
Saved
False positive rate: 0.45%
False negative rate: 0.81%


Scan