In [None]:
import numpy as np
import pandas as pd

from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

import seaborn as sns
import matplotlib.pyplot as plt

import os
# Print the full path of every file found under /content/drive.
drive_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/content/drive')
    for name in names
)
for file_path in drive_file_paths:
    print(file_path)

In [None]:
# Load the dataset from 5_gram.csv (path is relative to the working directory).
df = pd.read_csv('5_gram.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Show the (rows, columns) size of the loaded frame.
df.shape

In [None]:
# Count missing values in each column.
df.isnull().sum()

In [None]:
import matplotlib.pyplot as plt

# Count how many samples carry each value of the Class column.
class_counts = df['Class'].value_counts()

# Draw one bar per class, in value_counts() order, labelled with the class value.
plt.bar(class_counts.index.astype(str), class_counts.to_numpy())

# Write each count above its bar. Iterate by position rather than using
# class_counts[i]: with integer class labels that expression looks up the
# *label* i, which can attach a count to the wrong bar.
for bar_position, count in enumerate(class_counts.to_numpy()):
    plt.text(bar_position, count, str(count), ha='center', va='bottom')

plt.xlabel('Malware')
plt.ylabel('Counts')
plt.title('Malware Distribution in Dataset')
plt.show()

In [None]:
# Separate the label column (target) from the remaining columns (features).
y = df['Class']
X = df.drop('Class', axis=1)

# Hold out 20% of the rows as a test set, keeping the class ratio (stratified)
# and fixing the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
import random
from deap import base, creator, tools, algorithms
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fitness function used by the genetic algorithm (higher is better).
def fitness(individual):
    """Score a candidate feature subset.

    Args:
        individual: Sequence of 0/1 flags, one per column of ``X``; a 1 keeps
            the column at that position, a 0 drops it.

    Returns:
        A one-element tuple (DEAP expects fitness values as a tuple) holding
        the mean 3-fold cross-validated accuracy of a linear SVM trained on
        the selected columns of ``X_train``. An individual that selects no
        column scores 0.0.
    """
    selected_features = [column for column, keep in zip(X.columns, individual) if keep == 1]

    # An all-zero individual leaves nothing to fit on; give it the worst
    # possible score instead of letting SVC raise on an empty feature matrix.
    if not selected_features:
        return 0.0,

    classifier = SVC(kernel='linear', random_state=42)

    # Score on the training split only. Measuring accuracy on X_test here would
    # let the search tune the feature subset to the test set and bias every
    # test metric reported afterwards.
    fold_scores = cross_val_score(
        classifier,
        X_train[selected_features],
        y_train,
        cv=3,
        scoring='accuracy',
    )

    return float(fold_scores.mean()),

# Define the GA problem: a single objective (accuracy) to maximise.
# Guarded so that re-running the cell does not re-create the creator classes.
if not hasattr(creator, "FitnessMax"):
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMax)

# Seed the RNG that drives initialisation, crossover, mutation and selection
# so the selected feature set is reproducible across runs.
random.seed(42)

# Set up the GA toolbox.
toolbox = base.Toolbox()

# An individual is a list of 0/1 genes, one gene per feature column.
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, len(X.columns))
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Genetic operators: evaluation, crossover, mutation, selection.
toolbox.register("evaluate", fitness)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
toolbox.register("select", tools.selTournament, tournsize=3)

# Initial population.
population = toolbox.population(n=100)

# Run the GA.
NGEN = 20  # number of generations
best_accuracies = []  # best fitness (accuracy) of each generation

for gen in range(NGEN):
    offspring = algorithms.varAnd(population, toolbox, cxpb=0.5, mutpb=0.2)

    # Only individuals touched by crossover/mutation (or never evaluated) have
    # an invalid fitness; unchanged clones keep the score they already earned.
    pending = [ind for ind in offspring if not ind.fitness.valid]
    fits = toolbox.map(toolbox.evaluate, pending)
    for fit, ind in zip(fits, pending):
        ind.fitness.values = fit
    population = toolbox.select(offspring, k=len(population))

    # Record the best fitness (accuracy) of this generation.
    best_ind = tools.selBest(population, k=1)[0]
    best_accuracy = best_ind.fitness.values[0]
    best_accuracies.append(best_accuracy)

    print(f"Generation {gen + 1}: Best Accuracy = {best_accuracy:.5f}")

# Best individual of the final population. Despite its name,
# selected_feature_indices holds column *names*, not positions.
best_individual = tools.selBest(population, k=1)[0]
selected_feature_indices = [X.columns[i] for i in range(len(best_individual)) if best_individual[i] == 1]
print("Selected Features:", selected_feature_indices)

# Recap of the best fitness (accuracy) per generation.
print("Best Accuracies Over Generations:")
for gen, accuracy in enumerate(best_accuracies):
    print(f"Generation {gen + 1}: {accuracy:.5f}")


In [None]:
# Line chart of the best accuracy reached in each GA generation.
import matplotlib.pyplot as plt

plt.plot(range(1, NGEN + 1), best_accuracies)
plt.xlabel("Generation")
plt.ylabel("Best Accuracy")
plt.title("Best Accuracy Over Generations")

# Zoom the y-axis to [0.9, 1], but lower the floor when any generation scored
# below 0.9 so the curve is never clipped out of view.
y_floor = min([0.9, *best_accuracies])
plt.ylim(y_floor, 1)

plt.show()

In [None]:
# Keep only the GA-selected columns in both the training and the test split.
X_train = X_train.loc[:, selected_feature_indices]
X_test = X_test.loc[:, selected_feature_indices]

In [None]:
# Show the (rows, columns) size of the training features after column selection.
X_train.shape

In [None]:
import matplotlib.pyplot as plt
from keras.layers import LSTM, Input, Dense, MultiHeadAttention, Flatten, concatenate
from keras.models import Model
from keras.regularizers import l2, l1
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from tcn import TCN

# Give every sample a time axis of length 1 so the sequence layers accept it:
# (samples, features) -> (samples, 1, features).
X_train_encoded_seq = X_train.to_numpy().reshape(X_train.shape[0], 1, X_train.shape[1])
X_test_encoded_seq = X_test.to_numpy().reshape(X_test.shape[0], 1, X_test.shape[1])

# Hold out part of the training data for validation. Early stopping restores
# the weights with the best val_loss, so validating on the test set would leak
# it into model selection and inflate the metrics reported below.
X_fit_seq, X_val_seq, y_fit, y_val = train_test_split(
    X_train_encoded_seq,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=42,
)

early_stopping = EarlyStopping(
    monitor='val_loss',  # watch the validation loss
    patience=10,         # stop after 10 consecutive epochs without improvement
    restore_best_weights=True  # roll back to the best weights seen
)

# LSTM branch
lstm_input = Input(shape=(1, X_train_encoded_seq.shape[2]))
lstm_layer = LSTM(64, return_sequences=True)(lstm_input)
lstm_layer = Dropout(0.5)(lstm_layer)

# TCN branch
tcn_input = Input(shape=(1, X_train_encoded_seq.shape[2]))
tcn_layer = TCN(return_sequences=True)(tcn_input)
tcn_layer = Dropout(0.5)(tcn_layer)

# Cross-attention: LSTM features query the TCN features.
attention = MultiHeadAttention(num_heads=2, key_dim=64)(query=lstm_layer, key=tcn_layer, value=tcn_layer)

# Concatenate both branches with the attention output and flatten for the classifier head.
merged = concatenate([lstm_layer, tcn_layer, attention], axis=-1)

merged = Flatten()(merged)

# Single sigmoid unit: probability of the positive class.
output_layer = Dense(1, activation='sigmoid')(merged)

combined_model = Model(inputs=[lstm_input, tcn_input], outputs=output_layer)

combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Both inputs receive the same tensor; each branch processes it independently.
combined_model.fit(
    [X_fit_seq, X_fit_seq],
    y_fit,
    epochs=100,
    batch_size=64,
    validation_data=([X_val_seq, X_val_seq], y_val),
    callbacks=[early_stopping],
)

# Predict probabilities on the untouched test set.
y_pred_prob = combined_model.predict([X_test_encoded_seq, X_test_encoded_seq])

# Convert probabilities to binary labels using a threshold (0.5).
# ravel() keeps the result 1-D even when the test set has a single row.
y_pred = (y_pred_prob > 0.5).astype(int)
y_pred = pd.Series(y_pred.ravel(), index=y_test.index)


# Calculate LSTM_TCN_attention metrics
LSTM_TCN_attention_accuracy = accuracy_score(y_test, y_pred)
LSTM_TCN_attention_precision = precision_score(y_test, y_pred,average='weighted')
LSTM_TCN_attention_recall = recall_score(y_test, y_pred,average='weighted')
LSTM_TCN_attention_f1 = f1_score(y_test, y_pred,average='weighted')

print(f"LSTM_TCN_attention Accuracy: {LSTM_TCN_attention_accuracy:.4f}")
print(f"LSTM_TCN_attention Precision: {LSTM_TCN_attention_precision:.4f}")
print(f"LSTM_TCN_attention Recall: {LSTM_TCN_attention_recall:.4f}")
print(f"LSTM_TCN_attention F1-Score: {LSTM_TCN_attention_f1:.4f}")

# confusion_matrix expects (y_true, y_pred): rows are actual classes, columns
# are predicted classes. Label the axes so the orientation is unambiguous.
conf = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(conf , cmap='YlGnBu', fmt='', xticklabels=['0' ,'1'], yticklabels=['0' ,'1'], annot=True)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')


In [None]:
import shap
import pandas as pd
import numpy as np
import seaborn as sns

from tqdm import tqdm
sns.set_style('whitegrid')
import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.callbacks import EarlyStopping

# NOTE(review): this Keras callback and the LabelEncoder below are not
# referenced by the LightGBM loop in this cell; kept so the names stay defined.
early_stopping = EarlyStopping(
    monitor='val_loss',  # watch the validation loss
    patience=10,         # stop after 10 consecutive epochs without improvement
    restore_best_weights=True  # roll back to the best weights seen
)

lencoder = LabelEncoder()

# LightGBM settings for a binary classification objective.
params = {
       'objective': 'binary',
       'num_class' : 1,
       'metric': 'binary_logloss'
   }

train_rmse = 0
val_rmse = 0
n_splits = 10

# Running sum of positive-class probabilities on the test set, one term per fold.
test_preds = np.zeros(len(X_test))

model = LGBMClassifier(**params)

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

for tr_index, val_index in tqdm(skf.split(X_train.values, y_train.values), total=skf.get_n_splits(), desc="k-fold"):

    x_train_o, x_val_o = X_train.iloc[tr_index], X_train.iloc[val_index]
    y_train_o, y_val_o = y_train.iloc[tr_index], y_train.iloc[val_index]

    eval_set = [(x_val_o, y_val_o)]

    # The same estimator object is refitted on every fold, so after the loop
    # `model` holds the fit from the last fold only.
    model.fit(x_train_o, y_train_o, eval_set=eval_set)

    # RMSE is computed as sqrt(MSE); the `squared=False` shortcut of
    # mean_squared_error is deprecated in recent scikit-learn releases.
    train_preds = model.predict(x_train_o)
    train_rmse += np.sqrt(mean_squared_error(y_train_o, train_preds))

    val_preds = model.predict(x_val_o)
    val_rmse += np.sqrt(mean_squared_error(y_val_o, val_preds))

    # Predict on the DataFrame (not .values) so the input keeps the same
    # feature names the model was fitted with.
    test_preds += model.predict_proba(X_test)[:, 1]

print(f"\nAverage Training RMSE : {train_rmse / n_splits}")
print(f"Average Validation RMSE : {val_rmse / n_splits}\n")

# Average the per-fold test probabilities.
test_preds /= n_splits


In [None]:
# Build a SHAP tree explainer around `model` and compute SHAP values for the test features.
# NOTE(review): `model` is whatever the most recently run cell bound to that name —
# presumably the LightGBM classifier as fitted on the last CV fold; confirm execution order.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)


In [None]:
# Plot a SHAP summary of the computed values against the test features.
shap.summary_plot(shap_values, X_test)