In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Column names applied to the header-less CSV files; the last entry is the label column.
columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate",
    "srv_rerror_rate", "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate",
    "label",
]

# Replace these with the actual dataset paths.
labelled_data_path = 'kddcup_1_percent'
unlabelled_data_path = 'kddcup_1_percent.unlabeled'
except_path = 'kddcup_1_percent'

# Load the labelled data set (the files carry no header row).
df_labelled = pd.read_csv(labelled_data_path, header=None)

# Load the unlabelled data set and the "except" data set.
df_unlabelled = pd.read_csv(unlabelled_data_path, header=None)
df_except = pd.read_csv(except_path, header=None)

# Attach the column names; the unlabelled file has no label column.
df_labelled.columns = columns
df_unlabelled.columns = columns[:-1]
df_except.columns = columns

# Categorical (string-valued) feature columns that need integer encoding.
categorical_columns = ["protocol_type", "service", "flag"]

# One LabelEncoder per categorical feature, keyed by column name.
le_dict = {}
for col in categorical_columns:
    le_dict[col] = LabelEncoder()

# Fit each encoder on the values found in all three frames, then encode the
# labelled, unlabelled and "except" frames with that same encoder.
for col in categorical_columns:
    le = le_dict[col]
    le.fit(pd.concat([df_labelled[col], df_unlabelled[col], df_except[col]]))
    df_labelled[col] = le.transform(df_labelled[col])
    df_unlabelled[col] = le.transform(df_unlabelled[col])
    df_except[col] = le.transform(df_except[col])

# Binarise the label: 'normal.' becomes 0, every other value becomes 1.
df_labelled['label'] = df_labelled['label'].map(lambda x: 0 if not (x != 'normal.') else 1)
df_except['label'] = df_except['label'].map(lambda x: 0 if not (x != 'normal.') else 1)

# Split the labelled data into train and test sets BEFORE fitting the scaler.
# Fitting StandardScaler on the whole labelled frame would leak the mean and
# variance of the held-out test rows into the preprocessing step.
X = df_labelled.drop(columns=['label'])
y = df_labelled['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=42)

# Feature standardisation: statistics are learned from the training rows only
# and then applied unchanged to every other frame.
scaler = StandardScaler()
scaler.fit(X_train)

df_labelled[df_labelled.columns[:-1]] = scaler.transform(df_labelled[df_labelled.columns[:-1]])
df_unlabelled[df_unlabelled.columns] = scaler.transform(df_unlabelled[df_unlabelled.columns])
df_except[df_except.columns[:-1]] = scaler.transform(df_except[df_except.columns[:-1]])

# Re-derive the feature matrices so they hold the scaled values; the row
# membership of the split (and therefore y_train / y_test) is unchanged.
X = df_labelled.drop(columns=['label'])
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Use all labelled rows of the training split.
X_train_labelled = X_train
y_train_labelled = y_train

# Unlabelled pool.
X_train_unlabelled = df_unlabelled

# Processed df_except, split into features and labels.
X_except = df_except.drop(columns=['label'])
y_except = df_except['label']

# Report the state of the data sets.
print(f"Labelled Data Shape: {df_labelled.shape}")
print(f"Unlabelled Data Shape: {df_unlabelled.shape}")
print(f"Except Data Shape: {df_except.shape}")

# Report the shapes after splitting.
print(f"X_train Labelled Shape: {X_train_labelled.shape}")
print(f"y_train Labelled Shape: {y_train_labelled.shape}")
print(f"X_test Shape: {X_test.shape}")
print(f"y_test Shape: {y_test.shape}")
print(f"X_except Shape: {X_except.shape}")
print(f"y_except Shape: {y_except.shape}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Data overview: print summary statistics for each of the three frames.
print("Labelled Data Statistics:", df_labelled.describe(), sep="\n")
print("\nUnlabelled Data Statistics:", df_unlabelled.describe(), sep="\n")
print("\nExcept Data Statistics:", df_except.describe(), sep="\n")

# Feature distribution visualisation.
def plot_feature_distribution(data, columns):
    """Show one histogram (with a KDE overlay) per requested column.

    Args:
        data: Object indexable by column name; ``data[col]`` is handed to
            ``sns.histplot``.
        columns: Iterable of column names; each one gets its own 12x6 figure.

    Returns:
        None. Every figure is displayed with ``plt.show()``.
    """
    for col in columns:
        fig, ax = plt.subplots(figsize=(12, 6))
        sns.histplot(data[col], kde=True, ax=ax)
        ax.set_title(f"Distribution of {col}")
        plt.show()

# Plot the distribution of every feature column of the labelled data.
plot_feature_distribution(df_labelled, df_labelled.columns[:-1])

# Label distribution.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='label', data=df_labelled, ax=ax)
ax.set_title('Label Distribution in Labelled Data')
plt.show()

# Correlation matrix of the labelled frame.
corr_matrix = df_labelled.corr()

# Heat map of the correlation matrix.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

# Example violin plot: duration grouped by label.
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(x="label", y="duration", data=df_labelled, ax=ax)
ax.set_title("Violin Plot of Duration by Label")
plt.show()

# Note: adjust the plotting parameters and the data selection as needed.


In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initial model, trained on the labelled training split only.
clf = SVC(probability=True, random_state=42)
clf.fit(X_train_labelled, y_train_labelled)

# Self-training / active-learning settings.
max_iterations = 10
confidence_threshold = 0.9
low_confidence_sample_size = 100  # upper bound on samples sent to the oracle per iteration

# Oracle lookup (feature tuple -> label) built once from df_except. Building it
# inside the loop would repeat an expensive row-wise conversion every iteration.
# If identical feature rows occur more than once, the last label seen wins.
feature_columns = list(X_train_unlabelled.columns)
oracle_labels = dict(zip(
    df_except[feature_columns].itertuples(index=False, name=None),
    df_except['label'],
))

# Combined self-training (pseudo-labelling) and active-learning loop.
for iteration in range(max_iterations):
    # predict_proba raises on an empty frame, so stop once the pool is exhausted.
    if len(X_train_unlabelled) == 0:
        break

    # --- Self-training part ---
    y_unlabelled_proba = clf.predict_proba(X_train_unlabelled)
    confidence = np.max(y_unlabelled_proba, axis=1)
    # Take the pseudo-label from the same probabilities that define the
    # confidence: SVC.predict can disagree with the argmax of predict_proba.
    y_unlabelled_pred = clf.classes_[np.argmax(y_unlabelled_proba, axis=1)]

    # Select the high-confidence samples.
    is_confident = confidence >= confidence_threshold
    high_confidence_indices = np.flatnonzero(is_confident)

    if len(high_confidence_indices) == 0:
        break

    X_high_confidence = X_train_unlabelled.iloc[high_confidence_indices]
    # Keep the sample index on the labels so X and y stay aligned row by row.
    y_high_confidence = pd.Series(
        y_unlabelled_pred[high_confidence_indices],
        index=X_high_confidence.index,
    )

    # Add the high-confidence samples to the labelled training set.
    X_train_labelled = pd.concat([X_train_labelled, X_high_confidence])
    y_train_labelled = pd.concat([y_train_labelled, y_high_confidence])

    # Remove them from the unlabelled pool (positional, so it does not rely on
    # unique index labels).
    X_train_unlabelled = X_train_unlabelled.iloc[np.flatnonzero(~is_confident)]
    # The model has not changed, so the confidences of the remaining rows can be
    # reused instead of running predict_proba a second time.
    remaining_confidence = confidence[~is_confident]

    # --- Active-learning part ---
    # Never request more samples than remain; a local count is used so the
    # configured low_confidence_sample_size is not shrunk for later iterations.
    n_query = min(low_confidence_sample_size, len(X_train_unlabelled))

    if n_query > 0:
        # Pick the least confident remaining samples.
        low_confidence_indices = np.argsort(remaining_confidence)[:n_query]
        X_low_confidence = X_train_unlabelled.iloc[low_confidence_indices]

        # Fetch the label of each queried sample from df_except, one by one and
        # in sample order, so every label belongs to its own row.
        queried_labels = [
            oracle_labels.get(row)
            for row in X_low_confidence[feature_columns].itertuples(index=False, name=None)
        ]
        found = np.array([label is not None for label in queried_labels], dtype=bool)

        # Report samples for which no label could be found.
        if not found.all():
            print(f"Warning: Only found {int(found.sum())} labels for low confidence samples.")

        if found.any():
            labelled_positions = low_confidence_indices[found]
            X_low_confidence = X_train_unlabelled.iloc[labelled_positions]
            y_low_confidence_labels = pd.Series(
                [label for label in queried_labels if label is not None],
                index=X_low_confidence.index,
                dtype=y_train_labelled.dtype,
            )

            # Add the newly labelled samples to the training set.
            X_train_labelled = pd.concat([X_train_labelled, X_low_confidence])
            y_train_labelled = pd.concat([y_train_labelled, y_low_confidence_labels])

            # Remove only the samples that actually received a label; samples
            # without one stay in the pool.
            keep = np.ones(len(X_train_unlabelled), dtype=bool)
            keep[labelled_positions] = False
            X_train_unlabelled = X_train_unlabelled.iloc[np.flatnonzero(keep)]

    # Retrain the model on the enlarged labelled set.
    clf.fit(X_train_labelled, y_train_labelled)

# Model evaluation on the held-out test split.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))
