In [7]:
import pickle

# Load the pickle files
print("Loading data...")
with open("/Users/claire/Downloads/기계학습/mon_standard.pkl", 'rb') as fi:  # Monitored 데이터 로드
    mon_data = pickle.load(fi)

# 데이터 확인
print("Total sites:", len(mon_data))  # 사이트 개수
total_samples = sum(len(samples) for samples in mon_data.values())
print("Total samples:", total_samples)  # 전체 샘플 수

Loading data...
Total sites: 950
Total samples: 19000


In [35]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 데이터 전처리 함수
def process_data_multi_class(data, max_monitored_labels=95):
    X1, X2, y = [], [], []

    for site_id, samples in enumerate(data.values()):
        if site_id >= max_monitored_labels:
            break
        for sample in samples:
            X1.append([abs(c) for c in sample])
            X2.append([(1 if c > 0 else -1) * 512 for c in sample])
            y.append(site_id)

    return X1, X2, y

# Monitored 데이터 처리
X1, X2, y = process_data_multi_class(mon_data, max_monitored_labels=95)

# 시간 간격 평균(mean) 계산 함수 수정
def calculate_mean_time_intervals(X1):
    mean_intervals = []
    for sample in X1:
        if len(sample) > 1:  # 두 개 이상의 패킷이 있어야 계산 가능
            time_intervals = np.diff(sample)  # 시간 간격 계산
            mean_intervals.append(np.mean(time_intervals))  # 평균 계산
        else:
            mean_intervals.append(0)  # 간격 계산 불가능한 경우 0 추가
    return np.array(mean_intervals)

def create_features(X1, X2):
    X = []
    for i in range(len(X1)):
        packet_size_direction = sum(X2[i])
        cumulative_packet_size = np.sum([abs(c) for c in X2[i]])
        burst_lengths = len([c for c in X2[i] if c != 0])

        num_incoming_packets = len([c for c in X2[i] if c > 0])
        ratio_incoming_packets = num_incoming_packets / len(X2[i]) if len(X2[i]) > 0 else 0
        num_outgoing_packets = len([c for c in X2[i] if c < 0])
        total_packet_count = len(X2[i])

        feature_vector = [
            packet_size_direction,
            np.mean(X1[i]) if len(X1[i]) > 0 else 0,
            cumulative_packet_size,
            burst_lengths,
            num_incoming_packets,
            ratio_incoming_packets,
            num_outgoing_packets,
            total_packet_count
        ]
        X.append(feature_vector)
    return np.array(X)


# 피처 생성
X = create_features(X1, X2)

# 시간 간격 평균 피처 추가
mean_time_intervals = calculate_mean_time_intervals(X1)
X = np.hstack((X, mean_time_intervals.reshape(-1, 1)))  # 새로운 피처 결합

# 레이블 변환
y = np.array(y)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 데이터 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Random Forest 모델 생성 및 학습 (주어진 하이퍼파라미터 사용)
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
    n_jobs=-1
)

gb_model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.001,
    max_depth=3,
    random_state=42
)

svm_model = SVC(
    C = 100,
    gamma=1,
    kernel='rbf',
    probability=True,
    random_state=42
)

# VotingClassifier 정의
voting_clf = VotingClassifier(
    estimators=[
        ('random_forest', rf_model),
        ('gradient_boosting', gb_model),
        ('svm', svm_model)
    ],
    voting='soft',
    weights=[2, 1, 2]
)

# Voting Classifier 학습
voting_clf.fit(X_train, y_train)

# 모델 평가
y_pred = voting_clf.predict(X_test)

# 평가 지표 출력
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.6605263157894737
Confusion Matrix:
 [[3 0 0 ... 0 0 0]
 [0 3 0 ... 0 0 0]
 [0 0 4 ... 0 0 0]
 ...
 [0 0 0 ... 4 0 0]
 [0 0 0 ... 0 4 0]
 [0 0 0 ... 0 1 2]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.75      0.75         4
           1       0.60      0.75      0.67         4
           2       0.80      1.00      0.89         4
           3       0.80      1.00      0.89         4
           4       1.00      0.75      0.86         4
           5       1.00      0.50      0.67         4
           6       0.67      0.50      0.57         4
           7       0.50      0.75      0.60         4
           8       0.75      0.75      0.75         4
           9       0.75      0.75      0.75         4
          10       0.50      0.25      0.33         4
          11       0.50      0.25      0.33         4
          12       0.33      0.25      0.29         4
          13       0.40      0.50      0.44        

In [4]:
# Unmonitored 데이터 라벨 확인
print(f"mon 데이터 라벨: {set(y)}")


Unmonitored 데이터 라벨: {np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11), np.int64(12), np.int64(13), np.int64(14), np.int64(15), np.int64(16), np.int64(17), np.int64(18), np.int64(19), np.int64(20), np.int64(21), np.int64(22), np.int64(23), np.int64(24), np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29), np.int64(30), np.int64(31), np.int64(32), np.int64(33), np.int64(34), np.int64(35), np.int64(36), np.int64(37), np.int64(38), np.int64(39), np.int64(40), np.int64(41), np.int64(42), np.int64(43), np.int64(44), np.int64(45), np.int64(46), np.int64(47), np.int64(48), np.int64(49), np.int64(50), np.int64(51), np.int64(52), np.int64(53), np.int64(54), np.int64(55), np.int64(56), np.int64(57), np.int64(58), np.int64(59), np.int64(60), np.int64(61), np.int64(62), np.int64(63), np.int64(64), np.int64(65), np.int64(66), np.int64(67), np.int64(68), np.int64(69), np.int64(

In [5]:
# 결과 평가
print("데이터 라벨 분포 확인:")
print(pd.Series(y).value_counts())  # 라벨 분포 확인

데이터 라벨 분포 확인:
0     20
60    20
69    20
68    20
67    20
      ..
29    20
28    20
27    20
26    20
94    20
Name: count, Length: 95, dtype: int64


In [10]:
# Monitored와 Unmonitored 데이터의 라벨 분포 확인
def validate_labels(y_mon):
    """
    Monitored와 Unmonitored 데이터의 라벨 분포 검증
    :param y_mon: Monitored 데이터 라벨 리스트
    :param y_unmon: Unmonitored 데이터 라벨 리스트
    :return: 검증 결과 출력
    """
    print("\n--- 검증 시작 ---")
    
    # Monitored 라벨 검증
    mon_unique_labels = set(y_mon)
    if mon_unique_labels == set(range(95)):  # 0 ~ 94
        print("Monitored 데이터 라벨링: 정상 (0 ~ 94)")
    else:
        print(f"Monitored 데이터 라벨링 오류: {mon_unique_labels}")
    
    print("--- 검증 완료 ---\n")

# Monitored와 Unmonitored 데이터 각각 분리
monitored_labels = y_mon  # Monitored 라벨 리스트 

# 검증 함수 실행
validate_labels(monitored_labels)



--- 검증 시작 ---
Monitored 데이터 라벨링: 정상 (0 ~ 94)
--- 검증 완료 ---



In [12]:
print(y_mon)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,

In [14]:
import matplotlib.pyplot as plt

# 라벨 분포 시각화 함수
def plot_label_distribution(y_mon):
    """
    Monitored와 Unmonitored 데이터의 라벨 분포 시각화
    :param y_mon: Monitored 데이터 라벨 리스트
    :param y_unmon: Unmonitored 데이터 라벨 리스트
    """
    # Monitored 데이터 라벨 분포
    plt.figure(figsize=(12, 6))
    
    # Monitored 라벨 분포 히스토그램
    plt.subplot(1, 2, 1)
    plt.hist(y_mon, bins=len(set(y_mon)), color='blue', alpha=0.7, edgecolor='black')
    plt.title("Monitored Label Distribution")
    plt.xlabel("Labels (0 to 94)")
    plt.ylabel("Frequency")
    plt.xticks(range(0, 95, 10))  # x축 간격 설정
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    
    # 그래프 표시
    plt.tight_layout()
    plt.show()

# 시각화 함수 실행
plot_label_distribution(monitored_labels)


ModuleNotFoundError: No module named 'matplotlib'

In [6]:
# Monitored 데이터 샘플 수 확인
num_monitored_samples = len(mon_data)
print(f"Monitored 데이터 샘플 수: {num_monitored_samples}")


Monitored 데이터 샘플 수: 950
