In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training and evaluation CSV exports into DataFrames.
TRAIN_CSV, TEST_CSV = 'train1.csv', 'test1.csv'
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
# Maps raw "P1_*" tag names to shorter descriptive aliases; it is passed to
# DataFrame.rename(columns=...) for both the train and test frames.
# NOTE(review): the aliases look like the author's interpretation of each tag;
# their physical meaning cannot be verified from this notebook — confirm against
# the dataset documentation.
p1_dict = {
    "P1_B2004": "hx_press_sp",
    "P1_B2016": "power_press_demand",
    "P1_B3004": "rtn_level_sp",
    "P1_B3005": "rtn_flow_sp",
    "P1_B4002": "hx_temp_sp",
    "P1_B4005": "temp_pid_out",
    "P1_B400B": "heat_outflow_sp",
    "P1_B4022": "power_temp_demand",

    "P1_FCV01D": "fcv01_cmd",
    "P1_FCV01Z": "fcv01_pos",
    "P1_FCV02D": "fcv02_cmd",
    "P1_FCV02Z": "fcv02_pos",
    "P1_FCV03D": "fcv03_cmd",
    "P1_FCV03Z": "fcv03_pos",

    "P1_FT01": "rtn_flow_raw",
    "P1_FT01Z": "rtn_flow",
    "P1_FT02": "heat_flow_raw",
    "P1_FT02Z": "heat_flow",
    "P1_FT03": "rtn_flow2_raw",
    "P1_FT03Z": "rtn_flow2",

    "P1_LCV01D": "lcv01_cmd",
    "P1_LCV01Z": "lcv01_pos",

    "P1_LIT01": "rtn_level",

    "P1_PCV01D": "pcv01_cmd",
    "P1_PCV01Z": "pcv01_pos",
    "P1_PCV02D": "pcv02_cmd",
    "P1_PCV02Z": "pcv02_pos",

    "P1_PIT01": "hx_press",
    "P1_PIT01_HH": "hx_press_high",
    "P1_PIT02": "pump_supply_press",

    "P1_PP01AD": "pump1A_cmd",
    "P1_PP01AR": "pump1A_run",
    "P1_PP01BD": "pump1B_cmd",
    "P1_PP01BR": "pump1B_run",
    "P1_PP02D": "pump2_cmd",
    "P1_PP02R": "pump2_run",

    "P1_PP04": "cooler_out",
    "P1_PP04SP": "cooler_temp_sp",

    "P1_SOL01D": "sol_supply_cmd",
    "P1_SOL03D": "sol_drain_cmd",

    "P1_STSP": "boiler_run_cmd",

    "P1_TIT01": "hx_temp",
    "P1_TIT02": "heat_tank_temp",
    "P1_TIT03": "main_tank_temp"
}
# Apply the readable aliases to both frames; the names are rebound to the
# renamed result rather than mutating the frames in place.
train_df = train_df.rename(columns=p1_dict)
test_df = test_df.rename(columns=p1_dict)

In [None]:
# Earlier inline version of the feature engineering, kept only as an inert
# string literal for reference (its first line reads "original code" in Korean).
# The statements inside the string are never executed; the same columns are
# produced by add_custom_features().
''' 원본 코드
df['fcv03_diff'] = abs(df['fcv03_pos'] - df['fcv03_cmd'])
df['fcv03_cng'] = abs(df['fcv03_cmd'].diff().fillna(0))
df['level_diff'] = df['rtn_level'].diff().fillna(0)
df['level_diff_abs']=abs(df['level_diff'])
df['flow_balance'] = df['rtn_flow'] - df['rtn_flow2']
df['flow_balance_abs'] = abs(df['flow_balance'])
df['phys_diff'] = df['level_diff'] - df['flow_balance']
df['phys_diff_abs'] = abs(df['phys_diff'])
'''


In [4]:
def add_custom_features(df):
    """Return a copy of ``df`` extended with eight derived columns.

    The input frame is not modified. It must provide the columns
    ``fcv03_pos``, ``fcv03_cmd``, ``rtn_level``, ``rtn_flow`` and ``rtn_flow2``.

    Added columns, in order:
        fcv03_diff        -- |fcv03_pos - fcv03_cmd| (valve position vs. command)
        fcv03_cng         -- |row-to-row change of fcv03_cmd|
        level_diff        -- row-to-row change of rtn_level
        level_diff_abs    -- |level_diff|
        flow_balance      -- rtn_flow - rtn_flow2
        flow_balance_abs  -- |flow_balance|
        phys_diff         -- level_diff - flow_balance
        phys_diff_abs     -- |phys_diff|

    NaN values produced by ``diff()`` (including the first row) are replaced
    with 0 before any further arithmetic.
    """
    valve_cmd = df['fcv03_cmd']
    level_step = df['rtn_level'].diff().fillna(0)
    net_flow = df['rtn_flow'] - df['rtn_flow2']
    residual = level_step - net_flow

    # assign() works on a copy and keeps the keyword order as the column order.
    return df.assign(
        fcv03_diff=(df['fcv03_pos'] - valve_cmd).abs(),
        fcv03_cng=valve_cmd.diff().fillna(0).abs(),
        level_diff=level_step,
        level_diff_abs=level_step.abs(),
        flow_balance=net_flow,
        flow_balance_abs=net_flow.abs(),
        phys_diff=residual,
        phys_diff_abs=residual.abs(),
    )


In [5]:
# Derive the engineered columns for both splits with the same function.
train_df, test_df = map(add_custom_features, (train_df, test_df))


In [9]:
# Columns kept for modelling: engineered features, raw signals, then the label.
feature_cols = [
    # columns created by add_custom_features
    'phys_diff_abs', 'flow_balance_abs', 'level_diff_abs', 'fcv03_cng', 'fcv03_diff',
    # raw signals
    'rtn_level', 'rtn_flow', 'rtn_flow2', 'fcv03_cmd', 'fcv03_pos', 'hx_press',
    # label column
    'attack',
]

In [10]:
# Keep only the modelling columns (features + label) in both frames.
train_df = train_df.loc[:, feature_cols]
test_df = test_df.loc[:, feature_cols]

In [11]:
# Model input columns: same as the modelling columns but without the 'attack' label.
feature_cols2 = [
    # columns created by add_custom_features
    'phys_diff_abs', 'flow_balance_abs', 'level_diff_abs', 'fcv03_cng', 'fcv03_diff',
    # raw signals
    'rtn_level', 'rtn_flow', 'rtn_flow2', 'fcv03_cmd', 'fcv03_pos', 'hx_press',
]

In [12]:
# Training split: feature matrix and label vector (the author notes the
# training labels are all zero).
X_train, y_train = train_df[feature_cols2], train_df["attack"]

# Test split: same feature columns plus the ground-truth label.
X_test, y_test = test_df[feature_cols2], test_df["attack"]


In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
from xgboost import XGBClassifier

import warnings

# Supervised baseline: gradient-boosted trees on the scaled features.
# A binary classifier needs both classes in its training labels. When y_train
# holds a single class, the model has no positive examples to learn from, so
# surface that explicitly instead of fitting silently.
if y_train.nunique() < 2:
    warnings.warn(
        "y_train contains a single class; XGBClassifier cannot learn to "
        "separate attacks from normal rows and its test scores will be "
        "uninformative. Use an unsupervised detector or add labelled "
        "attack rows to the training data.",
        RuntimeWarning,
    )

model = XGBClassifier(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    # Attacks are very rare in test.csv, so the positive class is up-weighted.
    # NOTE(review): this weight only acts on positive training rows; it has
    # nothing to act on if the training labels contain no positives.
    scale_pos_weight=20,
    eval_metric='logloss',
    random_state=42
)

model.fit(X_train_scaled, y_train)


In [16]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Decision threshold applied to the predicted attack probability.
threshold = 0.4

# Column 1 of predict_proba is the probability of the positive (attack) class.
y_prob = model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_prob >= threshold).astype(int)

# Confusion matrix and per-class report, one after the other, then ROC AUC.
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")
print("AUC:", roc_auc_score(y_test, y_prob))


[[42572     0]
 [  629     0]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     42572
           1       0.00      0.00      0.00       629

    accuracy                           0.99     43201
   macro avg       0.49      0.50      0.50     43201
weighted avg       0.97      0.99      0.98     43201

AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Isolation Forest

In [17]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Input columns for the detector (the 'attack' label is excluded).
feature_cols = [
    'phys_diff_abs', 'flow_balance_abs', 'level_diff_abs',
    'fcv03_cng', 'fcv03_diff',
    'rtn_level', 'rtn_flow', 'rtn_flow2',
    'fcv03_cmd', 'fcv03_pos', 'hx_press',
]

X_train, X_test = train_df[feature_cols], test_df[feature_cols]
y_test = test_df['attack']   # 0/1 labels (the test set contains attacks)

# Scaling: statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Isolation Forest, fitted on the training split only (normal data).
# contamination is a rough guess of the anomaly fraction, to be tuned later.
iso = IsolationForest(n_estimators=300, contamination=0.01, random_state=42)
iso.fit(X_train_scaled)

# predict() yields 1 for normal rows and -1 for anomalies.
y_pred_iso = iso.predict(X_test_scaled)

# Map -1 -> attack (1) and 1 -> normal (0).
y_pred_attack = (y_pred_iso == -1).astype(int)

print(confusion_matrix(y_test, y_pred_attack), classification_report(y_test, y_pred_attack), sep="\n")


[[42446   126]
 [  322   307]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     42572
           1       0.71      0.49      0.58       629

    accuracy                           0.99     43201
   macro avg       0.85      0.74      0.79     43201
weighted avg       0.99      0.99      0.99     43201



In [18]:
from sklearn.metrics import recall_score
from sklearn.ensemble import IsolationForest

# Sweep over candidate contamination values and print test-set recall for each.
cont_list = [0.003, 0.005, 0.01, 0.015, 0.02]

# With a fixed random_state the trees do not depend on `contamination`; that
# parameter only sets the score threshold (offset_) as a percentile of the
# training scores. So the forest is fitted once and each threshold is derived
# from the cached scores, which reproduces what refitting per value would predict.
# Side effect: `iso` is left fitted with the default contamination.
iso = IsolationForest(n_estimators=300, random_state=42)
iso.fit(X_train_scaled)
train_scores = iso.score_samples(X_train_scaled)
test_scores = iso.score_samples(X_test_scaled)

for c in cont_list:
    # Same threshold IsolationForest(contamination=c) would store in offset_.
    offset = np.percentile(train_scores, 100.0 * c)
    # predict() flags a row as anomalous when score - offset < 0.
    y_pred = ((test_scores - offset) < 0).astype(int)
    rec = recall_score(y_test, y_pred)
    print(c, rec)


0.003 0.30842607313195547
0.005 0.3863275039745628
0.01 0.48807631160572335
0.015 0.5580286168521462
0.02 0.5977742448330684


In [19]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor in novelty mode, so that predict() can be called on
# data that was not part of fit().
lof = LocalOutlierFactor(n_neighbors=50, novelty=True)
lof.fit(X_train_scaled)

# predict() yields -1 for outliers; convert to 1 = attack, 0 = normal.
lof_labels = lof.predict(X_test_scaled)
y_pred = (lof_labels == -1).astype(int)


In [21]:
# Report for whatever predictions are currently held in y_pred.
for build_report in (confusion_matrix, classification_report):
    print(build_report(y_test, y_pred))

[[31099 11473]
 [  306   323]]
              precision    recall  f1-score   support

           0       0.99      0.73      0.84     42572
           1       0.03      0.51      0.05       629

    accuracy                           0.73     43201
   macro avg       0.51      0.62      0.45     43201
weighted avg       0.98      0.73      0.83     43201



In [22]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Input columns for the detector (the 'attack' label is excluded).
feature_cols = [
    'phys_diff_abs', 'flow_balance_abs', 'level_diff_abs',
    'fcv03_cng', 'fcv03_diff',
    'rtn_level', 'rtn_flow', 'rtn_flow2',
    'fcv03_cmd', 'fcv03_pos', 'hx_press',
]

X_train, X_test = train_df[feature_cols], test_df[feature_cols]
y_test = test_df['attack']   # 0/1 labels (the test set contains attacks)

# Scaling: statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Isolation Forest, fitted on the training split only (normal data).
# contamination is a rough guess of the anomaly fraction, to be tuned later.
iso = IsolationForest(n_estimators=300, contamination=0.02, random_state=42)
iso.fit(X_train_scaled)

# predict() yields 1 for normal rows and -1 for anomalies.
y_pred_iso = iso.predict(X_test_scaled)

# Map -1 -> attack (1) and 1 -> normal (0).
y_pred_attack = (y_pred_iso == -1).astype(int)

print(confusion_matrix(y_test, y_pred_attack), classification_report(y_test, y_pred_attack), sep="\n")

[[42322   250]
 [  253   376]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     42572
           1       0.60      0.60      0.60       629

    accuracy                           0.99     43201
   macro avg       0.80      0.80      0.80     43201
weighted avg       0.99      0.99      0.99     43201



In [24]:
from sklearn.metrics import recall_score
from sklearn.ensemble import IsolationForest

# Wider sweep over candidate contamination values; prints test-set recall for each.
# NOTE(review): 0.07 sits between 0.005 and 0.01 — possibly meant to be 0.007.
# The value is kept as written; confirm the intent.
cont_list = [0.003, 0.005, 0.07, 0.01, 0.015, 0.02, 0.025, 0.03]

# With a fixed random_state the trees do not depend on `contamination`; that
# parameter only sets the score threshold (offset_) as a percentile of the
# training scores. So the forest is fitted once and each threshold is derived
# from the cached scores, which reproduces what refitting per value would predict.
# Side effect: `iso` is left fitted with the default contamination.
iso = IsolationForest(n_estimators=300, random_state=42)
iso.fit(X_train_scaled)
train_scores = iso.score_samples(X_train_scaled)
test_scores = iso.score_samples(X_test_scaled)

for c in cont_list:
    # Same threshold IsolationForest(contamination=c) would store in offset_.
    offset = np.percentile(train_scores, 100.0 * c)
    # predict() flags a row as anomalous when score - offset < 0.
    y_pred = ((test_scores - offset) < 0).astype(int)
    rec = recall_score(y_test, y_pred)
    print(c, rec)

0.003 0.30842607313195547
0.005 0.3863275039745628
0.07 0.6677265500794912
0.01 0.48807631160572335
0.015 0.5580286168521462
0.02 0.5977742448330684
0.025 0.6200317965023847
0.03 0.6327503974562798


In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report

# Input columns for the detector (the 'attack' label is excluded).
feature_cols = [
    'phys_diff_abs', 'flow_balance_abs', 'level_diff_abs',
    'fcv03_cng', 'fcv03_diff',
    'rtn_level', 'rtn_flow', 'rtn_flow2',
    'fcv03_cmd', 'fcv03_pos', 'hx_press',
]

X_train, X_test = train_df[feature_cols], test_df[feature_cols]
y_test = test_df['attack']   # 0/1 labels (the test set contains attacks)

# Scaling: statistics come from the training split only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Isolation Forest, fitted on the training split only (normal data).
iso = IsolationForest(n_estimators=300, contamination=0.07, random_state=42)
iso.fit(X_train_scaled)

# predict() yields 1 for normal rows and -1 for anomalies.
y_pred_iso = iso.predict(X_test_scaled)

# Map -1 -> attack (1) and 1 -> normal (0).
y_pred_attack = (y_pred_iso == -1).astype(int)

print(confusion_matrix(y_test, y_pred_attack), classification_report(y_test, y_pred_attack), sep="\n")

[[41917   655]
 [  209   420]]
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     42572
           1       0.39      0.67      0.49       629

    accuracy                           0.98     43201
   macro avg       0.69      0.83      0.74     43201
weighted avg       0.99      0.98      0.98     43201

