In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score as f1
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle

In [2]:
df = pd.read_csv('data/data.csv',sep=';', low_memory=False)

In [3]:
#удаляем лишние пробелы из названия колонок

df.columns = df.columns.str.replace(' ', '')

ОПРЕДЕЛЯЕМ ОСНОВНЫЕ ДАТЧИКИ, КОТОРЫЕ ПОДВЕРГАЮТСЯ АТАКАМ

In [4]:
#Чтение файла

df2 = pd.read_excel('data/List_of_attacks_Final.xlsx')
df2 = df2.drop(df2[df2['Attack Point'].isna() == True].index)

In [5]:
#Удаляем строки No Physical Impact Attack

df2 = df2.drop(df2[df2['Attack Point'] == 'No Physical Impact Attack'].index)

In [6]:
#Получаем список уникальных значений из колонки Attack Point

print("количество датчиков, которые подвергаются атакам", len(df2['Attack Point'].unique()))
print()
print("Список датчиков")
print(df2['Attack Point'].unique())

количество датчиков, которые подвергаются атакам 28

Список датчиков
['MV-101' 'P-102' 'LIT-101' 'MV-504' 'AIT-202' 'LIT-301' 'DPIT-301'
 'FIT-401' 'MV-304' 'Mv-303' 'MV-303' 'AIT-504' 'MV-101, LIT-101'
 'UV-401, AIT-502, P-501' 'P-602, DIT-301, MV-302' 'P-203, P-205'
 'LIT-401, P-401' 'P-101, LIT-301' 'P-302, LIT-401' 'P-302'
 'P-201, P-203, P-205' 'LIT-101, P-101, MV-201' 'LIT-401' 'P-101'
 'P-101; P-102' 'P-501, FIT-502' 'AIT-402, AIT-502' 'FIT-401, AIT-502']


In [7]:
#Сплитим значения по пробелу и добавляем в новый чистый список

features = []

for col in df2['Attack Point'].unique():
    features += col.split(', ')

#уберем символ "-" из названия фичей и сделаем все буквы заглавными

for i in range(len(features)):
    features[i] = features[i].replace('-', "")
    features[i] = features[i].upper()
    
features = list(set(features))

In [8]:
#удалим фичи которых нет в тренировочном наборе данных

for i in features:
    if i not in df.columns:
        features.remove(i)

In [9]:
#создадим новый датафрейм в котором будут только фичи из списка features

df_new = pd.DataFrame()

for colname in features:
    df_new[colname] = df[colname].copy()

In [10]:
#Добавим к датасету целевую переменную, чтобы в случае удаления лишних строк, длины тренировочного набора и серия с меткой совпали

df_new['Normal/Attack'] = df['Normal/Attack'].copy()
df_new.head(5)

Unnamed: 0,AIT502,P205,P101,AIT504,AIT202,P201,AIT402,UV401,P501,MV303,...,MV304,P401,MV101,P102,MV201,FIT401,LIT101,P302,LIT301,Normal/Attack
0,1451166,2,2,1203538,8396437,1,1560882,2,2,1,...,1,1,2,1,2,1713517,5228467,2,9561651,Normal
1,1451166,2,2,1203538,8396437,1,1560882,2,2,1,...,1,1,2,1,2,1715952,522886,2,9561651,Normal
2,1451166,2,2,1203538,8394514,1,1560882,2,2,1,...,1,1,2,1,2,1715952,5228467,2,9564855,Normal
3,1450141,2,2,1203538,8394514,1,1560882,2,2,1,...,1,1,2,1,2,171467,5229645,2,956806,Normal
4,1448859,2,2,1203538,8394514,1,1560882,2,2,1,...,1,1,2,1,2,171467,5234748,2,9570864,Normal


In [11]:
df_new.columns

Index(['AIT502', 'P205', 'P101', 'AIT504', 'AIT202', 'P201', 'AIT402', 'UV401',
       'P501', 'MV303', 'P602', 'P203', 'LIT401', 'DPIT301', 'FIT502', 'MV302',
       'MV304', 'P401', 'MV101', 'P102', 'MV201', 'FIT401', 'LIT101', 'P302',
       'LIT301', 'Normal/Attack'],
      dtype='object')

In [12]:
#Убираем лишние пробелы из значений целевой переменной

df_new['Normal/Attack'] = df_new['Normal/Attack'].str.replace(' ', "")

In [13]:
unique_values = []

for column in df_new.columns:
    if len(df_new[column].unique()) < 2:
        df_new = df_new.drop(column, axis=1)

In [14]:
num_cols = []

for columns in df_new.columns:
    if len(df_new[columns].unique())> 3:
        num_cols += [columns]

In [15]:
for num_col in num_cols:
    df_new[num_col] = df_new[num_col].str.replace(',', '.').astype('float')

In [16]:
df_new.isna().sum()

AIT502           0
P205             0
P101             0
AIT504           0
AIT202           0
P201             0
AIT402           0
UV401            0
P501             0
MV303            0
P602             0
P203             0
LIT401           0
DPIT301          0
FIT502           0
MV302            0
MV304            0
MV101            0
P102             0
MV201            0
FIT401           0
LIT101           0
P302             0
LIT301           0
Normal/Attack    0
dtype: int64

In [17]:
#Построив сводную таблицу, мы видим, что в случае когда FIT401 = 0, в подавляющм числе случаев это признак атаки

pd.pivot_table(df_new[df_new.FIT401 == 0], values='FIT401',
                    columns=['Normal/Attack'], aggfunc='count')

Normal/Attack,Attack,Normal
FIT401,32758,104


In [18]:
#А в случаях, когда FIT401 больше 0, в большинстве своем наблюдения имеют метку Normal

pd.pivot_table(df_new[df_new.FIT401 > 0], values='FIT401',
                    columns=['Normal/Attack'], aggfunc='count')

Normal/Attack,Attack,Normal
FIT401,21863,395194


In [19]:
#посчитаем сколько строк с меткой Normal и Attack

print(len(df_new[df_new['Normal/Attack'] == 'Attack']))
print(len(df_new[df_new['Normal/Attack'] == 'Normal']))

54621
395298


In [20]:
#Заменим в нашей целевой переменной метки Normal = 0, Attack = 1

df_new['Normal/Attack'] = df_new['Normal/Attack'].replace('Normal', 0)
df_new['Normal/Attack'] = df_new['Normal/Attack'].replace('Attack', 1)

In [21]:
X = df_new.drop('Normal/Attack', axis=1).copy()
y = df_new['Normal/Attack'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train = X_train.copy()
X_test = X_test.copy()
y_train = y_train.copy()
y_test = y_test.copy()

scaler  = MinMaxScaler()
scaler.fit_transform(X_train)

X_train_scaled = scaler.transform(X_train) 
X_test_scaled  = scaler.transform(X_test)

X_train.iloc[:] = X_train_scaled
X_test.iloc[:] = X_test_scaled

In [22]:
#@title Логистическая Регрессия{ run: "auto" }
#@markdown ### Тип Регуляризации 
penalty= 'l1' #@param [ "l2" , "l1", "none"]{type:"string"}
#@markdown ### Константа Регуларизации
regularization = 1.66 #@param {type:"slider", min:0.01, max:3, step:0.05}



LogReg = LogisticRegression(fit_intercept=True,
                            penalty=penalty,solver='saga',
                            C=regularization,
                            max_iter=100000)

LogReg.fit(X_train, y_train)

LogisticRegression(C=1.66, max_iter=100000, penalty='l1', solver='saga')

Код для сохранения обученной модели в RAW

In [23]:
y_predict=LogReg.predict(X_test)

print('test error')
print('ACC: %.4f' % acc(y_predict,y_test))
print('F1 : %.4f' %  f1(y_predict,y_test,average='binary'))

test error
ACC: 0.9652
F1 : 0.8356


In [24]:
target_names = ['normal', 'attack']
print(classification_report(y_test, y_predict, target_names=target_names))

              precision    recall  f1-score   support

      normal       0.96      1.00      0.98     39539
      attack       0.98      0.73      0.84      5453

    accuracy                           0.97     44992
   macro avg       0.97      0.86      0.91     44992
weighted avg       0.97      0.97      0.96     44992



ПОСТРОЕНИЕ ДЕРЕВА РЕШЕНИЙ

Создаем словарь с параметрами, которые будет перебирать GridSearchCV

In [25]:
parametrs = {
    'n_estimators': [2,5],
    'max_features': ['sqrt'],
    'max_depth': [x for x in range(3, 5, 1)],
    'min_samples_split': [15,25],
    'min_samples_leaf': [1,2,5]
}

grid_search_reg = GridSearchCV(RandomForestClassifier(), parametrs, scoring = 'accuracy', cv=5, n_jobs=-1)
grid_search_reg.fit(X_train, y_train)
grid_search_reg.best_params_

{'max_depth': 4,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 15,
 'n_estimators': 2}

In [26]:
predictions = grid_search_reg.best_estimator_.predict(X_test)

Качество модели на метрике accuracy

In [27]:
print('accuracy:', acc(y_test, predictions))

accuracy: 0.9660384068278806


In [28]:
target_names = ['normal', 'attack']
print(classification_report(y_test, y_predict, target_names=target_names))

              precision    recall  f1-score   support

      normal       0.96      1.00      0.98     39539
      attack       0.98      0.73      0.84      5453

    accuracy                           0.97     44992
   macro avg       0.97      0.86      0.91     44992
weighted avg       0.97      0.97      0.96     44992



ПОСТРОЕНИЕ SVM МОДЕЛИ

Обратить внимание, что SVM модель обучается относительно долго, поэтому был выбрана сокращенная обучающая выборка

In [29]:
C = 1.0  # = self._alpha in our algorithm

model1 = svm.SVC(kernel='linear', C=C)

model1.fit(X_train.iloc[:50000,:], y_train.iloc[:50000])
y_predict = model1.predict(X_test.iloc[:10000,:])

print(acc(y_test.iloc[:10000], y_predict))

0.9635


In [30]:
target_names = ['normal', 'attack']
print(classification_report(y_test.iloc[:10000], y_predict, target_names=target_names))

              precision    recall  f1-score   support

      normal       0.96      1.00      0.98      8761
      attack       0.96      0.74      0.83      1239

    accuracy                           0.96     10000
   macro avg       0.96      0.87      0.91     10000
weighted avg       0.96      0.96      0.96     10000



ПОСТРОЕНИЕ SVM ONE-CLASS MODEL (ОБУЧЕНИЕ БЕЗ УЧИТЕЛЯ)

In [31]:
df_new.head()

Unnamed: 0,AIT502,P205,P101,AIT504,AIT202,P201,AIT402,UV401,P501,MV303,...,MV302,MV304,MV101,P102,MV201,FIT401,LIT101,P302,LIT301,Normal/Attack
0,145.1166,2,2,12.03538,8.396437,1,156.0882,2,2,1,...,2,1,2,1,2,1.713517,522.8467,2,956.1651,0
1,145.1166,2,2,12.03538,8.396437,1,156.0882,2,2,1,...,2,1,2,1,2,1.715952,522.886,2,956.1651,0
2,145.1166,2,2,12.03538,8.394514,1,156.0882,2,2,1,...,2,1,2,1,2,1.715952,522.8467,2,956.4855,0
3,145.0141,2,2,12.03538,8.394514,1,156.0882,2,2,1,...,2,1,2,1,2,1.71467,522.9645,2,956.806,0
4,144.8859,2,2,12.03538,8.394514,1,156.0882,2,2,1,...,2,1,2,1,2,1.71467,523.4748,2,957.0864,0


Исправим метки на 1(normal) и на -1(attack)

In [32]:
df_new['Normal/Attack'] = df_new['Normal/Attack'].replace(1, -1)
df_new['Normal/Attack'] = df_new['Normal/Attack'].replace(0, 1)

Обучать будем на обрезанном датасете, чтобы снизить время обучения

In [33]:
X = df_new.iloc[:50000].drop('Normal/Attack', axis=1)
y = df_new['Normal/Attack'].iloc[:50000]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

X_train = X_train.copy()
X_test = X_test.copy()
y_train = y_train.copy()
y_test = y_test.copy()

scaler  = MinMaxScaler()
scaler.fit_transform(X_train)

X_train_scaled = scaler.transform(X_train) 
X_test_scaled  = scaler.transform(X_test)

X_train.iloc[:] = X_train_scaled
X_test.iloc[:] = X_test_scaled

Доля выбросов относительно общего количества наблюдений

In [34]:
outliers = df_new['Normal/Attack'][df_new['Normal/Attack'] == -1]
print(len(outliers))
normal = df_new['Normal/Attack'][df_new['Normal/Attack'] == 1]
print(len(normal))
ratio = round(len(outliers) / len(normal),2)
print(ratio)

54621
395298
0.14


In [35]:
nu = ratio
print("nu", nu)

model = svm.OneClassSVM(nu=nu, kernel='rbf', gamma=0.00005) 
model.fit(X_train)

nu 0.14


OneClassSVM(gamma=5e-05, nu=0.14)

In [36]:
y_predict = model.predict(X_test)
target_names = ['normal', 'attack']
print(classification_report(y_test, y_predict, target_names=target_names))

              precision    recall  f1-score   support

      normal       0.39      0.64      0.49       451
      attack       0.96      0.90      0.93      4549

    accuracy                           0.88      5000
   macro avg       0.68      0.77      0.71      5000
weighted avg       0.91      0.88      0.89      5000



In [41]:
round(roc_auc_score(y_test, y_predict),2)

0.77