In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
# Column labels, in file order, for the header-less CSV that is read with
# `names=names`; the final entry, 'target', is the class label column.
names = [
    # columns 0-8
    'duration', 'protocol_type', 'service', 'flag',
    'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent',
    # columns 9-21
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login',
    # columns 22-30
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate',
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate',
    # columns 31-40
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # column 41: class label
    'target',
]

# Load the header-less CSV, keep only the categorical features (plus the
# label) and cast every value to str so that get_dummies treats each distinct
# value as its own category.  Doing this in one chain builds a fresh frame
# instead of assigning column-by-column into a selection of the raw frame.
data = (
    pd.read_csv('corrected', names=names)
    .loc[:, [
        'protocol_type',
        'flag',
        'land',
        'urgent',
        'hot',
        'num_failed_logins',
        'logged_in',
        'num_compromised',
        'root_shell',
        'su_attempted',
        'num_root',
        'num_file_creations',
        'num_shells',
        'num_access_files',
        'num_outbound_cmds',
        'is_host_login',
        'is_guest_login',
        'target'
    ]]
    .astype(str)
)

# One-hot encode every feature column (all columns except the last one).
data_one_hot = pd.get_dummies(data.iloc[:, :-1])
# Binary label: 0.0 for 'normal.' records, 1.0 for every other label.
data_one_hot['target'] = (data.iloc[:, -1] != 'normal.').astype(float)

# Small dataset (alternative input, left commented out)
# data_small = pd.read_csv('train.csv')
# data_one_hot_small = pd.get_dummies(data_small.iloc[:, :-1])
# data_one_hot_small['TARGET'] = data_small.iloc[:, -1].replace({'T':1, 'F':0})

In [None]:
class Ant_model:
    """Ant-colony search for a subset of binary (one-hot) feature columns.

    Every column of the training frame except the last one is a graph node
    that carries a pheromone value; the last column is the target.  An ant's
    route is a list of column positions, and the prediction of a route is the
    element-wise product of the chosen columns (a logical AND for 0/1 data).

    Parameters
    ----------
    alpha : exponent applied to a node's pheromone when weighting it.
    beta : exponent applied to a node's single-feature accuracy.
    p : evaporation rate; pheromone is multiplied by ``1 - p`` per iteration.
    m : number of ants.
    t_0 : initial pheromone on every node.
    omega : weight of the ``accuracy - 0.5`` term of the pheromone deposit.
    phi : weight of the ``n_features / route_length`` term of the deposit.
    T : number of colony iterations.
    eps : slack of the stopping rule; an ant keeps a candidate feature while
        ``new_accuracy * eps > current_accuracy``.
    random_state : optional seed.  ``None`` (the default) draws from NumPy's
        global random state, so ``np.random.seed`` keeps working; any other
        value seeds a private ``np.random.RandomState``.
    """

    def __init__(self, alpha=1, beta=1, p=0.9, m=200, t_0=1, omega=0.7, phi=0.3, T=5, eps=1.2,
                 random_state=None):
        self._alpha = alpha
        self._beta = beta
        self._p = p
        self._m = m
        self._t_0 = t_0
        self._omega = omega
        self._phi = phi
        self._T = T
        self._eps = eps
        self._random_state = random_state

    @staticmethod
    def _column(frame, position):
        """Return the column at integer `position` of `frame` as a float array."""
        return frame.iloc[:, position].to_numpy(dtype=float)

    def fit(self, train):
        """Run the colony on `train` and keep the routes of the last iteration.

        `train` is a DataFrame whose last column is the target and whose other
        columns are the candidate features.  After the call ``self._S`` holds
        the routes of the final iteration sorted by training accuracy (best
        first); the column names of the best route are printed.

        Raises
        ------
        ValueError
            If `train` has no feature column, or if ``m`` or ``T`` is below 1.
        """
        features = train.columns[:-1]
        f_len = len(features)
        if f_len < 1:
            raise ValueError('train must contain at least one feature column and a target column')
        if self._m < 1 or self._T < 1:
            raise ValueError('m and T must both be at least 1')

        rng = np.random if self._random_state is None else np.random.RandomState(self._random_state)
        target = train.iloc[:, -1].to_numpy()

        # Heuristic information: accuracy of each feature used alone as the prediction.
        accuracy_array = np.array(
            [accuracy_score(target, self._column(train, j)) for j in range(f_len)]
        )
        heuristic = accuracy_array ** self._beta

        # Every node starts with the same amount of pheromone.
        nodes = np.full(f_len, float(self._t_0))

        # Random start node per ant.  The start node is only excluded from the
        # ant's candidates (it is never added to the route), and the same
        # start nodes are reused in every iteration.
        positions = rng.choice(np.arange(0, f_len), self._m)

        # Prediction and accuracy of an empty route (all ones).
        empty_prediction = np.ones(len(train))
        empty_accuracy = accuracy_score(target, empty_prediction)

        for t in range(self._T):
            S = np.empty(self._m, dtype=object)   # route of each ant
            S_perfomance = np.zeros(self._m)      # training accuracy of each route

            # Selection weight of every node; constant within one iteration.
            base_weights = (nodes ** self._alpha) * heuristic

            for i in range(self._m):
                route = []
                prediction = empty_prediction
                current_accuracy = empty_accuracy

                # A route can hold at most every node except the start node.
                while len(route) < f_len - 1:
                    weights = base_weights.copy()
                    weights[route + [positions[i]]] = 0.0
                    if not weights.any():
                        # No selectable node is left (all weights are zero).
                        break

                    next_node = rng.choice(np.arange(f_len), p=weights / weights.sum())
                    candidate = prediction * self._column(train, next_node)
                    next_accuracy = accuracy_score(target, candidate)

                    # Stopping rule: the first feature is always taken; later
                    # ones only while they pass the eps-relaxed comparison
                    # (eps > 1 lets the route grow instead of stopping at once).
                    keeps_up = next_accuracy * self._eps > current_accuracy
                    if route and not keeps_up:
                        break

                    route.append(next_node)
                    prediction = candidate
                    current_accuracy = next_accuracy

                S[i] = route
                S_perfomance[i] = current_accuracy

            # Pheromone update: evaporation, then one deposit per visited node.
            # The deposit uses the final route length of the ant.
            nodes = (1 - self._p) * nodes
            for k in range(self._m):
                if not S[k]:
                    continue
                deposit = self._omega * (S_perfomance[k] - 0.5) + self._phi * (f_len / len(S[k]))
                for node in S[k]:
                    nodes[node] += deposit

        # Sort the routes of the last iteration by accuracy, best first
        # (stable sort: ties keep ant order).
        order = np.argsort(-S_perfomance, kind='stable')
        self._S = S[order]
        # Show the best route.
        print(features[np.asarray(self._S[0], dtype=int)])

    def predict(self, test):
        """Return the accuracy of the best route found by `fit` on `test`.

        Despite its name this method returns a score, not predictions: the
        columns of the best route are multiplied element-wise and the product
        is compared with the last column of `test`.  `test` must have the same
        column layout as the frame passed to `fit`.
        """
        prediction = np.ones(len(test))
        for item in self._S[0]:
            prediction = prediction * self._column(test, item)

        return accuracy_score(test.iloc[:, -1].to_numpy(), prediction)

In [107]:
# Work on the first 50,000 rows only.  Note that test_size=0.8 keeps just 20%
# of them for training and holds the remaining 80% out for evaluation.
train_one_hot, test_one_hot = train_test_split(
    data_one_hot.iloc[:50000], test_size=0.8, random_state=42
)

In [108]:
# The ant search draws from NumPy's global random generator; seed it so that
# a fresh Restart & Run All reproduces the same route and score.
np.random.seed(42)

ant_model = Ant_model(m=30)
ant_model.fit(train_one_hot)
accuracy = ant_model.predict(test_one_hot)
print(np.round(accuracy, 2))

Index(['urgent_0', 'root_shell_0', 'num_failed_logins_0', 'flag_SF',
       'su_attempted_0', 'num_compromised_0', 'is_host_login_0', 'logged_in_0',
       'num_shells_0', 'num_access_files_0', 'land_0', 'num_file_creations_0',
       'is_guest_login_0', 'num_outbound_cmds_0', 'num_root_0',
       'protocol_type_icmp', 'hot_0'],
      dtype='object')
0.95


In [109]:
# Baseline for comparison: a linear model trained on all one-hot features.
clf = RidgeClassifier()
clf.fit(train_one_hot.iloc[:, :-1], train_one_hot.iloc[:, -1])
pred = clf.predict(test_one_hot.iloc[:, :-1])
# accuracy_score expects (y_true, y_pred)
accuracy_score(test_one_hot.iloc[:, -1], pred)

0.976575