# Основной код

In [7]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, train_test_split
import matplotlib.pyplot as plt

<h3>Блок функций</h3>

In [8]:
def data_preparation(path='train.csv', random_state=None):
    """Load the training CSV, integer-encode the categorical columns and shuffle the rows.

    Parameters
    ----------
    path : str or path-like, default 'train.csv'
        Location of the ';'-separated CSV file to read.
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps
        the original behaviour (a different row order on every call); pass an
        integer to make the shuffle reproducible.

    Returns
    -------
    pandas.DataFrame
        The loaded data with missing values replaced by ``'delivered'``, the
        'Error', 'Source' and 'Destination' columns replaced by 1-based integer
        codes (assigned in order of first appearance within each column), rows
        shuffled and the index reset to 0..n-1.
    """
    # Builtin `object` replaces the `np.object` alias, which NumPy 1.24 removed.
    column_types = {
        'Source': object, 'Destination': object, 'Protocol': object,
        'Length': np.int64, 'Source Port': np.int64, 'Dest Port': np.int64,
        'Delta time': np.float64, 'Error': object,
    }
    train_data = pd.read_csv(path, sep=';', dtype=column_types)

    # Missing values in every column become the literal 'delivered'.
    # NOTE(review): presumably only 'Error' is expected to contain gaps - confirm.
    train_data = train_data.fillna('delivered')

    # Each column gets its own mapping, so codes are independent per column:
    # the same raw value may receive different codes in 'Source' and 'Destination'.
    for column in ('Error', 'Source', 'Destination'):
        codes = {
            value: code
            for code, value in enumerate(train_data[column].unique(), start=1)
        }
        train_data[column] = train_data[column].map(codes)

    # Shuffle all rows and discard the old index.
    return train_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

<h3>Подготовка и обработка данных</h3>
<p><em>Надо понять, как превратить признаки в вектор и передать их в модель.
    Возможно, нужно сделать словарь, как для 'Error'.</em></p>

In [9]:
# Build the prepared training frame by calling data_preparation().
train_db = data_preparation()
# Preview the first rows of the prepared frame.
train_db.head()

Unnamed: 0,Source,Destination,Protocol,Length,Source Port,Dest Port,Delta time,Error
0,4,4,TCP,1518,36384,445,6.3e-05,1
1,4,4,TCP,2966,36384,445,6e-06,4
2,4,4,TCP,2966,36384,445,3e-06,4
3,2,2,TCP,70,445,36384,1.2e-05,1
4,4,4,TCP,2966,36384,445,3e-06,4


In [10]:
# Columns used as model inputs.
feature_columns = ['Source', 'Destination', 'Length', 'Source Port', 'Dest Port', 'Delta time']
X = train_db[feature_columns]
# Target: the 'Error' column.
Y = train_db['Error']

In [11]:
# Split into 70% train / 30% test; random_state=6 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=6)
# Show the shapes of the training features and training labels.
X_train.shape, y_train.shape

((188388, 6), (188388,))

<h3>Создание модели</h3>

In [36]:
# Baseline decision tree with default hyperparameters and a fixed seed.
first_tree = DecisionTreeClassifier(random_state=6)

In [37]:
# Mean score of the baseline tree over 5 cross-validation folds of the training split.
cross_val_score(first_tree, X_train, y_train, cv=5).mean()

0.9646739749618243

<h3>Подбор гиперпараметров</h3>

In [14]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Hyperparameter grid: candidate max_depth values 1 through 10.
tree_params = {'max_depth': np.arange(1,11)}

In [48]:
# Grid search over tree_params for first_tree, scored with 5-fold cross-validation.
tree_grid = GridSearchCV(first_tree, tree_params, cv=5)

In [49]:
%%time
# Run the grid search on the training split; the %%time cell magic above reports the cost.
tree_grid.fit(X_train, y_train);

CPU times: total: 3.55 s
Wall time: 3.55 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=6),
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])})

In [50]:
# Best parameter combination found by the search and its mean cross-validated score.
tree_grid.best_params_, tree_grid.best_score_

({'max_depth': 7}, 0.9666008522157707)

<h3>Оценка на отложенной выборке</h3>

In [51]:
# Predict labels for the held-out test split with the fitted grid search.
tree_valid_pred = tree_grid.predict(X_test)

In [52]:
# Score the fitted grid search on the held-out split.
# The original cell referenced `tree_greed`, a name never defined in this
# notebook, so it raised NameError on a fresh kernel.
tree_grid.score(X_test, y_test)

0.9675617429215487

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
# Accuracy of the held-out predictions against the true test labels.
accuracy_score(y_test, tree_valid_pred)

0.9675617429215487

<h3>Тут будут графики</h3>

In [55]:
from sklearn.tree import export_graphviz

In [65]:
# Write the best tree found by the grid search to a Graphviz .dot file.
# feature_names is passed as a plain list rather than a pandas Index.
# NOTE(review): some scikit-learn releases appear to validate this argument
# strictly and reject an Index - confirm against the installed version.
export_graphviz(
    tree_grid.best_estimator_,
    out_file='error_tree.dot',
    feature_names=list(X.columns),
    filled=True,
)

<img src='error_tree.png'>