In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import sys
sys.path.append('../')
import utils.preprocess as preprocess

In [2]:
pre = preprocess.preprocess("D:/DataSet/CICIDS2017/CSV/Wednesday-workingHours.pcap_ISCX.csv", labelColumnName='Label')
X_train, X_test, y_train, y_test = pre.preprocessing()

In [3]:
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=10000, n_jobs=-1, random_state=0)

In [6]:
import utils.feature_list as fl
features = fl.get_feature()

In [7]:
for feature in zip(features, clf.feature_importances_):
    print(feature)

('Flow Duration', 0.005371179224643753)
('Total Fwd Packets', 0.006737741835560833)
('Total Backward Packets', 0.018699233417685986)
('Total Length of Fwd Packets', 0.016655213521182953)
('Total Length of Bwd Packets', 0.02644089970932599)
('Fwd Packet Length Max', 0.015469126608426408)
('Fwd Packet Length Min', 0.007004513274965313)
('Fwd Packet Length Mean', 0.022398193404226555)
('Fwd Packet Length Std', 0.004794739077204929)
('Bwd Packet Length Max', 0.047728570646447775)
('Bwd Packet Length Min', 0.005052113084291457)
('Bwd Packet Length Mean', 0.058593364614585176)
('Bwd Packet Length Std', 0.06628842157583242)
('Flow Bytes/s', 0.0037542638543521015)
('Flow Packets/s', 0.005631269835642492)
('Flow IAT Mean', 0.014762534471212462)
('Flow IAT Std', 0.008409439734707454)
('Flow IAT Max', 0.014478647566564876)
('Flow IAT Min', 0.004647686091298986)
('Fwd IAT Total', 0.005951636441770657)
('Fwd IAT Mean', 0.010605176200019146)
('Fwd IAT Std', 0.016349118898414637)
('Fwd IAT Max', 0.01

總計為 100%

### To see which features are important we can use get_support method on the fitted model.

In [10]:
sfm = SelectFromModel(clf, threshold=0.01)
sfm.fit(X_train, y_train)
for feature_list_index in sfm.get_support(indices=True):
    print(features[feature_list_index])

SelectFromModel(estimator=RandomForestClassifier(n_estimators=10000, n_jobs=-1,
                                                 random_state=0),
                threshold=0.01)

In [12]:
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [13]:
# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(n_estimators=10000, n_jobs=-1, random_state=0)

In [15]:
from sklearn.metrics import accuracy_score

In [16]:
# Apply The Full Featured Classifier To The Test Data
y_pred = clf.predict(X_test)

# View The Accuracy Of Our Full Feature (4 Features) Model
accuracy_score(y_test, y_pred)

0.9990406032147024

In [17]:
# Apply The Full Featured Classifier To The Test Data
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature (2 Features) Model
accuracy_score(y_test, y_important_pred)

0.9993154053089837

In [25]:
feature_list = []

In [26]:
import numpy as np
np.set_printoptions(precision=4)
for i in np.arange(0.0001, 0.1, 0.005):
    temp = []
    temp.append(i)
    sfm = SelectFromModel(clf, threshold=i)
    sfm.fit(X_train, y_train)
    for feature_list_index in sfm.get_support(indices=True):
        temp.append(feature_list_index)

KeyboardInterrupt: 

https://chrisalbon.com/machine_learning/trees_and_forests/feature_selection_using_random_forest/