In [1]:
import pandas as pd
import numpy as np
import collections
from collections import deque
from pandas import Timestamp
from tcdt import TCDT

In [2]:
# Load the sample request records from CSV and convert the timestamp column
# (read in as text) to pandas datetimes. Building the frame in one chain means
# re-running this cell always starts again from the file on disk.
fname = 'sample.csv'
sampleData = pd.read_csv(fname, encoding='utf-8').assign(
    **{"b'timestamp'": lambda frame: pd.to_datetime(frame["b'timestamp'"])}
)
# Bare last expression: let the notebook render the loaded frame.
sampleData

Unnamed: 0.1,Unnamed: 0,b'timestamp',b'remote_host',b'remote_user',b'request_method',b'request_url_path',b'prefix0',b'prefix1',b'prefix2',b'status'
0,0,2016-12-04 04:24:31,b'104.223.44.126',b'user1',b'GET',b’/project/1.html’,b’/',b’/project/’,b’/project/1.html’,allow
1,1,2016-12-04 04:27:47,b'123.126.113.131',b'user2',b'GET',b’/project/2.html’,b’/',b’/project/’,b’/project/2.html’,deny
2,2,2016-12-04 04:39:28,b'68.180.230.30',b'user1',b'GET',b’/project/3.html’,b’/',b’/project/’,b’/project/3.html’,allow
3,3,2016-12-04 04:50:15,b'40.77.167.73',b'user2',b'GET',b’/project/4.html’,b’/',b’/project/’,b’/project/4.html’,deny
4,4,2016-12-04 04:54:30,b'180.76.15.19',b'user1',b'GET',b’/project/1.html’,b’/',b’/project/’,b’/project/1.html’,allow
5,5,2016-12-04 04:55:11,b'180.76.15.19',b'user2',b'GET',b’/project/2.html’,b’/',b’/project/’,b’/project/2.html’,deny
6,6,2016-12-04 05:03:36,b'23.105.159.84',b'user1',b'GET',b’/project/3.html’,b’/',b’/project/’,b’/project/3.html’,deny
7,7,2016-12-04 05:08:32,b'93.56.74.218',b'user2',b'GET',b’/project/4.html’,b’/',b’/project/’,b’/project/4.html’,deny
8,8,2016-12-04 05:10:22,b'28.15.152.81',b'user1',b'GET',b’/project/1.html’,b’/',b’/project/’,b’/project/3.html’,deny
9,9,2016-12-04 05:11:32,b'193.16.14.28',b'user2',b'GET',b’/project/2.html’,b’/',b’/project/’,b’/project/4.html’,deny


In [3]:
# Feature specification for training: one inner list per feature group.
# The last group lists the URL columns from coarsest prefix to full path,
# which indicates the hierarchy of the URL features.
trainFeatures = [
    ["b'remote_host'"],
    ["b'remote_user'"],
    ["b'request_method'"],
    ["b'prefix0'", "b'prefix1'", "b'prefix2'", "b'request_url_path'"],
]
labelCol = "b'status'"    # column holding the allow/deny label
timeCol = "b'timestamp'"  # column holding the event time

# Train a decision tree on the first ten rows (positions 0-9).
trainData = sampleData.iloc[:10]
tcdt = TCDT().fit(
    trainData,
    trainFeatures,
    labelCol,
    timeCol,
)

# Print a text rendering of the fitted tree.
print(tcdt.export_text())




In [4]:
# Score the rows that were held out of training (position 10 onward).
# These rows carry a change the tree has not seen yet, so the predictions
# printed here are expected to disagree with the labels in the rows.
testData = sampleData.iloc[10:].to_numpy()
for row in testData:
    print(row)
    prediction = tcdt.predict(row)
    print('predict: ', prediction)

[10 Timestamp('2016-12-04 05:12:22') "b'28.11.112.31'" "b'user1'" "b'GET'"
 'b’/project/3.html’' "b’/'" 'b’/project/’' 'b’/project/3.html’' 'allow']
predict:  deny
[11 Timestamp('2016-12-04 05:13:32') "b'19.16.124.29'" "b'user1'" "b'GET'"
 'b’/project/4.html’' "b’/'" 'b’/project/’' 'b’/project/4.html’' 'allow']
predict:  deny


In [5]:
# Feed only the first test row back into the tree, then score every test row
# again to show that the predictions now agree with the labels.
tcdt.update(testData[0])
for row in testData:
    print(row)
    prediction = tcdt.predict(row)
    print('predict: ', prediction)

[10 Timestamp('2016-12-04 05:12:22') "b'28.11.112.31'" "b'user1'" "b'GET'"
 'b’/project/3.html’' "b’/'" 'b’/project/’' 'b’/project/3.html’' 'allow']
predict:  allow
[11 Timestamp('2016-12-04 05:13:32') "b'19.16.124.29'" "b'user1'" "b'GET'"
 'b’/project/4.html’' "b’/'" 'b’/project/’' 'b’/project/4.html’' 'allow']
predict:  allow


In [6]:
# Walk every leaf of the tree and print, for each one, the list of
# (feature, operator, value) conditions stored in `ipath`, followed by the
# rows stored in `timeSeries`.
for leaf in tcdt.get_allleaves():
    print(leaf.ipath)
    print(leaf.timeSeries)

[("b'remote_user'", '==', "b'user1'")]
[[0 Timestamp('2016-12-04 04:24:31') "b'104.223.44.126'" "b'user1'"
  "b'GET'" 'b’/project/1.html’' "b’/'" 'b’/project/’'
  'b’/project/1.html’' 'allow']
 [4 Timestamp('2016-12-04 04:54:30') "b'180.76.15.19'" "b'user1'"
  "b'GET'" 'b’/project/1.html’' "b’/'" 'b’/project/’'
  'b’/project/1.html’' 'allow']
 [6 Timestamp('2016-12-04 05:03:36') "b'23.105.159.84'" "b'user1'"
  "b'GET'" 'b’/project/3.html’' "b’/'" 'b’/project/’'
  'b’/project/3.html’' 'deny']
 [8 Timestamp('2016-12-04 05:10:22') "b'28.15.152.81'" "b'user1'"
  "b'GET'" 'b’/project/1.html’' "b’/'" 'b’/project/’'
  'b’/project/3.html’' 'deny']
 [10 Timestamp('2016-12-04 05:12:22') "b'28.11.112.31'" "b'user1'"
  "b'GET'" 'b’/project/3.html’' "b’/'" 'b’/project/’'
  'b’/project/3.html’' 'allow']]
[("b'remote_user'", '!=', "b'user1'")]
[[1 Timestamp('2016-12-04 04:27:47') "b'123.126.113.131'" "b'user2'"
  "b'GET'" 'b’/project/2.html’' "b’/'" 'b’/project/’'
  'b’/project/2.html’' 'deny']
 [9 T