In [1]:
import re
from datetime import datetime
from numpy import std
import os


class PARSER:
    """Parser for Apache access-log lines that carry an extra session-token field.

    Expected layout of a line::

        host ident [timestamp] "METHOD path PROTO" status size "referer" token "user-agent"
    """

    LOG_RE = re.compile(r'''
        (?P<host>\S+)\s+                # host (or -)
        (?P<ident>\S+)\s+               # ident (or -)
        \[(?P<time>[^\]]+)\]\s+         # [timestamp]
        "(?P<method>\S+)\s+(?P<path>[^"]+?)\s+(?P<proto>[^"]+)"\s+  # "METHOD path PROTO"
        (?P<status>\d{3})\s+            # status
        (?P<size>\S+)\s+                # size in bytes or -
        "(?P<referer>[^"]*)"\s+         # "referer" (may be empty)
        (?P<token>\S+)\s+               # custom token (session id or cookie)
        "(?P<agent>[^"]+)"              # "user-agent"
    ''', re.VERBOSE)

    def parse_apache_line(self,line):
        """Turn one access-log line into a dict of typed fields.

        Returns a dict with the keys ``timestamp`` (timezone-aware datetime),
        ``method``, ``path``, ``protocol``, ``status`` (int), ``size`` (int, or
        None when the log has ``-``), ``referer`` (None when empty) and
        ``session`` (the custom token field).

        When the line does not fit ``LOG_RE`` the line is printed with a
        warning banner and None is returned.
        """
        match = self.LOG_RE.search(line)
        if match is None:
            print("-------line did not match expected format ---------\n",line)
            return None

        fields = match.groupdict()

        # Timestamps look like: 01/Nov/2019:16:32:09 +0000
        timestamp = datetime.strptime(fields['time'], "%d/%b/%Y:%H:%M:%S %z")

        # A "-" in the size column means no byte count was logged.
        raw_size = fields['size']
        if raw_size == '-':
            byte_count = None
        else:
            byte_count = int(raw_size)

        # Only an empty referer is normalised to None; a literal "-" is kept.
        referer = fields['referer'] or None

        return {
            'timestamp': timestamp,
            'method': fields['method'],
            'path': fields['path'],
            'protocol': fields['proto'],
            'status': int(fields['status']),
            'size': byte_count,
            'referer': referer,
            'session': fields['token'],
        }
    

            

# Example usage: parse one sample access-log line and print the resulting dict.
# `line` carries a session token; `line2` and `line3` have "-" in the token
# position and are defined here but not passed to the parser in this cell.
line = '- - [01/Nov/2019:16:32:09 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 449 "https://160.40.52.164/content/big_data.php" htodnmm7tjpihgeuqk64c0gjes "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
line2 = '- - [30/Oct/2019:09:42:02 +0000] "GET / HTTP/1.0" 200 2045 "-" - "-"'
line3 = '- - [30/Oct/2019:11:50:36 +0000] "GET / HTTP/1.1" 200 2770 "-" - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
parser = PARSER()
print(parser.parse_apache_line(line))


{'timestamp': datetime.datetime(2019, 11, 1, 16, 32, 9, tzinfo=datetime.timezone.utc), 'method': 'POST', 'path': '/storage/store_sess_total_mousemv_db.php', 'protocol': 'HTTP/1.1', 'status': 200, 'size': 449, 'referer': 'https://160.40.52.164/content/big_data.php', 'session': 'htodnmm7tjpihgeuqk64c0gjes'}


In [2]:
class LOG_FEATURES:
    """
    Incrementally builds per-session traffic features from access-log lines.

    Every call to ``add_log`` parses one line, looks up the session it belongs
    to (the custom token field of the log) and refreshes that session's entry
    in ``self.session_features``.

    Per-session features:
        Total_requests, Total_Bytes, Total_GET_requests, Total_POST_requests
        Total_3xx_responses, Total_4xx_responses
        image_requests, css_file_request, js_requests
                                   -> raw counts; the CSV row reports them (and the
                                      3xx/4xx counts) as a share of Total_requests
        Depth_SD                   -> standard deviation of the requested pages' depth
                                      (number of '/' in the URL path)
        Max_requests_per_page      -> highest number of requests to one path
        Average_requests_per_page  -> requests divided by distinct paths
        Max_sequential_request     -> longest run of requests in which every path
                                      starts with the previously requested path
        per_sequential_requests    -> share of requests whose path starts with the
                                      previously requested path
        Session_time               -> seconds between the first request seen for the
                                      session and the most recently added one
        Browsing_speed             -> distinct paths / Session_time (0 if the time is 0)
        SD_inter_request_time      -> standard deviation of the seconds between
                                      successive requests

    Not implemented here: Total_HEAD_requests and the HTML-to-image ratio.

    Lines are assumed to be fed in chronological order per session: Session_time,
    the inter-request gaps and the sequential-request metrics all depend on the
    order in which ``add_log`` is called.
    """
    feature_names = [
        "Total_requests","Total_Bytes","Total_GET_requests","Total_POST_requests",
        "Total_3xx_responses","Total_4xx_responses","image_requests","css_file_request",
        "js_requests","Depth_SD","Max_requests_per_page","Average_requests_per_page",
        "Max_sequential_request","per_sequential_requests","Session_time","Browsing_speed","SD_inter_request_time"
    ]

    # Path suffixes counted as image requests by is_image().
    IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp", ".svg")

    def __init__(self, parser=None):
        """
        Args:
            parser: object exposing ``parse_apache_line(line)`` that returns a dict
                with the keys used by ``add_log`` (or None for unparsable lines).
                Defaults to a fresh ``PARSER()``.
        """
        # Per-instance state: a class-level dict would be shared by every instance,
        # so re-creating the extractor would silently keep the old sessions.
        self.session_features = {}
        self.parser = parser if parser is not None else PARSER()

    def init_features(self):
        """Return a fresh feature record: every named feature at 0 plus empty history lists."""
        new = {f_name: 0 for f_name in self.feature_names}
        new["request_path"] = []
        new["requests_timestamps"] = []
        return new

    def is_image(self,path):
        """Return True when ``path`` ends with one of ``IMAGE_EXTENSIONS``."""
        # str.endswith accepts a tuple, so every extension is actually checked.
        return path.endswith(self.IMAGE_EXTENSIONS)

    def calculate_Depth_SD(self,session_id):
        """Standard deviation of the '/' count of every path requested in the session."""
        depth_list = [path.count('/') for path in self.session_features[session_id]["request_path"]]
        return std(depth_list)

    def inter_request_time_SD(self,session_id):
        """
        Standard deviation of the seconds between successive requests of the session.

        Only real gaps are used (no artificial leading 0). Returns 0.0 when the
        session has fewer than two requests, i.e. when there is no gap at all.
        """
        time_line = self.session_features[session_id]["requests_timestamps"]
        gaps = [
            (later - earlier).total_seconds()
            for earlier, later in zip(time_line, time_line[1:])
        ]
        if not gaps:
            return 0.0
        return std(gaps)

    def get_request_stats(self,session_id):
        """
        Compute the path-based statistics of a session.

        A request is "sequential" when its path starts with the path of the request
        right before it. The first request of the session has no predecessor: it
        opens a run of length 1 but is not itself counted as sequential, exactly like
        any later request that breaks a run.

        Returns:
            dict with Max_requests_per_page, Average_requests_per_page,
            Max_sequential_request, cnt_consecutive_path, per_sequential_requests
            and Browsing_speed.
        """
        session = self.session_features[session_id]
        paths = session["request_path"]

        request_path_cnt = {}
        max_consecutive_len = 1
        cur_consecutive_len = 0
        cnt_consecutive_path = 0
        prev_path = None
        for path in paths:
            request_path_cnt[path] = request_path_cnt.get(path, 0) + 1

            if prev_path is not None and path.startswith(prev_path):
                cur_consecutive_len += 1
                cnt_consecutive_path += 1
                max_consecutive_len = max(max_consecutive_len,cur_consecutive_len)
            else:
                cur_consecutive_len = 1
            # Without this update every path was compared against "" and always matched.
            prev_path = path

        total_req = len(paths)
        tot_pages = len(request_path_cnt)

        stats = {}
        stats["Max_requests_per_page"] = max(request_path_cnt.values(), default=0)
        stats["Average_requests_per_page"] = total_req/tot_pages if tot_pages else 0
        stats["Max_sequential_request"] = max_consecutive_len
        stats["cnt_consecutive_path"] = cnt_consecutive_path
        stats["per_sequential_requests"] = cnt_consecutive_path/total_req if total_req else 0
        if session["Session_time"] != 0:
            stats["Browsing_speed"] = tot_pages / session["Session_time"]
        else:
            stats["Browsing_speed"] = 0
        return stats


    def add_log(self,str_log):
        """
        Parse one log line and fold it into the features of its session.

        Lines the parser rejects (it returns None) and lines whose session token
        is '-' are ignored. The parsed record is expected to look like:

            {
             'timestamp': datetime.datetime(2019, 10, 30, 11, 50, 36, tzinfo=datetime.timezone.utc),
             'method': 'GET',
             'path': '/',
             'protocol': 'HTTP/1.1',
             'status': 200,
             'size': 2770,            # None when the log has '-'
             'referer': 'https://160.40.52.164/content/big_data.php',
             'session': 'htodnmm7tjpihgeuqk64c0gjes'
            }
        """
        new_log_feature = self.parser.parse_apache_line(str_log)

        if new_log_feature is None or new_log_feature["session"] == '-':
            return

        session_id = new_log_feature["session"]
        if session_id not in self.session_features:
            self.session_features[session_id] =  self.init_features()
            self.session_features[session_id]["session_start"] = new_log_feature["timestamp"]
        session = self.session_features[session_id]

        method = new_log_feature["method"]
        status_class = new_log_feature["status"]//100
        path = new_log_feature["path"]

        session["Total_requests"]      += 1
        # size is None when the log has '-' for the byte count; count it as 0 bytes.
        session["Total_Bytes"]         += new_log_feature["size"] or 0
        session["Total_GET_requests"]  += 1 if method=="GET" else 0
        session["Total_POST_requests"] += 1 if method=="POST" else 0
        session["Total_3xx_responses"] += 1 if status_class == 3 else 0
        session["Total_4xx_responses"] += 1 if status_class == 4 else 0
        session["image_requests"]      += 1 if self.is_image(path) else 0
        session["css_file_request"]    += 1 if path.endswith(".css") else 0
        session["js_requests"]         += 1 if path.endswith(".js") else 0

        session["request_path"].append(path)
        session["requests_timestamps"].append(new_log_feature["timestamp"])

        # Derived features are refreshed after every line so the record is always current.
        # Session_time must be set before get_request_stats, which reads it for Browsing_speed.
        session["Depth_SD"] = self.calculate_Depth_SD(session_id)
        session["Session_time"] = (new_log_feature["timestamp"] - session["session_start"]).total_seconds()
        session["SD_inter_request_time"] = self.inter_request_time_SD(session_id)
        session.update( self.get_request_stats(session_id) )

    def get_seseion_features_as_csv(self,session_id):
        """
        Render one session as a CSV row terminated by a newline.

        Column order: session_id, Total_requests, Total_Bytes, Total_GET_requests,
        Total_POST_requests, then the 3xx / 4xx / image / css / js counts each divided
        by Total_requests, Depth_SD, Max_requests_per_page, Average_requests_per_page,
        Max_sequential_request, sequential requests divided by Total_requests,
        Session_time, Browsing_speed, SD_inter_request_time.
        """
        features = self.session_features[session_id]
        total = features["Total_requests"]
        columns = [
            session_id,
            features["Total_requests"],
            features["Total_Bytes"],
            features["Total_GET_requests"],
            features["Total_POST_requests"],
            features["Total_3xx_responses"]/total,
            features["Total_4xx_responses"]/total,
            features["image_requests"]/total,
            features["css_file_request"]/total,
            features["js_requests"]/total,
            features["Depth_SD"],
            features["Max_requests_per_page"],
            features["Average_requests_per_page"],
            features["Max_sequential_request"],
            features["cnt_consecutive_path"]/total,
            features["Session_time"],
            features["Browsing_speed"],
            features["SD_inter_request_time"],
        ]
        return ",".join(str(value) for value in columns) + "\n"

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    get_session_features_as_csv = get_seseion_features_as_csv
            


In [3]:
# Directories holding the bot web logs (one entry per dataset phase).
file_paths = [
    "../dataset/phase1/data/web_logs/bots",
    "../dataset/phase2/data/web_logs/bots",
]

# Feature accumulator that the ingestion cell feeds line by line.
log_features = LOG_FEATURES()


In [4]:

# Feed every regular file found directly under each log directory into log_features.
for path in file_paths:
    # os.listdir returns entries in arbitrary order, while Session_time and the
    # sequential-request features depend on the order lines are added in; sorting
    # makes a re-run reproduce the same features.
    files = sorted(f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f)))
    for file in files:
        file_path = os.path.join(path, file)
        # Access logs can contain arbitrary bytes (paths, user agents); replace
        # undecodable bytes instead of aborting the whole ingestion.
        with open(file_path,'r', encoding="utf-8", errors="replace") as fd:
            for line in fd:
                log_features.add_log(line)
        print("done for ",file_path)

# log = """- - [25/Nov/2020:08:04:44 +0000] "GET / HTTP/1.1" 200 1147 "-" 0d5d2435pbn4mi4gj9spju5gia "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
# - - [25/Nov/2020:08:04:45 +0000] "GET /js/mousemove_onclick.js HTTP/1.1" 200 294 "http://192.168.1.120/" 0d5d2435pbn4mi4gj9spju5gia "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
# - - [25/Nov/2020:08:04:45 +0000] "GET /js/fingerprint_browser.js HTTP/1.1" 200 944 "http://192.168.1.120/" 0d5d2435pbn4mi4gj9spju5gia "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"
# - - [25/Nov/2020:08:14:04 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 316 "http://192.168.1.120/" fas9mgeiptj5suoafcgt3cb6a6 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 OPR/72.0.3815.320"
# - - [25/Nov/2020:08:14:05 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 316 "http://192.168.1.120/" fas9mgeiptj5suoafcgt3cb6a6 "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36 OPR/72.0.3815.320"
# """
# log_list = log.splitlines()


-------line did not match expected format ---------
 - - [01/Nov/2019:11:28:02 +0000] "-" 408 1511 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:11:53:30 +0000] "-" 408 1511 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:12:05:40 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:12:24:52 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:12:27:53 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:12:40:08 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:13:29:32 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:13:42:36 +0000] "-" 408 156 "-" - "-"

-------line did not match expected format ---------
 - - [01/Nov/2019:13:51:11 +0000] "-" 408 156 "-" - "-"

-------line did n

In [5]:
# print(log_features.session_features)

In [6]:
# for ses in log_features.session_features:
#     print(ses)

In [5]:
# Dump every collected session to bots.csv: one header line, then one row per session.
# NOTE: the 3xx/4xx columns hold per-request ratios (see get_seseion_features_as_csv),
# even though the header names read like raw counts.
with open("bots.csv",'w') as fd:
    features = """"session_id","Total_requests","Total_Bytes","Total_GET_requests","Total_POST_requests","Total_3xx_responses","Total_4xx_responses","per_image_requests","per_css_request","per_js_requests","Depth_SD","Max_requests_per_page","Average_requests_per_page","Max_sequential_request","per_sequential_requests","Session_time","Browsing_speed","SD_request_time"\n"""
    fd.write(features)

    fd.writelines(
        log_features.get_seseion_features_as_csv(ses)
        for ses in log_features.session_features
    )

    

In [8]:
# log_features.get_seseion_features_as_csv("0d5d2435pbn4mi4gj9spju5gia")