In [17]:
import re
from datetime import datetime

LOG_RE = re.compile(r'''
    (?P<host>\S+)\s+                # host (or -)
    (?P<ident>\S+)\s+               # ident (or -)
    \[(?P<time>[^\]]+)\]\s+         # [timestamp]
    "(?P<method>\S+)\s+(?P<path>[^"]+?)\s+(?P<proto>[^"]+)"\s+  # "METHOD path PROTO"
    (?P<status>\d{3})\s+            # status
    (?P<size>\S+)\s+                # size in bytes or -
    "(?P<referer>[^"]*)"\s+         # "referer" (may be empty)
    (?P<token>\S+)\s+               # custom token (session id or cookie)
    "(?P<agent>[^"]+)"              # "user-agent"
''', re.VERBOSE)

def parse_apache_line(line):
    m = LOG_RE.search(line)
    if not m:
        raise ValueError("line did not match expected format")
    d = m.groupdict()
    # print(m)

    # parse time to datetime
    # example time format: 01/Nov/2019:16:32:09 +0000
    dt = datetime.strptime(d['time'], "%d/%b/%Y:%H:%M:%S %z")

    # normalize size
    size = None if d['size'] == '-' else int(d['size'])

    # Optional minimal UA parsing (heuristic)
    
    parsed = {
        'timestamp': dt,
        'method': d['method'],
        'path': d['path'],
        'protocol': d['proto'],
        'status': int(d['status']),
        'size': size,
        'referer': None if d['referer'] == '' else d['referer'],
        'token': d['token'],
    }
    return parsed

# Example usage:
line1 = '- - [01/Nov/2019:16:32:09 +0000] "POST /storage/store_sess_total_mousemv_db.php HTTP/1.1" 200 449 "https://160.40.52.164/content/big_data.php" htodnmm7tjpihgeuqk64c0gjes "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
line2 = '- - [30/Oct/2019:09:42:02 +0000] "GET / HTTP/1.0" 200 2045 "-" - "-"'
line3 = '- - [30/Oct/2019:11:50:36 +0000] "GET / HTTP/1.1" 200 2770 "-" - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.61"'
print(parse_apache_line(line1))


{'timestamp': datetime.datetime(2019, 11, 1, 16, 32, 9, tzinfo=datetime.timezone.utc), 'method': 'POST', 'path': '/storage/store_sess_total_mousemv_db.php', 'protocol': 'HTTP/1.1', 'status': 200, 'size': 449, 'referer': 'https://160.40.52.164/content/big_data.php', 'token': 'htodnmm7tjpihgeuqk64c0gjes'}


In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("bots.csv")

In [5]:
df.head()

Unnamed: 0,session_id,Total_requests,Total_Bytes,Total_GET_requests,Total_POST_requests,Total_3xx_responses,Total_4xx_responses,per_image_requests,per_css_request,per_js_requests,Depth_SD,Max_requests_per_page,Average_requests_per_page,Max_sequential_request,per_sequential_requests,Session_time,Browsing_speed,SD_request_time
0,97hf7ciplt2k54f5j6109nekn0,157,70114,8,149,0.0,0.0,0.0,0.006369,0.031847,0.112145,143,15.7,157,1.0,277.0,0.036101,0.587992
1,46hm95bnvnuglhj1i1906nc80u,29,24199,16,13,0.0,0.0,0.0,0.137931,0.172414,0.405081,9,2.636364,29,1.0,11.0,1.0,0.665079
2,mtilohhtbsshka38svllisk0am,18,17012,9,9,0.0,0.0,0.0,0.055556,0.277778,0.31427,8,1.8,18,1.0,15.0,0.666667,0.957427
3,66vt430cgmgpus1k96japf46pf,326,200283,43,283,0.0,0.0,0.0,0.003067,0.015337,0.306518,264,16.3,326,1.0,512.0,0.039062,0.813757
4,igirlpfg3oft6i3dl8ah549gqi,319,222003,40,279,0.0,0.0,0.0,0.003135,0.015674,0.335409,262,13.869565,319,1.0,528.0,0.043561,0.799822


In [12]:
print("hii/hf".startswith("gi"))

False


In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
data = np.load("mv_data2/dpmpsl8ejt2b9e49o7ulf939f7.npz")
for key in data:
    print(data[key].shape)
    plt.imshow(data[key], cmap='gray')   # 'gray' colormap forces grayscale display
    plt.axis('off')                 # hide axis for clean view
    plt.show()

In [15]:
def get_test_train_data_from_csv(hum_file = "../parsed_data/human.csv",bot_file = "../parsed_data/bots.csv"):

    df_bots = pd.read_csv(bot_file)
    df_bots["bot"] = 1
    df_bots

    df_hum = pd.read_csv(hum_file)
    df_hum["bot"] = 0
    df_hum
    df_merged = pd.concat([df_bots, df_hum], ignore_index=True, sort=False)
    df_merged

    return df_merged



In [16]:
df = get_test_train_data_from_csv()

In [20]:
df

Unnamed: 0,session_id,Total_requests,Total_Bytes,Total_GET_requests,Total_POST_requests,Total_3xx_responses,Total_4xx_responses,per_image_requests,per_css_request,per_js_requests,Depth_SD,Max_requests_per_page,Average_requests_per_page,Max_sequential_request,per_sequential_requests,Session_time,Browsing_speed,SD_request_time,bot
0,97hf7ciplt2k54f5j6109nekn0,157,70114,8,149,0.0,0.0,0.0,0.006369,0.031847,0.112145,143,15.700000,157,1.0,277.0,0.036101,0.587992,1
1,46hm95bnvnuglhj1i1906nc80u,29,24199,16,13,0.0,0.0,0.0,0.137931,0.172414,0.405081,9,2.636364,29,1.0,11.0,1.000000,0.665079,1
2,mtilohhtbsshka38svllisk0am,18,17012,9,9,0.0,0.0,0.0,0.055556,0.277778,0.314270,8,1.800000,18,1.0,15.0,0.666667,0.957427,1
3,66vt430cgmgpus1k96japf46pf,326,200283,43,283,0.0,0.0,0.0,0.003067,0.015337,0.306518,264,16.300000,326,1.0,512.0,0.039062,0.813757,1
4,igirlpfg3oft6i3dl8ah549gqi,319,222003,40,279,0.0,0.0,0.0,0.003135,0.015674,0.335409,262,13.869565,319,1.0,528.0,0.043561,0.799822,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,a3novvctivmjtfh19pehjqing5,63,28719,7,56,0.0,0.0,0.0,0.015873,0.079365,0.124984,56,7.875000,63,1.0,29.0,0.275862,0.558496,0
810,itltb3mfbii9h9ikttfqkc3ca2,26,19425,11,15,0.0,0.0,0.0,0.038462,0.269231,0.192308,15,2.600000,26,1.0,15.0,0.666667,0.494047,0
811,f8vvptugepi32i09gotgtjb50d,1,1758,1,0,0.0,1.0,0.0,0.000000,0.000000,0.000000,1,1.000000,1,1.0,0.0,0.000000,0.000000,0
812,upgqfre5gn72uks220ag27idir,1,1758,1,0,0.0,1.0,0.0,0.000000,0.000000,0.000000,1,1.000000,1,1.0,0.0,0.000000,0.000000,0


In [38]:
session_cat = {}
for _,row in df[["session_id","bot"]].iterrows():
    session_cat[row[0]] = (row[1])

  session_cat[row[0]] = (row[1])


In [39]:
session_cat

{'97hf7ciplt2k54f5j6109nekn0': 0,
 '46hm95bnvnuglhj1i1906nc80u': 1,
 'mtilohhtbsshka38svllisk0am': 1,
 '66vt430cgmgpus1k96japf46pf': 1,
 'igirlpfg3oft6i3dl8ah549gqi': 1,
 'uqhd4i1019lcou4b0s1l1vdq87': 1,
 'moqasnr56raoh1gq94vqlngt50': 1,
 'o2dutldgeigipiq4cd4t69mgr7': 1,
 '0erfuds6s7kpisp8j1pphnombt': 1,
 'c7rlbj17lequj24elhrere5g13': 1,
 'kn4jl5vb5fita9bo6cu1b6jrk8': 1,
 'm2d7pem8vph8npifco3ogcf8on': 1,
 'qlgu4hfptc004osevv48p07oj3': 1,
 's07af6jn35uipvfnkj6hpglsl9': 1,
 'hs62n8oguaemh0k12m9434sr20': 1,
 '3rfboibsp6vin0c5iqp8r33f5n': 1,
 '59phgq1j4hsfskn63mgd5nrnmb': 1,
 '3uqepgd76f9ecnauehcl4sucbh': 1,
 'ck0vis16184tm6572eohin19d2': 1,
 'pf7tnis955pq27n6sibk32d87k': 1,
 '1ttvuqau08dh4t1cjg50pr2298': 1,
 'bdnrcfsa6rd1fuorjjjmi0o20s': 1,
 'k0371k7nj5dqmfg9fllmll59sn': 1,
 '16r8lv9fabgjk06di8usnmu9ni': 1,
 '7428dqv0aoerlt7uj52th5qac2': 1,
 '8srapvmong540puc7nb8gttr9s': 1,
 'qqvn22bkui88d086v1pnp37cus': 1,
 'bhgacl77180d93ov5a5fbbau9v': 1,
 '7fk29hdm43m3g5jcsn40ilqq72': 1,
 'aj9b410e48gi

In [25]:
df[["session_id","bot"]]

Unnamed: 0,session_id,bot
0,97hf7ciplt2k54f5j6109nekn0,1
1,46hm95bnvnuglhj1i1906nc80u,1
2,mtilohhtbsshka38svllisk0am,1
3,66vt430cgmgpus1k96japf46pf,1
4,igirlpfg3oft6i3dl8ah549gqi,1
...,...,...
809,a3novvctivmjtfh19pehjqing5,0
810,itltb3mfbii9h9ikttfqkc3ca2,0
811,f8vvptugepi32i09gotgtjb50d,0
812,upgqfre5gn72uks220ag27idir,0


In [None]:
all_ses = df[:]["session_id"].to_list()

In [16]:
len(all_ses)

814

In [4]:
import os

In [7]:
s1 = []
s2 = []
for dir in os.listdir("mv_data"):
    s1.append(dir)

for dir in os.listdir("mv_data2"):
    s2.append(dir)

In [10]:
cnt = 0
for x in s1:
    if x in s2:
        cnt+=1
    

In [40]:
len(s2)

299

In [4]:
import numpy as np
a = [1,2,3]
b = np.array([[x*2] for x in a])

In [5]:
b.shape

(3, 1)

In [9]:
with open("dump.txt",'w') as fd:
    fd.write(str(a)[1:-1])