# Real-Time Monitoring of DoS Attacks: Explore Data of RSDoS Crawler

This notebook performs an initial exploratory analysis as part of the real-time monitoring of DoS attacks. Its aim is to gain first insights into the data format produced by the RSDoS crawler.

*Stefan Scholz* 
*2020-07-22*

In [1]:
import os
import json
import gzip
import pprint
import itertools
import pandas as pd
from datetime import datetime

## Data Format

In [2]:
# read one gzip-compressed JSON dump as a sample of the crawler output
with gzip.open("data-telescope-crawler-dos-202007201318.json.gz", mode="rt", encoding="utf-8") as file:
    dump = json.loads(file.read())

# show the structure only: nesting below the third level is elided as "..."
pp = pprint.PrettyPrinter(depth=3)
pp.pprint(dump)

{'name': 'data-telescope-crawler-dos-20200720131800',
 'targets': [{'hosts': {...},
              'ip': '192.172.226.78',
              'latest_time': '2020-07-20T13:14:13.264622Z',
              'start_time': '2020-07-20T13:12:13.250350Z',
              'target_lines': [...]},
             {'hosts': {...},
              'ip': '194.59.37.35',
              'latest_time': '2020-07-20T13:14:13.264622Z',
              'start_time': '2020-07-20T13:12:33.254245Z',
              'target_lines': [...]},
             {'hosts': {...},
              'ip': '208.80.154.232',
              'latest_time': '2020-07-20T13:14:33.263870Z',
              'start_time': '2020-07-20T13:12:33.254245Z',
              'target_lines': [...]},
             {'hosts': {...},
              'ip': '172.217.23.163',
              'latest_time': '2020-07-20T13:14:13.264622Z',
              'start_time': '2020-07-20T13:13:33.258748Z',
              'target_lines': [...]},
             {'hosts': {...},
              'ip'

## Data Preparation

In [3]:
# get dumps
dumps_dir = "."
# os.listdir returns entries in arbitrary order; sort them so that the order
# in which dumps are loaded (and thus the row order downstream) is reproducible
dumps = [
    os.path.join(dumps_dir, name)
    for name in sorted(os.listdir(dumps_dir))
    if name.endswith(".json.gz")
]

In [4]:
def date_parser(value):
    """
    Convert timestamp strings inside a decoded JSON value to datetimes

    Dicts and lists are walked recursively and updated in place. Every
    non-empty string matching "%Y-%m-%dT%H:%M:%S.%f%z" is replaced by the
    parsed datetime; all other values are left untouched. The (possibly
    mutated) input object is returned, or the parsed datetime when the
    input itself is a matching string, which makes the function usable as
    the `object_hook` of `json.load`.
    """

    # leaf case: try to parse, fall back to the raw string on any mismatch
    if isinstance(value, str):
        if not value:
            return value
        try:
            return datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f%z")
        except (ValueError, AttributeError):
            return value

    # container cases: rewrite the members while keeping the same object
    if isinstance(value, dict):
        for key in value:
            value[key] = date_parser(value[key])
    elif isinstance(value, list):
        value[:] = [date_parser(item) for item in value]

    return value

# get attacks: collect the "targets" entries of every dump into one flat list
attack_records = []
for dump_path in dumps:
    # open each archive in a context manager so its handle is closed right
    # after parsing instead of staying open for the lifetime of the kernel
    with gzip.open(dump_path, "rt", encoding="utf-8") as dump_file:
        # date_parser turns timestamp strings into datetimes while decoding
        attack_records.extend(json.load(dump_file, object_hook=date_parser)["targets"])

# one row per target entry
attacks = pd.DataFrame(attack_records)

# print attacks
attacks.head(5)

Unnamed: 0,ip,start_time,latest_time,target_lines,hosts
0,192.172.226.78,2020-07-20 13:12:13.250350+00:00,2020-07-20 13:14:13.264622+00:00,"[{'target_ip': '192.172.226.78', 'nr_attacker_...",{'192.172.226.78': [{'record': 'WARC/1.1 WARC...
1,194.59.37.35,2020-07-20 13:12:33.254245+00:00,2020-07-20 13:14:13.264622+00:00,"[{'target_ip': '194.59.37.35', 'nr_attacker_ip...",{'194.59.37.35': [{'record': 'WARC/1.1 WARC-T...
2,208.80.154.232,2020-07-20 13:12:33.254245+00:00,2020-07-20 13:14:33.263870+00:00,"[{'target_ip': '208.80.154.232', 'nr_attacker_...",{'208.80.154.232': [{'record': 'WARC/1.1 WARC...
3,172.217.23.163,2020-07-20 13:13:33.258748+00:00,2020-07-20 13:14:13.264622+00:00,"[{'target_ip': '172.217.23.163', 'nr_attacker_...",{'172.217.23.163': [{'record': 'WARC/1.1 WARC...
4,54.187.154.195,2020-07-20 13:13:53.264622+00:00,2020-07-20 13:14:13.264622+00:00,"[{'target_ip': '54.187.154.195', 'nr_attacker_...",{'54.187.154.195': [{'record': 'WARC/1.1 WARC...
