# Merge and clean

This notebook cleans datasets (ECML, HTTPParams and XSS) and finally merges them to one bigger dataset.

In [1]:
import json

In [2]:
with open('ECML.json', 'r') as f:
    ecml = json.load(f)

In [3]:
ecml

[{'type': 'Valid',
  'request': 'crTeYreti=wsn&et3tf6shoV=tdsviee+y+fum%24oh3%3Bore&sAib5hfAvhEpC=tcilbrr+Lne',
  'interval': None},
 {'type': 'Valid', 'request': '*;q=0.7', 'interval': None},
 {'type': 'Valid', 'request': 'nhgiopIe/7.8.4.7.0', 'interval': None},
 {'type': 'Valid',
  'request': '/dyylkL.XD9cPu/4Ot0ta/ts6xNrP1/hssh/a2cuerht/sE8j00jiF@UbfX.77G/s0F/gt/5TsrstiuyvldFatrury.png',
  'interval': None},
 {'type': 'Valid',
  'request': '/2m6VLb1r37jSPC/cWVv/Mbar/oqrd0/msc/etceebwgi/iO/m2zXMv/r@i98toR3YBbLcn.css',
  'interval': None},
 {'type': 'Valid',
  'request': 'seivste3c=804252514&Gxnc1atstirc3eR=eA&ioe9ogl=%3B%7E&J7etc88Op=7%24gr+stx&HWlTW=c2.2mwQ&anE5=TAistyle&usi=la+&md=dNt%40&zs=%5Dt%7ChmEh',
  'interval': None},
 {'type': 'Valid', 'request': '/Soqdee/lDF.jpg', 'interval': None},
 {'type': 'Valid', 'request': '*;q=0.3', 'interval': None},
 {'type': 'Valid',
  'request': '/i4tBn0QTpvx/l6NstqM5eOe/rh-ELinsertPaTUC1bgsound/rh/dFwpV/thnosoZt6otNteic.css',
  'interval': None

In [5]:
with open('HTTPParams.json', 'r') as f:
    custom = json.load(f)

In [6]:
custom

[{'payload': 'c/ caridad s/n', 'attack_type': 'norm'},
 {'payload': 'campello, el', 'attack_type': 'norm'},
 {'payload': '1442431887503330', 'attack_type': 'norm'},
 {'payload': 'nue37', 'attack_type': 'norm'},
 {'payload': 'tufts3@joll.rs', 'attack_type': 'norm'},
 {'payload': '22997112x', 'attack_type': 'norm'},
 {'payload': 'arenas de san juan', 'attack_type': 'norm'},
 {'payload': '19245', 'attack_type': 'norm'},
 {'payload': 'fennell', 'attack_type': 'norm'},
 {'payload': 'd50allecido', 'attack_type': 'norm'},
 {'payload': 'genny', 'attack_type': 'norm'},
 {'payload': '03248i367ca', 'attack_type': 'norm'},
 {'payload': 'grubel8@albeiteria.kw', 'attack_type': 'norm'},
 {'payload': '83497200r', 'attack_type': 'norm'},
 {'payload': 'martn de yeltes', 'attack_type': 'norm'},
 {'payload': '1769471856078209', 'attack_type': 'norm'},
 {'payload': 'cascabela', 'attack_type': 'norm'},
 {'payload': 'ludolfo', 'attack_type': 'norm'},
 {'payload': 'snerd@bwds.tj', 'attack_type': 'norm'},
 {'p

In [7]:
with open('xss.json', 'r') as f:
    xss = json.load(f)

In [8]:
xss

[{'Unnamed: 0': 0,
  'Sentence': '<li><a href="/wiki/File:Socrates.png" class="image"><img alt="Socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/27px-Socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/36px-Socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/Portal:Philosophy" title="Portal:Philosophy">Philosophy&#32;portal </a> </li> </ul>',
  'Label': 0},
 {'Unnamed: 0': 1,
  'Sentence': '<tt onmouseover="alert(1)">test</tt>',
  'Label': 1},
 {'Unnamed: 0': 2,
  'Sentence': '\t </span> <span class="reference-text">Steering for the 1995 "<a href="/wiki/History_of_autonomous_cars#1990s" class="mw-redirect" title="History of autonomous cars">No Hands Across America </a>" required "only a few human assists". ',
  'Label': 0},
 {'Unnamed: 0': 3

In [9]:
import urllib.parse

def unquote(text):
    k = 0
    uq_prev = text
    while(k < 100):
        uq = urllib.parse.unquote_plus(uq_prev)
        if uq == uq_prev:
            break
        else:
            uq_prev = uq
    
    return uq_prev

def remove_new_line(text):
    text = text.strip()
    text = ' '.join(text.splitlines())
    
    return text

def remove_multiple_whitespaces(text):
    return ' '.join(text.split())

def clean_pattern(pattern):
    pattern = unquote(pattern)
    pattern = remove_new_line(pattern)
    pattern = pattern.lower()
    pattern = remove_multiple_whitespaces(pattern)
    
    return pattern

def prepare_ecml(x):
    out = {}
    out['pattern'] = clean_pattern(x['request'])
    if x['type'] == 'Valid':
        out['type'] = 'valid'
    elif x['type'] == 'XSS':
        out['type'] = 'xss'
    elif x['type'] == 'SqlInjection':
        out['type'] = 'sqli'
    elif x['type'] == 'PathTransversal':
        out['type'] = 'path-traversal'
    elif x['type'] == 'OsCommanding':
        out['type'] = 'cmdi'
    return out

def prepare_custom(x):
    out = {}
    
    out['pattern'] = clean_pattern(x['payload'])
    if x['attack_type'] == 'norm':
        out['type'] = 'valid'
    else:
        out['type'] = x['attack_type']
    return out

def prepare_xss(x):
    out = {}
    
    out['pattern'] = clean_pattern(x['Sentence'])
    if x['Label'] == 1:
        out['type'] = 'xss'
    else:
        out['type'] = 'valid'
    return out

In [10]:
ecml = list(map(prepare_ecml, ecml))

In [11]:
custom = list(map(prepare_custom, custom))

In [12]:
xss = list(map(prepare_xss, xss))

In [13]:
ecml

[{'pattern': 'crteyreti=wsn&et3tf6shov=tdsviee y fum$oh3;ore&saib5hfavhepc=tcilbrr lne',
  'type': 'valid'},
 {'pattern': '*;q=0.7', 'type': 'valid'},
 {'pattern': 'nhgiopie/7.8.4.7.0', 'type': 'valid'},
 {'pattern': '/dyylkl.xd9cpu/4ot0ta/ts6xnrp1/hssh/a2cuerht/se8j00jif@ubfx.77g/s0f/gt/5tsrstiuyvldfatrury.png',
  'type': 'valid'},
 {'pattern': '/2m6vlb1r37jspc/cwvv/mbar/oqrd0/msc/etceebwgi/io/m2zxmv/r@i98tor3ybblcn.css',
  'type': 'valid'},
 {'pattern': 'seivste3c=804252514&gxnc1atstirc3er=ea&ioe9ogl=;~&j7etc88op=7$gr stx&hwltw=c2.2mwq&ane5=taistyle&usi=la &md=dnt@&zs=]t|hmeh',
  'type': 'valid'},
 {'pattern': '/soqdee/ldf.jpg', 'type': 'valid'},
 {'pattern': '*;q=0.3', 'type': 'valid'},
 {'pattern': '/i4tbn0qtpvx/l6nstqm5eoe/rh-elinsertpatuc1bgsound/rh/dfwpv/thnosozt6otnteic.css',
  'type': 'valid'},
 {'pattern': 'ittoaqitnt=ftxe&eal= $optobaee)rscriptcnid;cc)&pinfcoh8amv1=6&84k=68&optrhlpc=i &fnadtt9ao=syjzp4sf8&gem5neenha=3esgnemkuti',
  'type': 'valid'},
 {'pattern': 'oengtle0=85

In [14]:
custom

[{'pattern': 'c/ caridad s/n', 'type': 'valid'},
 {'pattern': 'campello, el', 'type': 'valid'},
 {'pattern': '1442431887503330', 'type': 'valid'},
 {'pattern': 'nue37', 'type': 'valid'},
 {'pattern': 'tufts3@joll.rs', 'type': 'valid'},
 {'pattern': '22997112x', 'type': 'valid'},
 {'pattern': 'arenas de san juan', 'type': 'valid'},
 {'pattern': '19245', 'type': 'valid'},
 {'pattern': 'fennell', 'type': 'valid'},
 {'pattern': 'd50allecido', 'type': 'valid'},
 {'pattern': 'genny', 'type': 'valid'},
 {'pattern': '03248i367ca', 'type': 'valid'},
 {'pattern': 'grubel8@albeiteria.kw', 'type': 'valid'},
 {'pattern': '83497200r', 'type': 'valid'},
 {'pattern': 'martn de yeltes', 'type': 'valid'},
 {'pattern': '1769471856078209', 'type': 'valid'},
 {'pattern': 'cascabela', 'type': 'valid'},
 {'pattern': 'ludolfo', 'type': 'valid'},
 {'pattern': 'snerd@bwds.tj', 'type': 'valid'},
 {'pattern': '15365381r', 'type': 'valid'},
 {'pattern': 'regueras de arriba', 'type': 'valid'},
 {'pattern': '08281',

In [15]:
xss

[{'pattern': '<li><a href="/wiki/file:socrates.png" class="image"><img alt="socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/18px-socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/27px-socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/36px-socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/portal:philosophy" title="portal:philosophy">philosophy&#32;portal </a> </li> </ul>',
  'type': 'valid'},
 {'pattern': '<tt onmouseover="alert(1)">test</tt>', 'type': 'xss'},
 {'pattern': '</span> <span class="reference-text">steering for the 1995 "<a href="/wiki/history_of_autonomous_cars#1990s" class="mw-redirect" title="history of autonomous cars">no hands across america </a>" required "only a few human assists".',
  'type': 'valid'},
 {'pattern': '</span> <span class="reference-text"><cite class="citati

In [16]:
with open('ecml_clean.json', 'w') as f:
    json.dump(ecml, f)

In [17]:
with open('HTTPParams_clean.json', 'w') as f:
    json.dump(custom, f)

In [18]:
with open('xss_clean.json', 'w') as f:
    json.dump(xss, f)

In [19]:
complete_clean = ecml+custom+xss

In [20]:
complete_clean

[{'pattern': 'crteyreti=wsn&et3tf6shov=tdsviee y fum$oh3;ore&saib5hfavhepc=tcilbrr lne',
  'type': 'valid'},
 {'pattern': '*;q=0.7', 'type': 'valid'},
 {'pattern': 'nhgiopie/7.8.4.7.0', 'type': 'valid'},
 {'pattern': '/dyylkl.xd9cpu/4ot0ta/ts6xnrp1/hssh/a2cuerht/se8j00jif@ubfx.77g/s0f/gt/5tsrstiuyvldfatrury.png',
  'type': 'valid'},
 {'pattern': '/2m6vlb1r37jspc/cwvv/mbar/oqrd0/msc/etceebwgi/io/m2zxmv/r@i98tor3ybblcn.css',
  'type': 'valid'},
 {'pattern': 'seivste3c=804252514&gxnc1atstirc3er=ea&ioe9ogl=;~&j7etc88op=7$gr stx&hwltw=c2.2mwq&ane5=taistyle&usi=la &md=dnt@&zs=]t|hmeh',
  'type': 'valid'},
 {'pattern': '/soqdee/ldf.jpg', 'type': 'valid'},
 {'pattern': '*;q=0.3', 'type': 'valid'},
 {'pattern': '/i4tbn0qtpvx/l6nstqm5eoe/rh-elinsertpatuc1bgsound/rh/dfwpv/thnosozt6otnteic.css',
  'type': 'valid'},
 {'pattern': 'ittoaqitnt=ftxe&eal= $optobaee)rscriptcnid;cc)&pinfcoh8amv1=6&84k=68&optrhlpc=i &fnadtt9ao=syjzp4sf8&gem5neenha=3esgnemkuti',
  'type': 'valid'},
 {'pattern': 'oengtle0=85

In [21]:
with open('complete_clean.json', 'w') as f:
    json.dump(complete_clean, f)