In [1]:
import json

In [4]:
# Load the raw ECML records from disk.
# The encoding is given explicitly so parsing does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../ecml/ECML.json', 'r', encoding='utf-8') as f:
    ecml = json.load(f)

In [6]:
# Inspect the first raw ECML record to see which keys it carries.
ecml[0]

{'type': 'Valid',
 'request': 'crTeYreti=wsn&et3tf6shoV=tdsviee+y+fum%24oh3%3Bore&sAib5hfAvhEpC=tcilbrr+Lne',
 'interval': None}

In [11]:
# Load the raw HTTPParams records from disk.
# The encoding is given explicitly so parsing does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../httpParams/HTTPParams.json', 'r', encoding='utf-8') as f:
    custom = json.load(f)

In [12]:
# Inspect the first raw HTTPParams record to see which keys it carries.
custom[0]

{'payload': 'c/ caridad s/n', 'attack_type': 'norm'}

In [13]:
# Load the raw XSS records from disk.
# The encoding is given explicitly so parsing does not depend on the
# platform's locale default (JSON text is expected to be UTF-8).
with open('../xss/xss.json', 'r', encoding='utf-8') as f:
    xss = json.load(f)

In [14]:
# Inspect the first raw XSS record to see which keys it carries.
xss[0]

{'Unnamed: 0': 0,
 'Sentence': '<li><a href="/wiki/File:Socrates.png" class="image"><img alt="Socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/18px-Socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/27px-Socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Socrates.png/36px-Socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/Portal:Philosophy" title="Portal:Philosophy">Philosophy&#32;portal </a> </li> </ul>',
 'Label': 0}

In [15]:
import urllib.parse

# The unquote function decodes a URL-encoded string, potentially multiple times, until it can no longer be decoded further.
def unquote(text, max_rounds=100):
    """
    Repeatedly URL-decode ``text`` until another pass no longer changes it.

    Each pass uses ``urllib.parse.unquote_plus``, so percent-escapes are
    decoded and ``+`` is turned into a space. Multiply-encoded input such as
    ``%2520`` is therefore unwrapped layer by layer.

    Args:
        text (str): The possibly multiply URL-encoded input.
        max_rounds (int): Upper bound on the number of decoding passes.
            Defaults to 100.

    Returns:
        str: The text after the last pass that changed it (or ``text`` itself
            if nothing needed decoding).
    """
    decoded = text
    # The number of passes is bounded by the loop itself, so the cap is
    # actually enforced rather than relying on a counter being updated.
    for _ in range(max_rounds):
        candidate = urllib.parse.unquote_plus(decoded)
        if candidate == decoded:
            # Fixed point reached: nothing left to decode.
            break
        decoded = candidate

    return decoded

def remove_new_line(text):
    """
    Trim both ends of ``text`` and join its lines with single spaces.

    Args:
        text (str): Text that may span several lines.

    Returns:
        str: The trimmed text with every line break replaced by one space.
    """
    # Strip first so leading/trailing line breaks do not become spaces.
    lines = text.strip().splitlines()
    return ' '.join(lines)


def remove_multiple_whitespaces(text):
    """
    Collapse every run of whitespace in ``text`` into a single space.

    Because the text is split on whitespace and re-joined, whitespace at the
    start and end of the text is dropped as well.

    Args:
        text (str): The text to normalise.

    Returns:
        str: The whitespace-separated tokens of ``text`` joined by one space.
    """
    tokens = text.split()
    return ' '.join(tokens)

def clean_pattern(pattern):
    """
    Normalise a raw payload string by chaining the helper functions.

    Steps, in order: URL-decode via ``unquote``, flatten line breaks via
    ``remove_new_line``, lowercase, then collapse whitespace via
    ``remove_multiple_whitespaces``.

    Args:
        pattern (str): The raw payload text.

    Returns:
        str: The normalised payload.
    """
    decoded = unquote(pattern)
    single_line = remove_new_line(decoded)
    return remove_multiple_whitespaces(single_line.lower())

# For ecml data

In [16]:
def prepare_ecml(x):
    """
    Convert one raw ECML record into the common ``{'pattern', 'type'}`` form.

    Args:
        x (dict): Input data containing 'request' and 'type' keys.

    Returns:
        dict: Processed data with 'pattern' (the cleaned request) and 'type'
            (the normalised label) keys. Labels without an explicit mapping
            are kept unchanged.
    """
    # ECML label -> label used in the cleaned output.
    label_map = {
        'Valid': 'valid',
        'XSS': 'xss',
        'SqlInjection': 'sqli',
        'PathTransversal': 'path-traversal',
        'OsCommanding': 'cmdi',
    }
    pattern = clean_pattern(x['request'])
    label = x['type']
    return {
        'pattern': pattern,
        # Fall back to the original label (as prepare_custom does) so that
        # every record carries a 'type' key instead of silently losing it.
        'type': label_map.get(label, label),
    }

# For HTTPParams data

In [17]:
def prepare_custom(x):
    """
    Convert one HTTPParams record into the common ``{'pattern', 'type'}`` form.

    Args:
        x (dict): Input data containing 'payload' and 'attack_type' keys.

    Returns:
        dict: 'pattern' holds the cleaned payload; 'type' is 'valid' when the
            attack type is 'norm', otherwise the attack type as given.
    """
    pattern = clean_pattern(x['payload'])
    label = x['attack_type']
    return {'pattern': pattern, 'type': 'valid' if label == 'norm' else label}

# For xss

In [18]:
def prepare_xss(x):
    """
    Convert one XSS record into the common ``{'pattern', 'type'}`` form.

    Args:
        x (dict): Input data containing 'Sentence' and 'Label' keys.

    Returns:
        dict: 'pattern' holds the cleaned sentence; 'type' is 'xss' when the
            label equals 1, otherwise 'valid'.
    """
    pattern = clean_pattern(x['Sentence'])
    label = 'xss' if x['Label'] == 1 else 'valid'
    return {'pattern': pattern, 'type': label}

In [19]:
# Normalise every record of each dataset into the shared {'pattern', 'type'} form
# by running it through the matching prepare_* function.
# NOTE: the names ecml, custom and xss are rebound to the cleaned lists, so the
# raw keys are gone afterwards; reload the JSON files before re-running this cell.
ecml = [prepare_ecml(record) for record in ecml]
custom = [prepare_custom(record) for record in custom]
xss = [prepare_xss(record) for record in xss]

In [20]:
# First ECML record after prepare_ecml has been applied.
ecml[0]

{'pattern': 'crteyreti=wsn&et3tf6shov=tdsviee y fum$oh3;ore&saib5hfavhepc=tcilbrr lne',
 'type': 'valid'}

In [21]:
# First XSS record after prepare_xss has been applied.
xss[0]

{'pattern': '<li><a href="/wiki/file:socrates.png" class="image"><img alt="socrates.png" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/18px-socrates.png" decoding="async" width="18" height="28" class="noviewer" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/27px-socrates.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cd/socrates.png/36px-socrates.png 2x" data-file-width="326" data-file-height="500" /> </a> <a href="/wiki/portal:philosophy" title="portal:philosophy">philosophy&#32;portal </a> </li> </ul>',
 'type': 'valid'}

In [22]:
# First HTTPParams record after prepare_custom has been applied.
custom[0]

{'pattern': 'c/ caridad s/n', 'type': 'valid'}

In [23]:
# Write the cleaned ECML records to 'ecml_clean.json' in the working directory.
with open('ecml_clean.json', 'w') as f:
    json.dump(ecml, f)

In [25]:
# Write the cleaned HTTPParams records to 'HTTPParams_clean.json' in the working directory.
with open('HTTPParams_clean.json', 'w') as f:
    json.dump(custom, f)

In [26]:
# Write the cleaned XSS records to 'xss_clean.json' in the working directory.
with open('xss_clean.json', 'w') as f:
    json.dump(xss, f)

In [27]:
complete_clean = ecml+custom+xss
# Concatenates the three cleaned lists into one new list, 'complete_clean': the ECML records first, then the HTTPParams records, then the XSS records.

In [31]:
# Show only the first combined record; echoing the whole list would flood the output.
complete_clean[0]

{'pattern': 'crteyreti=wsn&et3tf6shov=tdsviee y fum$oh3;ore&saib5hfavhepc=tcilbrr lne',
 'type': 'valid'}

In [32]:
# Write the combined list of cleaned records to 'complete_clean.json' in the working directory.
with open('complete_clean.json', 'w') as f:
    json.dump(complete_clean, f)