In [3]:
from elasticsearch import Elasticsearch
import pandas as pd
import warnings

from sklearn.preprocessing import LabelBinarizer

from dotenv import load_dotenv
import os


# Elasticsearch credentials are read from the environment (optionally populated
# from a local .env file by load_dotenv) so that no secret lives in the notebook.
# Expected variables: ELASTIC_USER (defaults to "elastic") and ELASTIC_PASSWORD.
# NOTE(review): a password was previously committed in this cell; it should be
# treated as leaked and rotated.
load_dotenv()
elastic_user = os.getenv("ELASTIC_USER", "elastic")
elastic_password = os.getenv("ELASTIC_PASSWORD")
if not elastic_password:
    raise RuntimeError(
        "ELASTIC_PASSWORD is not set; define it in the environment or in a .env file"
    )

# Create the Elasticsearch client with a pool of connections per node.
# verify_certs=False disables TLS certificate verification for the local node.
es = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200, 'scheme': 'https'}],
    basic_auth=(elastic_user, elastic_password),
    verify_certs=False,
    request_timeout=1000000,
    connections_per_node=64
)
# Silence urllib3's security warnings (certificate verification is disabled above)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.SecurityWarning)

from elasticsearch.helpers import scan

  _transport = transport_class(


In [4]:
def get_all_data_from_index(client=None, index_name='xml_data', page_size=1000, scroll='1m'):
    """Return the ``_source`` of every document in an index using the scroll API.

    Parameters
    ----------
    client : optional
        Object exposing ``search``, ``scroll`` and ``clear_scroll``. When
        omitted, the module-level ``es`` client is used.
    index_name : str
        Index to read; defaults to ``'xml_data'``.
    page_size : int
        Number of documents requested per page; defaults to 1000.
    scroll : str
        Keep-alive passed to each search/scroll request; defaults to ``'1m'``.

    Returns
    -------
    list
        The ``_source`` of each hit, in the order the pages were returned.
    """
    if client is None:
        client = es  # module-level client created in the connection cell

    # match_all query, paginated page_size documents at a time
    query = {
        "query": {"match_all": {}},
        "size": page_size
    }
    results = client.search(index=index_name, body=query, scroll=scroll)
    scroll_id = results['_scroll_id']

    all_data = []
    try:
        # Iterate on the hits actually returned rather than on the reported total,
        # so the loop ends exactly when a page comes back empty.
        hits = results['hits']['hits']
        while hits:
            all_data.extend(hit["_source"] for hit in hits)

            # Next page, using the most recent scroll id
            results = client.scroll(scroll_id=scroll_id, scroll=scroll)
            scroll_id = results['_scroll_id']
            hits = results['hits']['hits']
    finally:
        # Release the scroll context instead of leaving it open until the keep-alive expires
        client.clear_scroll(scroll_id=scroll_id)

    return all_data

# Load every document of the index into memory (issues the scroll requests)
data = get_all_data_from_index()

# Show how many documents were retrieved
print(len(data))

1616867


In [5]:
# Convert the retrieved documents into a DataFrame
df = pd.DataFrame(data)

In [6]:
# Names of all the columns that need to be transformed
all_categoricals = [
    'appName',
    'direction',
    'sourceTCPFlagsDescription',
    'destinationTCPFlagsDescription',
    'protocolName',
    'source',
    'destination',
    'sourcePort',
    'destinationPort',
]

In [7]:
# Initialise the LabelBinarizer
label_binarizer = LabelBinarizer()

# Binarize 'appName': one indicator column per distinct value found in the column
binarizer = label_binarizer.fit_transform(df['appName'])
# Build the indicator frame on df's own index so that the column-wise concat
# below pairs each indicator row with its source row even if df does not carry
# the default 0..n-1 index.
df_binarizer = pd.DataFrame(data=binarizer, columns=label_binarizer.classes_, index=df.index)

# Append the new indicator columns to the existing data
df = pd.concat([df, df_binarizer], axis=1)

In [8]:
import ipaddress
# Helper used to turn IP address strings into integers
def ipv4_to_int (ip_str):
    """Return the integer value of an IPv4 address.

    ``ip_str`` is parsed with ``ipaddress.IPv4Address``. When parsing raises
    ``AddressValueError`` a message is printed and ``None`` is returned.
    """
    try:
        # Parse the value and collapse the address object straight to its integer form
        return int(ipaddress.IPv4Address(ip_str))
    except ipaddress.AddressValueError:
        print(f"L'ip n'a pas était passé en entier: {ip_str}")
    return None


In [9]:
# Combine stopDateTime and startDateTime into a single 'duration' column,
# expressed in seconds when the difference is a timedelta
df['duration'] = pd.to_datetime(df['stopDateTime']) - pd.to_datetime(df['startDateTime'])
if df['duration'].dtypes == 'timedelta64[ns]':
    df['duration'] = df['duration'].dt.total_seconds()

# Byte/packet counters stored as strings become numeric (unparseable values -> NaN)
for counter_col in ('totalSourceBytes', 'totalDestinationBytes',
                    'totalSourcePackets', 'totalDestinationPackets'):
    if df[counter_col].dtypes == 'object':
        df[counter_col] = pd.to_numeric(df[counter_col], errors='coerce')

# IP addresses and ports as integers: (input column, output column, converter)
for raw_col, int_col, to_int in (
    ('destination', 'destination_int', ipv4_to_int),
    ('source', 'source_int', ipv4_to_int),
    ('sourcePort', 'port_source', int),
    ('destinationPort', 'port_destination', int),
):
    df[int_col] = df[raw_col].apply(to_int)

# Durations are split into 7 equal-width intervals (pd.cut)
quantile = pd.cut(df['duration'], bins=7)
df['duration_quantile'] = quantile

# The remaining numeric columns are bucketed by quantile so that the later
# one-hot encoding produces fewer columns: (input column, output column, qcut options)
for value_col, bucket_col, qcut_options in (
    ('totalSourceBytes', 'totalSourceBytes_quantile', {'q': 8}),
    ('totalDestinationBytes', 'totalDestinationBytes_quantile', {'q': 9}),
    ('totalSourcePackets', 'totalSourcePackets_quantile', {'q': 6, 'duplicates': 'drop'}),
    ('totalDestinationPackets', 'totalDestinationPackets_quantile', {'q': 7}),
    ('source_int', 'source_quantile', {'q': 6, 'duplicates': 'drop'}),
    ('destination_int', 'destination_quantile', {'q': 6, 'duplicates': 'drop'}),
    ('port_source', 'portSource_quantile', {'q': 50, 'duplicates': 'drop'}),
    ('port_destination', 'portDestination_quantile', {'q': 50, 'duplicates': 'drop'}),
):
    quantile = pd.qcut(df[value_col], **qcut_options)
    df[bucket_col] = quantile



In [10]:
from sklearn.preprocessing import OneHotEncoder

# Dictionary holding one fitted OneHotEncoder per categorical column.
onehot_encoders = {}

# Convert the 'Tag' column to binary: 0 if the value is 'Normal', and 1 otherwise.
# NOTE: this is not idempotent - re-running the cell on the already converted
# column maps every row to 1, because 0 is not equal to 'Normal'.
df['Tag'] = df['Tag'].apply(lambda x: 0 if x == 'Normal' else 1)

# Columns that are transformed with one-hot encoding
categorical_columns = [
    'protocolName', 'direction', 'duration_quantile', 'source_quantile',
    'totalDestinationPackets_quantile', 'destination_quantile',
    'totalSourcePackets_quantile', 'totalSourceBytes_quantile',
    'totalDestinationBytes_quantile'
]

# One-hot encode the specified categorical columns.
for column in categorical_columns:
    print(column)
    # A fresh encoder per column: re-fitting one shared instance would leave every
    # entry of onehot_encoders pointing at the encoder fitted on the LAST column.
    encoder = OneHotEncoder(drop='first', sparse_output=False)

    # Fit and transform the column (values are compared as strings).
    encoded_cols = encoder.fit_transform(df[[column]].astype(str))

    # Column names are '<column>_<k>', k = 1..n-1: the first category is dropped.
    col_names = [column + '_' + str(index) for index, _ in enumerate(encoder.categories_[0][1:], start=1)]

    # Build the encoded frame on df's index so the column-wise concat stays row-aligned.
    encoded_df = pd.DataFrame(encoded_cols, columns=col_names, index=df.index)

    # Append the encoded columns, then drop the original categorical column.
    df = pd.concat([df, encoded_df], axis=1)
    df.drop(column, axis=1, inplace=True)

    # Keep this column's own fitted encoder for possible future use.
    onehot_encoders[column] = encoder



protocolName
direction
duration_quantile
source_quantile
totalDestinationPackets_quantile
destination_quantile
totalSourcePackets_quantile
totalSourceBytes_quantile
totalDestinationBytes_quantile


In [11]:
# Indicator columns for the TCP flag descriptions, in the manner of a one-hot
# encoding: 1 when the flag letter appears in the description, 0 otherwise.
def _has_tcp_flag(description, flag):
    """Return 1 if ``flag`` occurs in ``description``, else 0 (also 0 for None or 'N/A')."""
    if description is None or 'N/A' in description:
        return 0
    return 1 if flag in description else 0


# Destination side: flags R, S, F, P, A
for flag in 'RSFPA':
    df['destinationTCPFlags' + flag] = df['destinationTCPFlagsDescription'].apply(_has_tcp_flag, args=(flag,))

# Source side: flags R, S, F, P, A, U
for flag in 'RSFPAU':
    df['sourceTCPFlagsTag' + flag] = df['sourceTCPFlagsDescription'].apply(_has_tcp_flag, args=(flag,))


In [12]:

# Remove, in a single call, every column that is no longer needed
obsolete_columns = [
    'totalDestinationBytes',
    'totalDestinationPackets',
    'startDateTime',
    'stopDateTime',
    'sourcePayloadAsUTF',
    'startTime',
    'duration',
    'sourceTCPFlagsDescription',
    'destinationTCPFlagsDescription',
    'source',
    'destination',
    'origin',
    'appName',
    'sourcePort',
    'sensorInterfaceId',
    'destinationPort',
    'totalSourceBytes',
    'destination_int',
    'source_int',
    'totalSourcePackets',
    'sourcePayloadAsBase64',
    'destinationPayloadAsBase64',
    'destinationPayloadAsUTF',
]
df.drop(columns=obsolete_columns, inplace=True)


In [13]:
# Print the final column names, then store the transformed DataFrame in a pickle file
print(df.columns.to_list())
df.to_pickle('categorical_transform.pkl')

['Tag', 'AOL-ICQ', 'Anet', 'Authentication', 'BGP', 'BitTorrent', 'Blubster', 'Citrix', 'Common-P2P-Port', 'Common-Ports', 'DNS', 'DNS-Port', 'FTP', 'Filenet', 'Flowgen', 'Gnutella', 'Google', 'Groove', 'GuptaSQLBase', 'H.323', 'HTTPImageTransfer', 'HTTPWeb', 'Hosts2-Ns', 'Hotline', 'ICMP', 'IGMP', 'IMAP', 'IPSec', 'IPX', 'IRC', 'Ingres', 'Intellex', 'Kazaa', 'LDAP', 'MDQS', 'MGCP', 'MS-SQL', 'MSMQ', 'MSN', 'MSN-Zone', 'MSTerminalServices', 'ManagementServices', 'MicrosoftMediaServer', 'Misc-DB', 'Misc-Mail-Port', 'Misc-Ports', 'MiscApp', 'MiscApplication', 'NETBEUI', 'NFS', 'NNTPNews', 'NTP', 'Nessus', 'NetBIOS-IP', 'Network-Config-Ports', 'NortonAntiVirus', 'NortonGhost', 'OpenNap', 'OpenWindows', 'Oracle', 'PCAnywhere', 'POP', 'POP-port', 'PPTP', 'PeerEnabler', 'PostgreSQL', 'Printer', 'RPC', 'RTSP', 'Real', 'SAP', 'SIP', 'SMS', 'SMTP', 'SNA', 'SNMP-Ports', 'SSDP', 'SSH', 'SSL-Shell', 'SecureWeb', 'Squid', 'StreamingAudio', 'SunRPC', 'Tacacs', 'Telnet', 'Timbuktu', 'TimeServer', 'Un