In [1]:
import requests
from tqdm import tqdm

def download_atlasv2(url=None, filename="atlasv2.tar.gz", extract_dir="atlas_dataset/"):
    """Download the ATLASv2 archive and unpack it into ``extract_dir``.

    The archive is streamed to ``filename`` with a tqdm progress bar and is
    extracted only when the server answered with HTTP 200. On any other
    status a message is printed and nothing is extracted.

    Args:
        url: Download URL. ``None`` selects the built-in Box link.
        filename: Local path the archive is written to.
        extract_dir: Directory the archive is unpacked into; created if missing.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        tarfile.TarError: If the downloaded file is not a readable tar.gz.
    """
    # Local imports keep this cell self-contained (the notebook's import
    # lines only bring in requests and tqdm).
    import os
    import tarfile

    if url is None:
        # NOTE(review): this looks like a signed Box link and may be
        # short-lived -- confirm it is still valid before relying on it.
        url = "https://public.boxcloud.com/d/1/b1!DF-5pZffA7zR1_FcUVCjx4PJaM3r8Lx6kBOjMXR9DG-geexQh9pfYrZNUzBLVe5vu620bSdhYCFpRhlwpF8DRlkV1VXDUIytUAH0lEvnJsN2Gzu4IVGKYqvRCwEQw02lbOAxCKDHJ7tKVn-C2OQSZs6Cr2kCOHM5TLQxIfWzrJzTU4dxHicqi-tDUeMF4xo2WKxIH8aNtVmx5TIMa_9tWaOb3npDRkUvk7MGCuk6CFsQSTNigqvmC2gSRtBnrVtdnCDPZF_yNBBLCpyOVFAEeR18pnqSdCw0HS4nkDHeA9yZ1RF4t6eYRM4xK8DrWAWMdDmT2_qdw4g4wi8WSjXeCCUmE3kOaUcmd3pXgS46qtb8lZdmw4PTriZUt9szpaRe5AgxO7F_Up0b8mbNZWDJ8kF7G9gHR6vToa28kWq_TwF7nOEtbdzPHZeL-vAk73D2cCJTmJB4einKCrsGLe4R-MOzKcWj7wu2Fjt1IvS6aqVRVy1uvwIRiWrUyhHDxSxsFGezirHsGKbBQej5Cytn40BaOlmFHMvK3S9vfpL8m2XjqmBW_q6sw4HnNYMBKTb9DzKrQb2aj3apEyuANeh-VSImgX09kiIWM70Q-QpUAI794N8bnJ3iykg6kjWro-3EZb44wOjuvMNGUp-HOzAbwY2P_Gk2y-Opo3UkXwoJV66n0LyMVgt8tLI0Dm7Q_VEsyw9u-b_mn3-uxblZR4fllyfCur7Ew8R2jiIZcry-o5iE2M1lIL7zlDy3-yCMhN75VXMI6EVvtuIwaHmLMubvu2N62nGVJ2n039LVAkByCoGzRBNIOXVgBy-aY49xnScn0F_fzyjQIK7VdsObJnO9Tg5bjurfRQ_ZQNx-g4CPG-vne2FkuI1ft7bLEcIMmVk0zn67VHwmcQ4rcBFHW_d_Vw2OfozrS0QjpcOIX1oC7h_MZIHb2SMzLKu3VyhqroZkYSz-Eebl3ieQuF8NO9d4kRJin_OBxki1_7IFSa239hcMoMk3x3hmHmsfeOb3msEtCdcXvcQOdt-inyjuNlvd_uhqs-p2LWcW32GBpAfYpnP26wMT-DHo1lX7k4R4BEsxWIelC-_-GnGpx7R7wsxvHmamyjjKeNfv_RvUeBYGqC67q64HAZY3ggQyyuzCanzXukzevdO5CkmxB61cNAE4OfKYcuPpITWE9XStCjA_hQ7xsnSz6tG0v69Ul9-qWxrmlChf9aubNe6HyBifPh7l7udwRnaVumiTMN0HaM232VdT-t5HoetaqSCnKIj46ql4iEZOqCuA/download"

    # A single streaming GET provides both the size header and the body, so no
    # separate HEAD request is needed. The timeout (connect, read) prevents a
    # stalled connection from hanging the kernel forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            print(f"Failed to download file. Status code: {response.status_code}")
            # Stop here: there is no fresh archive to extract.
            return

        file_size = int(response.headers.get('content-length', 0))

        with open(filename, 'wb') as file, tqdm(
            desc=filename,
            # An unknown size (0) becomes None so tqdm shows a plain counter.
            total=file_size or None,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)
                    bar.update(len(chunk))

    print(f"File downloaded successfully and saved as {filename}")

    # The target directory must exist before extraction.
    os.makedirs(extract_dir, exist_ok=True)
    with tarfile.open(filename, "r:gz") as archive:
        if hasattr(tarfile, "data_filter"):
            # Refuse absolute paths / path traversal where the runtime supports it.
            archive.extractall(extract_dir, filter="data")
        else:
            archive.extractall(extract_dir)

In [5]:
import pandas as pd
import json

# Maps each supported EDR event type to
# (action label, object kind, key holding the object id, key holding the object name).
_EVENT_SPECS = {
    'endpoint.event.crossproc': ('crossproc', 'PROCESS', 'crossproc_guid', 'crossproc_name'),
    'endpoint.event.procstart': ('procstart', 'PROCESS', 'childproc_guid', 'childproc_name'),
    'endpoint.event.filemod': ('filemod', 'FILE', 'filemod_name', 'filemod_name'),
    'endpoint.event.netconn': ('netconn', 'SOCKET', 'remote_ip', 'remote_ip'),
}

# Column order of the returned frame (same order the rows are built in).
_LOG_COLUMNS = ['action', 'actorID', 'objectID', 'object', 'actorname', 'objectname', 'timestamp']


def load_data(file_path="atlas_dataset/atlasv2/data/benign/h1/cbc-edr/edr-h1-benign.jsonl"):
    """Parse a JSONL EDR log into a DataFrame of actor -> object events.

    Each non-blank line is decoded as one JSON document. Documents whose
    ``type`` is one of the supported event types (crossproc, procstart,
    filemod, netconn) become one row; every other document is skipped.

    Args:
        file_path: Path to the ``.jsonl`` file, read as UTF-8.

    Returns:
        pandas.DataFrame with columns ``action``, ``actorID``, ``objectID``,
        ``object``, ``actorname``, ``objectname`` and ``timestamp``. The
        columns are present even when no supported event was found.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    log_data = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            doc = json.loads(line)
            event_type = doc.get('type')
            # The isinstance guard keeps an unhashable "type" value from raising in .get().
            spec = _EVENT_SPECS.get(event_type) if isinstance(event_type, str) else None
            if spec is None:
                continue

            action, object_kind, id_key, name_key = spec
            log_data.append({
                'action': action,
                'actorID': doc.get('process_guid'),
                'objectID': doc.get(id_key),
                'object': object_kind,
                'actorname': doc.get('process_path'),
                'objectname': doc.get(name_key),
                'timestamp': doc.get('device_timestamp'),
            })

    # Explicit columns keep the schema stable when log_data is empty.
    return pd.DataFrame(log_data, columns=_LOG_COLUMNS)

In [6]:
# Parse the log at load_data's default file_path into a DataFrame.
df = load_data()

In [9]:
# Preview the first rows of the parsed event table.
df.head()

Unnamed: 0,action,actorID,objectID,object,actorname,objectname,timestamp
0,filemod,7DMF69PK-05debeef-000009b8-00000000-1d8984c85f...,c:\windows\system32\spool\drivers\w32x86\tp ps...,FILE,c:\program files\vmware\vmware tools\tpautocon...,c:\windows\system32\spool\drivers\w32x86\tp ps...,2022-07-19 07:24:57.2128941 +0000 UTC
1,filemod,7DMF69PK-05debeef-0000055c-00000000-1d8984c843...,c:\windows\system32\spool\drivers\w32x86\3\new...,FILE,c:\windows\system32\spoolsv.exe,c:\windows\system32\spool\drivers\w32x86\3\new...,2022-07-19 07:24:57.227895 +0000 UTC
2,filemod,7DMF69PK-05debeef-0000055c-00000000-1d8984c843...,c:\windows\system32\spool\drivers\w32x86\3\new...,FILE,c:\windows\system32\spoolsv.exe,c:\windows\system32\spool\drivers\w32x86\3\new...,2022-07-19 07:24:57.228895 +0000 UTC
3,filemod,7DMF69PK-05debeef-0000055c-00000000-1d8984c843...,c:\windows\system32\spool\drivers\w32x86\3\new...,FILE,c:\windows\system32\spoolsv.exe,c:\windows\system32\spool\drivers\w32x86\3\new...,2022-07-19 07:24:57.228895 +0000 UTC
4,filemod,7DMF69PK-05debeef-000009b8-00000000-1d8984c85f...,c:\windows\system32\spool\drivers\w32x86\tp ps...,FILE,c:\program files\vmware\vmware tools\tpautocon...,c:\windows\system32\spool\drivers\w32x86\tp ps...,2022-07-19 07:24:57.245896 +0000 UTC
