In [2]:
# This code is open source to USENIX Security submission only (now).  
# After the publication of the corresponding thesis, the code will be free to use, modify, and distribute.
# Copyright (c) [Anonymous authors] [2023], which will be public after the publication.
# Licensed under the MIT License, Version 2.0.
# We kindly ask that this code be used for academic purposes only.
# Please do not abuse this code to launch any attacks. 

# This code is the CrossPoint controller CC attacks!
# After you collect the bot-config files, you can run this file to reveal hidden links.
# Input: 1. bot-config files (JSON) should be in the correct place.
#        2. Optionally, you can have CSV files that represent the result of congestion.
#        3. Your budget.
# Output: 1. The control group.
#         2. The attack bots (using CC,SD or both.)

# CrossPoint attacks workflow: 

# For researchers who want to rebuild our experiments in your own environment:
    # 1. Run the bot_config in each bot, generating attack_flow JSON files. 
    # 2. Send these JSON files to the controller. 
    # 3. In the controller, run controller_CrossPoint to output the suspicious attack flow set. (This file)
    # 4. In the controller, find the control group and the attack flow set.
    # 5. Run bash_ping in control group bots and suspicious attack flow bots. 
    # 6. Run controller_CrossPoint to find profitable links.

# (Fast example) We also give a fast example with configured bots and congestion files!!
    # This code gives a fast example to understand CrossPoint-CC attacks!
    # This example only contains four bot-paths congestion samples.
    # It might take a few minutes to run (the main part is analyzing the congestion). 

# The experiment can be done with simulations and experiments. 
# The only difference is in step 2: The method of how you transfer JSON bot config files. 

# Using this you can run the bash_ping command on a bot to get the csv ping files.
# You should add your destination address in the @ip_list variable.

import re
import json
import random
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt 

def raw_ping_to_csv(rawping_file,csv_file_name):
    """Convert a raw ping log into a CSV file with time/seq/delay columns.

    Intended for logs that were not collected with the bash_ping tool.
    Only lines containing ``bytes from`` are parsed; each such line must
    carry a bracketed timestamp, an ``icmp_seq=`` field and a
    ``time=... ms`` field, otherwise the regex lookup fails with
    AttributeError.

    Args:
        rawping_file: Path of the raw ping log to read.
        csv_file_name: Output path without extension; ``.csv`` is appended.

    Returns:
        None. The table is written to ``csv_file_name + '.csv'``.
    """
    with open(rawping_file, 'r') as log:
        reply_lines = [line for line in log.readlines() if 'bytes from' in line]

    # One record per reply: (timestamp, icmp sequence number, RTT), all kept as text.
    records = []
    for line in reply_lines:
        stamp = re.search(r"\[(.*?)\]", line).group(1)
        seq_no = re.search(r"icmp_seq=(\d+)", line).group(1)
        delay_ms = re.search(r"time=(.*?) ms", line).group(1)
        records.append((stamp, seq_no, delay_ms))

    table = pd.DataFrame(records, columns=['time', 'seq', 'delay'])
    table.to_csv(csv_file_name+'.csv', index=False)
    return

class congestion:
    """One congestion episode observed on a single ping trace.

    The methods update_length, get_coef_trace and update_coef_list are
    legacy code that is not used by the current workflow; they read
    attributes (delta_rtt, drop, coef_rtt_before, coef_rtt_after) that
    __init__ does not create, so those must be set by the caller first.
    """

    def __init__(self,start_time,congestion_sample,length,coef_start_time,minrtt) -> None:
        """Store the episode and build its padded correlation sample.

        Args:
            start_time: Timestamp of the first congested sample. It has to be
                exact, because the accuracy of the attack depends on it.
            congestion_sample: List of RTT values recorded during the episode.
            length: Number of samples in the episode.
            coef_start_time: Timestamp at which the padded sample starts.
            minrtt: Minimal RTT of the trace, used as the padding value.
        """
        self.length = length
        self.start_time = start_time
        self.congestion_sample = congestion_sample
        # Surround the episode with ten baseline (minimal RTT) samples on each side.
        baseline_pad = [minrtt] * 10
        self.coef_sample = baseline_pad + congestion_sample + baseline_pad
        self.coef_start_time = coef_start_time

    def update_bucket(self,value):
        """Replace every -1 placeholder in congestion_sample with `value`."""
        patched = []
        for sample in self.congestion_sample:
            patched.append(value if sample == -1 else sample)
        self.congestion_sample = patched

    def update_length(self):
        """Legacy: set length to the number of delta_rtt entries."""
        sample_count = len(self.delta_rtt)
        assert sample_count == len(self.drop)
        self.length = sample_count

    def get_coef_trace(self):
        """Legacy: return the before / during / after RTT lists joined together."""
        leading = self.coef_rtt_before
        return leading + self.delta_rtt + self.coef_rtt_after

    def update_coef_list(self,tr_list):
        """Legacy: append `length` RTTs from before and after the episode.

        `tr_list` must expose a `tr_list` sequence of objects with an `rtt`
        attribute and a `get_trace_from_time` method. Always returns 0.
        """
        assert self.length != 0
        before_time = max(self.start_time - self.length * 0.1, 0)
        after_time = min(self.start_time + self.length * 0.2, len(tr_list.tr_list))
        before_idx = tr_list.get_trace_from_time(before_time)
        after_idx = tr_list.get_trace_from_time(after_time)
        offset = 0
        while offset < self.length:
            self.coef_rtt_before.append(tr_list.tr_list[before_idx + offset].rtt)
            self.coef_rtt_after.append(tr_list.tr_list[after_idx + offset].rtt)
            offset += 1
        return 0

    def __str__(self) -> str:
        return f'Congestion {self.start_time} inteval {self.length} rtt {self.congestion_sample} '

class correlated_congestion:
    """A congestion seen by the whole control group at the same time.

    Holds the start time of the congestion and one RTT sample list per
    control-group member, and can be round-tripped through JSON.
    """

    def __init__(self,start_time,member_congestion):
        self.start_time = start_time
        self.member_congestion = member_congestion

    @classmethod
    def from_json(cls, json_str):
        """Build an instance from a JSON object string produced by to_json."""
        fields = json.loads(json_str)
        return cls(**fields)

    def to_json(self):
        """Serialise all instance attributes as a JSON object string."""
        return json.dumps(vars(self))


def get_minRTT(trace):
    """Return the smallest value of the trace's "delay" column.

    The minimal RTT is used throughout as the propagation-delay baseline.
    """
    return trace["delay"].min()

def get_maxRTT(trace):
    """Return the largest value of the trace's "delay" column.

    The maximal RTT approximates the RTT right at the edge of a packet loss,
    so it can stand in for the unknown RTT of a dropped packet.
    """
    return trace["delay"].max()

def get_local_max(trace,idx,half_window=5):
    """Return the largest delay in the positional window around `idx`.

    The window covers positions ``idx - half_window`` (inclusive) up to
    ``idx + half_window`` (exclusive). The local maximum stands in for the
    RTT at the edge of a packet loss.

    Args:
        trace: DataFrame with a "delay" column.
        idx: Position of the sample the window is centred on.
        half_window: Number of samples taken on each side (default 5).

    Returns:
        The maximum delay inside the window.
    """
    # Clamp the lower bound: a negative iloc start would count from the END of
    # the column and yield an empty (NaN) or unrelated window for idx < half_window.
    start = max(idx - half_window, 0)
    return trace["delay"].iloc[start:idx + half_window].max()

def get_loss(trace):
    """Count the loss events in a trace.

    A loss event is a place where the "seq" column jumps by more than one
    between two consecutive rows. Each jump counts once, regardless of how
    many packets are missing inside it.

    Args:
        trace: DataFrame with a "seq" column.

    Returns:
        The number of loss events as an int.
    """
    # diff() compares neighbouring rows by position, so the count does not
    # depend on the frame having a default 0..n-1 index.
    gaps = trace["seq"].diff() > 1
    return int(gaps.sum())
            
def get_abnRTT(trace):
    """Count samples whose delay exceeds the minimal RTT by more than 30.

    The very first sample of the trace is never counted.

    Args:
        trace: DataFrame with a "delay" column.

    Returns:
        The number of abnormal samples as an int.
    """
    threshold = get_minRTT(trace) + 30
    later_delays = list(trace["delay"])[1:]
    return sum(1 for delay in later_delays if delay > threshold)

def delay_update(trace,minrtt):
    """Snap near-baseline delays to `minrtt`, modifying `trace` in place.

    Delays within 10 of `minrtt` are set to `minrtt` first; afterwards any
    delay still below `minrtt` is raised to it. Nothing is returned.
    """
    near_baseline = (trace["delay"] - minrtt).abs() < 10
    trace.loc[near_baseline, 'delay'] = minrtt
    below_baseline = trace["delay"] < minrtt
    trace.loc[below_baseline, 'delay'] = minrtt

def time_synchronize(trace,propagation_delay_to_target):
    """Shift the trace's timestamps back to align them with the target, in place.

    This synchronisation step is optional. It requires the propagation delay
    to the target, which has to be measured separately (send extra pings and
    take the minimal delay). It is recommended: the authors report roughly a
    20% performance gain in their evaluation.

    Each timestamp is reduced by half of the trace's minimal RTT plus the
    given propagation delay. Both are in milliseconds while the "time"
    column is in seconds, hence the factor 0.001.

    Args:
        trace: DataFrame with "time" (seconds) and "delay" (ms) columns; the
            "time" column is overwritten.
        propagation_delay_to_target: Propagation delay to the target in ms.

    Returns:
        None.
    """
    # The previous version read `minrtt` without ever defining it, which
    # raised NameError. Use the trace's own minimal RTT (the same quantity
    # get_minRTT returns).
    minrtt = trace["delay"].min()
    trace["time"] = trace["time"] - minrtt/2 * 0.001 - propagation_delay_to_target * 0.001

def init_trace(trace):
    """Fill the packet-loss gaps of a ping trace with synthetic samples.

    For every jump larger than one in the "seq" column, one row per missing
    sequence number is inserted. Inserted rows get timestamps spaced 0.1
    after the last sample seen before the gap, ``drop = 1``, and a delay
    equal to the maximum delay in the surrounding window of samples
    (5 before, 4 after).

    Args:
        trace: DataFrame with "time", "seq" and "delay" columns, in send
            order. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, contiguous "seq" values
        and a "drop" column that is 1 for synthetic rows and 0 for observed
        rows.
    """
    def create_time_stamp(start,length):
        return [start+0.1 + i*0.1 for i in range(length)]

    # Work on a re-indexed copy so positions and labels coincide and the
    # caller's frame is left untouched.
    trace = trace.reset_index(drop=True)
    # Observed rows must carry drop = 0. Without this the column is NaN for
    # them (or missing entirely on a loss-free trace), which breaks every
    # later sum over "drop".
    if "drop" in trace.columns:
        trace["drop"] = trace["drop"].fillna(0)
    else:
        trace["drop"] = 0

    seq = trace["seq"]
    gap_positions = trace.index[seq.diff() > 1]
    if len(gap_positions) > 0:
        # Build all pieces in one pass and concatenate once, instead of
        # re-concatenating the whole frame for every single gap.
        pieces = []
        prev = 0
        for pos in gap_positions:
            first_missing = int(seq.iloc[pos - 1]) + 1
            next_seen = int(seq.iloc[pos])
            missing = next_seen - first_missing
            pieces.append(trace.iloc[prev:pos])
            pieces.append(pd.DataFrame(
                {"seq": range(first_missing, next_seen),
                 "delay": [-1.0] * missing,
                 "time": create_time_stamp(trace["time"].iloc[pos - 1], missing),
                 "drop": 1}))
            prev = pos
        pieces.append(trace.iloc[prev:])
        trace = pd.concat(pieces).reset_index(drop=True)

    # Replace the -1 placeholders in order. Earlier placeholders are already
    # filled when later ones are processed, so a long burst still finds a
    # real value inside its window.
    for i in trace.index[trace["delay"] == -1]:
        # Clamp at 0: a negative iloc start would wrap to the end of the column.
        lo = max(i - 5, 0)
        trace.loc[i, 'delay'] = trace["delay"].iloc[lo:i + 5].max()
    return trace

def init_congestion(trace_file_name):
    """Extract the congestion episodes from one ping CSV trace.

    The CSV is read with pandas, passed through init_trace, and its delays
    are clamped with delay_update using the trace's minimal RTT (which is
    also printed). The trace is then scanned sample by sample: a sample is
    "congested" when its sequence number jumps by more than one or when its
    delay exceeds the minimal RTT by more than ``min(0.3 * minrtt, 30)``.
    Consecutive congested samples are merged into one `congestion` object.

    Args:
        trace_file_name: Path of a CSV file with "time", "seq" and "delay"
            columns.

    Returns:
        List of `congestion` objects in trace order. For each, start_time is
        the timestamp of the first congested sample and coef_start_time is
        1.0 earlier.
    """
    congestion_list = []
    trace_raw = pd.read_csv(trace_file_name)
    trace = init_trace(trace_raw)
    seq = trace["seq"]
    # NOTE(review): rtt is taken BEFORE delay_update mutates trace; whether it
    # reflects the clamped delays depends on pandas view/copy semantics - confirm.
    rtt = trace["delay"]
    timestamp = trace["time"]
    minrtt = get_minRTT(trace)
    #maxrtt = get_maxRTT(trace)
    
    #minrtt = 56 # debug
    print(f"minrtt is {minrtt}")
    delay_update(trace,minrtt)

    # Congestion threshold: a delay increase of more than 30% of the minimal
    # RTT, capped at 30. (Most congestion-control literature treats any
    # delay > minimal RTT as congestion; this is deliberately stricter.)
    delay_idt_thre = min([0.3 * minrtt,30])
    idx = 1 
    while idx < len(seq):
        # Loss indicator: a sequence jump > 1 means packets were lost.
        # NOTE(review): init_trace loops until no seq jump > 1 remains, so the
        # loss branches below look unreachable for its output - confirm.
        loss_idt = seq[idx] - seq[idx - 1]
        # Delay indicator: how far this sample sits above the baseline RTT.
        delay_idt = rtt[idx] - minrtt
        # A lost packet or a high delay opens a new congestion episode.
        if loss_idt > 1 or delay_idt > delay_idt_thre:
            c_flag = True
            congestion_start_time = timestamp[idx]
            congestion_sample = []
            if loss_idt > 1:
                # One sample per lost packet, valued at the larger RTT of the
                # two samples surrounding the gap.
                #maxrtt = get_local_max(trace,idx)
                maxrtt = max(rtt[idx],rtt[idx-1])
                #maxrtt = rtt[idx]
                sample = [maxrtt] * (loss_idt - 1)
                congestion_sample += sample
            if delay_idt > delay_idt_thre:
                #congestion_sample.append(delay_idt // delay_idt_thre)
                congestion_sample.append(rtt[idx])
            leng_idx = idx
            # The episode is open: keep absorbing following samples while they
            # are still congested.
            while c_flag:
                idx += 1
                if idx >= len(seq):
                    # NOTE(review): the trace ended inside an episode; the
                    # episode is discarded because nothing is appended here.
                    break
                loss_idt = seq[idx] - seq[idx - 1]
                delay_idt = rtt[idx] - minrtt
                if loss_idt > 1 or delay_idt > delay_idt_thre:
                    c_flag = True
                    if loss_idt > 1:
                        maxrtt = max(rtt[idx],rtt[idx-1])
                        sample = [maxrtt] * (loss_idt - 1)
                        congestion_sample += sample
                    if delay_idt > delay_idt_thre:
                        #congestion_sample.append(delay_idt // delay_idt_thre)
                        congestion_sample.append(rtt[idx])
                    continue
                else:
                    # First non-congested sample: close the episode and record it.
                    c_flag = False
                    congestion_length = len(congestion_sample)
                    # The padded correlation sample starts 1.0 before the episode.
                    coef_start_time = congestion_start_time - 1.0
                    cgt = congestion(congestion_start_time, \
                                     congestion_sample,congestion_length, \
                                    coef_start_time, minrtt)
                    congestion_list.append(cgt)
        idx += 1
    return congestion_list

def search_time_trace(trace,start_time):
    """Find the sample of this trace closest in time to `start_time`.

    `start_time` usually comes from another bot; the matching sample here is
    where the RTT window used for the correlation test begins.

    Args:
        trace: DataFrame with a "time" column.
        start_time: Timestamp to look for.

    Returns:
        The index label of the nearest sample, provided it lies strictly
        within 0.1 of `start_time`; otherwise None.
    """
    offsets = (trace["time"] - start_time).abs()
    close_enough = offsets[offsets < 0.1]
    if close_enough.empty:
        return None
    return close_enough.idxmin()

def prepare_coef_sample(trace,trace_start_time,length):
    """Cut the RTT window of `trace` that lines up with a given start time.

    Args:
        trace: The whole ping trace as a DataFrame with "time" and "delay"
            columns and, optionally, a "drop" column marking rows that stand
            for lost packets. Its "delay" column is clamped in place with
            delay_update. Assumes a default 0..n-1 index (as produced by
            init_trace) - TODO confirm for other callers.
        trace_start_time: Timestamp at which the window should start
            (callers pass a congestion start time minus 1.0).
        length: Number of samples to extract.

    Returns:
        A list of `length` delays. An empty list is returned when no sample
        lies within 0.1 of `trace_start_time`, when the window plus a margin
        of 10 samples on either side does not fit inside the trace, or when
        more than 80% of the window consists of dropped packets.
    """
    if length <= 0:
        return []
    idx_base = search_time_trace(trace,trace_start_time)
    if idx_base is None:
        return []
    if idx_base - 10 < 0 or idx_base + length + 10 > len(trace):
        return []

    minrtt = get_minRTT(trace)
    delay_update(trace,minrtt)

    stop = idx_base + length
    # Read the delays AFTER the clamp so the window reliably contains the
    # clamped values, independent of pandas view/copy behaviour.
    coef_sample = trace["delay"].iloc[idx_base:stop].tolist()

    # Count drops over exactly the rows that were sampled. The earlier loop
    # read drop[idx] after advancing idx (shifted by one row), and NaN or a
    # missing "drop" column made the filter below ineffective or raised.
    if "drop" in trace.columns:
        drop_count = trace["drop"].iloc[idx_base:stop].fillna(0).sum()
    else:
        drop_count = 0
    if drop_count > len(coef_sample)*0.8:
        return []

    # A perfectly flat window has zero variance, which makes the correlation
    # coefficient undefined; nudge the first value slightly to avoid that.
    if all(t == minrtt for t in coef_sample):
        coef_sample[0] -= random.random() / 10
    return coef_sample

def c_group_congestions(C_group_congestion_file,target_name) -> list:
    """Find the congestions that all control-group members observed together.

    The first file provides the reference congestions; every other file is
    loaded as a trace and cut at the same time window. A congestion is kept
    when the largest pairwise correlation distance (1 - Pearson coefficient)
    between all members' samples is in [0, 0.5). The kept congestions are
    also written, one JSON entry per line, to ``<target_name>_cc.json``;
    this file is what gets sent to the unknown bots for judging.

    Memory use grows with the number of control-group members, because all
    their traces are held at once.

    Args:
        C_group_congestion_file: List of CSV trace paths, one per member.
        target_name: Prefix of the output JSON file.

    Returns:
        List of correlated_congestion objects.
    """
    basic_congestion_list = init_congestion(C_group_congestion_file[0])

    # Load the traces of the remaining members (init_trace can be slow).
    tr_list = []
    for idx in range(1, len(C_group_congestion_file)):
        print("Start inititial traces, ",idx)
        tr_list.append(init_trace(pd.read_csv(C_group_congestion_file[idx])))
    print("Init congestion and trace finished")

    c_group_congestion_list = []
    for c in basic_congestion_list:
        # Skip congestions that are too short, and bad ones with no sample
        # (possibly "host unreachable" entries in the data).
        if c.length <= 1 or c.coef_sample == []:
            continue
        samples = [c.coef_sample]
        for tr in tr_list:
            member_sample = prepare_coef_sample(tr, c.start_time -1.0, c.length + 20)
            if member_sample == []:
                # This member has no usable data for the window, e.g. its
                # trace ended before the congestion started.
                break
            samples.append(member_sample)
        else:
            co = np.corrcoef(samples)
            dis = 1 - co
            off_diagonal = ~np.eye(dis.shape[0], dtype=bool)
            max_distance = np.max(dis[np.where(off_diagonal)])
            if 0 <= max_distance < 0.5:
                c_group_congestion_list.append(correlated_congestion(c.start_time,samples))

    file_name = '_'.join([target_name,'cc'])
    with open(file_name+'.json','w') as f:
        for cc in c_group_congestion_list:
            # to_json() already returns a JSON string; dumping it again stores
            # it as a JSON string literal, which get_cgroup_congestion expects.
            json.dump(cc.to_json(),f)
            f.write('\n')

    return c_group_congestion_list

def get_cgroup_congestion(filename):
    """Load the control-group congestions written by c_group_congestions.

    Every line of the file holds a JSON string literal that itself contains
    the JSON object of one correlated_congestion.

    Args:
        filename: Path of the ``*_cc.json`` file.

    Returns:
        List of correlated_congestion objects, in file order.
    """
    with open(filename) as infile:
        return [correlated_congestion.from_json(json.loads(line)) for line in infile]

def bot_judge_congestion(bot_congestion_file,cc_congestion_sample):
    """Count how many control-group congestions this bot's trace also shows.

    For every control-group congestion the bot's RTT window at the same time
    is extracted. It counts as evidence when the largest pairwise correlation
    distance (1 - Pearson coefficient) between the bot's window and all
    member samples is in [0, 0.5). Congestions for which the bot has no
    usable window are skipped.

    Args:
        bot_congestion_file: Path of the bot's ping CSV trace.
        cc_congestion_sample: List of correlated_congestion objects.

    Returns:
        The number of matching congestions. In the CrossPoint CC attack a
        value >= 1 means the bot's path is judged to share the target link.
    """
    trace = init_trace(pd.read_csv(bot_congestion_file))
    print("Trace initialize finished. ")

    Flag_of_true_result = 0
    for cc in cc_congestion_sample:
        members = cc.member_congestion
        bot_sample = prepare_coef_sample(trace, cc.start_time -1.0, len(members[0]))
        if bot_sample == []:
            continue
        samples_all = list(members) + [bot_sample]
        dis = 1 - np.corrcoef(samples_all)
        off_diagonal = ~np.eye(dis.shape[0], dtype=bool)
        max_distance = np.max(dis[np.where(off_diagonal)])
        if 0 <= max_distance < 0.5:
            Flag_of_true_result += 1

    print(f"Find {Flag_of_true_result} evidence of link sharing.")
    return Flag_of_true_result

In [4]:
# Load the control-group congestion samples shipped with the fast example.
cc_samples = get_cgroup_congestion("congestion-example-csv/cgruop-congestion_cc.json")
# In this example we use the raw data from our experiments.
# In a real attack, t1 and t2 would each be computed on an individual bot and would therefore be faster.
# t1 and t2 may take several minutes to run ...
t1 = bot_judge_congestion("congestion-example-csv/pinglog_botpath1.csv",cc_samples)
t2 = bot_judge_congestion("congestion-example-csv/pinglog_botpath2.csv",cc_samples)

Trace initialize finished. 
Find 721 evidence of link sharing.
Trace initialize finished. 
Find 742 evidence of link sharing.
