# Create the labeled data used as input to **Graphormer**

## Merge the **triplets and their labels** into a txt file

In [None]:
%mkdir data

In [1]:
def _normalize_label(raw_label):
    """Map one raw label token to the string stored in the dataset.

    * ``'benign'`` becomes ``'0'``.
    * ``'T...'`` containing a dot (e.g. ``T1003.001``) becomes the text after
      the leading ``T``.
    * ``'T<integer>'`` becomes the integer's decimal string.
    * Anything else is returned unchanged.
    """
    if raw_label == 'benign':
        return '0'
    if not raw_label.startswith('T'):
        return raw_label

    technique_id = raw_label[1:]
    if '.' in technique_id:
        # Kept as text so the digits after the dot are preserved exactly.
        return technique_id
    try:
        return str(int(technique_id))
    except ValueError:
        # Neither an integer nor a dotted value: keep the original token.
        return raw_label


def process_second_file(file_path):
    """Read the label file and return the normalized labels as strings.

    The first line of the file is skipped (it holds the total number of
    triplets). For every following line, the second whitespace-separated
    column is taken as the label and normalized by ``_normalize_label``.
    Lines with fewer than two columns (for example blank lines) are skipped
    instead of re-using the label of the previous line.

    Args:
        file_path: Path of the label file.

    Returns:
        A list of label strings, one per line that carried a label. The list
        is empty when the file is empty or holds only the count line.
    """
    labels = []

    with open(file_path, 'r') as file:
        # Skip the count line; the default keeps an empty file from raising
        # StopIteration.
        next(file, None)

        for line in file:
            columns = line.strip().split()
            if len(columns) < 2:
                continue
            labels.append(_normalize_label(columns[1]))

    return labels

def merge_files_txt(first_file_path, label_list, output_txt_file):
    """Write the triplets and their labels as one comma-separated text file.

    The output starts with the header ``src,dest,rel,label``. The first line
    of the triplet file is skipped; every following line that splits into
    exactly three whitespace-separated columns is written with its columns in
    file order, followed by the label found at the same line position in
    ``label_list``. Lines with any other column count are not written but
    still advance the position. When the position is beyond the end of
    ``label_list`` the label is left empty.

    Args:
        first_file_path: Path of the triplet file.
        label_list: Labels, indexed by line position after the first line.
        output_txt_file: Path of the file to create or overwrite.

    Raises:
        ValueError: If ``label_list`` is empty.
    """
    label_count = len(label_list)
    if label_count == 0:
        raise ValueError("Label list is empty!")

    with open(output_txt_file, 'w') as sink:
        sink.write("src,dest,rel,label\n")

        with open(first_file_path, 'r') as source:
            next(source)  # the first line is not a triplet

            for position, raw_line in enumerate(source):
                fields = raw_line.strip().split()
                if len(fields) != 3:
                    continue

                # The columns are emitted in the order they appear in the
                # input, under the header written above.
                first, second, third = fields
                if position < label_count:
                    label = label_list[position]
                else:
                    label = ''
                sink.write(f"{first},{second},{third},{label}\n")


# Earlier absolute paths, kept for reference (presumably from a hosted
# notebook run; confirm before re-enabling):
# first_file_path = '/content/train2id.txt'
# second_file_path = '/content/train2id_label.txt'
# Triplet file and its label file. Both readers below skip the first line of
# their input.
first_file_path = './data_euni/train2id.txt'
second_file_path = './data_euni/train2id_label.txt'

# Parse the label file into a list of label strings.
label_list = process_second_file(second_file_path)

# Write the merged comma-separated file (relative path, so it lands in the
# current working directory).
output_txt_file = 'labeled_data.txt'
merge_files_txt(first_file_path, label_list, output_txt_file)

## Add the **num_nodes, node_feat and edge_attr** columns to the txt file

In [2]:
def add_column_to_txt(file_path, output_file_path):
    """Copy a comma-separated text file, appending the graph columns.

    The first line gets ``,num_nodes,node_feat,edge_attr`` appended and every
    other line gets ``,3,0,0,0,0,0`` appended, after trailing whitespace has
    been stripped from the line. Note that the header gains three names while
    each data line gains six values.

    Args:
        file_path: Path of the text file to read.
        output_file_path: Path of the file to create or overwrite.
    """
    # The whole input is read before the output is opened.
    with open(file_path, 'r') as source:
        rows = source.readlines()

    with open(output_file_path, 'w') as sink:
        # Header line: new column names go after the last existing column.
        sink.write(rows[0].rstrip() + ",num_nodes,node_feat,edge_attr\n")
        # Data lines: the same constant values go after the last existing column.
        sink.writelines(row.rstrip() + ",3,0,0,0,0,0\n" for row in rows[1:])


# Earlier absolute paths, kept for reference:
# input_file_path = '/content/data/labeled_data.txt'
# output_file_path = '/content/data/labeled_data_final.txt'
# Read the merged text file and write the extended copy; both paths are
# relative to the current working directory.
input_file_path = 'labeled_data.txt'
output_file_path = 'labeled_data_final.txt'

add_column_to_txt(input_file_path, output_file_path)

## Convert the txt file to a **JSONL** file

In [3]:
import json

def convert_to_jsonl(file_path, output_file_path):
    """Convert a comma-separated text file with a header into a JSONL file.

    Each data line becomes one JSON object. Values are paired with the header
    names by position (values beyond the last header name are ignored), then:

    * ``src``, ``rel`` and ``dest`` are removed and replaced by
      ``edge_index = [[src, rel], [rel, dest]]``;
    * ``node_feat`` is set to ``[[0], [0], [0]]`` and ``edge_attr`` to
      ``[[0], [0]]``;
    * ``label`` is wrapped in a one-element list.

    All values read from the file stay strings.

    Args:
        file_path: Path of the comma-separated text file.
        output_file_path: Path of the JSONL file to create or overwrite.
    """
    with open(file_path, 'r') as source:
        rows = source.readlines()
    # Header line: column names separated by commas.
    header = rows[0].rstrip().split(',')

    # Every record is serialized before the output file is opened.
    records = []
    for row in rows[1:]:
        cells = row.rstrip().split(',')
        record = {name: cells[pos] for pos, name in enumerate(header)}

        # Pull the three triplet columns out of the record.
        src = record.pop('src')
        rel = record.pop('rel')
        dest = record.pop('dest')

        record.update(
            edge_index=[[src, rel], [rel, dest]],
            node_feat=[[0], [0], [0]],
            edge_attr=[[0], [0]],
        )
        record['label'] = [record['label']]  # label is stored as a list

        records.append(json.dumps(record))

    with open(output_file_path, 'w') as sink:
        sink.writelines(record + '\n' for record in records)

# Set the input text file path and the output JSONL file path.
# Earlier absolute paths, kept for reference:
# input_file_path = '/content/data/labeled_data_final.txt'
# output_file_path = '/content/data/labeled_data_final.jsonl'
input_file_path = 'labeled_data_final.txt'
output_file_path = 'labeled_data_final.jsonl'

# Call the function to convert the text file to a JSONL file.
convert_to_jsonl(input_file_path, output_file_path)


# TEST version

In [None]:
import json

def process_file(file_path, second_file_path):
    """Build JSON strings from a comma-separated file and a label file.

    Labels come from the second whitespace-separated column of every line of
    ``second_file_path`` after the first line (lines with fewer than two
    columns are skipped): ``'benign'`` becomes ``0``; a value starting with
    ``'T'`` becomes a ``float`` when it contains a dot, otherwise an ``int``;
    any other value stays a string.

    ``file_path`` is read as comma-separated text whose first line is the
    header. Each data line becomes a JSON object in which ``src``, ``rel``
    and ``dest`` are replaced by ``edge_index = [[src, rel], [rel, dest]]``,
    ``node_feat`` and ``edge_feat`` are set to constant zero lists, and
    ``label`` is a one-element list holding the label at the same position.

    Args:
        file_path: Path of the comma-separated file with a header line.
        second_file_path: Path of the label file.

    Returns:
        A list of JSON strings, one per data line of ``file_path``.
    """
    # Collect the labels first, in file order.
    labels = []
    with open(second_file_path, 'r') as label_source:
        next(label_source)  # the first line is the total number of triplets

        for raw_line in label_source:
            parts = raw_line.strip().split()
            if len(parts) < 2:
                continue

            token = parts[1]
            if token == 'benign':
                token = 0
            elif token.startswith('T'):
                digits = token[1:]
                token = float(digits) if '.' in token else int(digits)
            labels.append(token)

    with open(file_path, 'r') as source:
        rows = source.readlines()
    # Header line: column names separated by commas.
    header = rows[0].rstrip().split(',')

    records = []
    for position, row in enumerate(rows[1:]):
        cells = row.rstrip().split(',')
        record = {name: cells[pos] for pos, name in enumerate(header)}

        # Pull the three triplet columns out of the record.
        src = record.pop('src')
        rel = record.pop('rel')
        dest = record.pop('dest')

        # NOTE(review): the sibling converter in this file writes the edge
        # features under 'edge_attr'; confirm which key the consumer expects.
        record.update(
            edge_index=[[src, rel], [rel, dest]],
            node_feat=[[0], [0], [0]],
            edge_feat=[[0], [0]],
        )
        record['label'] = [labels[position]]  # label is stored as a list

        records.append(json.dumps(record))

    return records

def write_jsonl(data_list, output_file_path):
    """Write each string of ``data_list`` as one line of a new JSONL file.

    Args:
        data_list: Iterable of JSON strings (without trailing newlines).
        output_file_path: Path of the file to create or overwrite.
    """
    with open(output_file_path, 'w') as sink:
        # One JSON string per line.
        sink.writelines(entry + '\n' for entry in data_list)

# Set the input text file paths and the output JSONL file path.
# NOTE(review): process_file splits the first line of input_file_path on
# commas and then reads the 'src', 'rel' and 'dest' columns, whereas the first
# cell of this notebook parses a train2id.txt by whitespace after skipping its
# first line. Confirm this path points to a comma-separated file with that
# header before running.
input_file_path = '/content/train2id.txt'
second_file_path = '/content/train2id_label.txt'
output_file_path = '/content/data/labeled_data_final.jsonl'

# Build the JSON strings, then write them out one per line.
jsonl_data = process_file(input_file_path, second_file_path)
write_jsonl(jsonl_data, output_file_path)
