# In [None]:
import sys, pickle, os, json, re, time, random, logging, pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, scipy, sklearn, networkx as nx, importlib; sys.path.append('./Dropbox'); import tools; importlib.reload(tools)

# Build the single-message chat prompt that asks for a multi-hop threat summary
def prompt_maker(text, threatname):
    """Return a one-message chat prompt for summarizing a threat.

    The message content is ``text`` followed directly (no separator is
    inserted) by a fixed lead-in sentence, ``threatname``, and a fixed list
    of step-by-step instructions.

    Args:
        text: Knowledge graph triples; placed at the very start of the content.
        threatname: Threat name inserted into the lead-in sentence.

    Returns:
        A list holding one dict with the keys ``"role"`` (always ``"user"``)
        and ``"content"`` (the assembled prompt string).
    """
    lead_in = """Based on the above knowledge graph triples, summarize the features of """
    # The leading spaces on the continuation lines below are part of the
    # string itself, so they must stay exactly as they are.
    instructions = """ using the following steps:
                1. Summarize the one-hop features of the threat, such as ['threat, Y, Z'].
                2. Then proceed to two-hop features, such as ['threat, Y, Z'] and [Z (or equivalent/related/similar to Z or in an inclusion relationship with Z), M, N].
                3. Continue to three-hop features, and so on, up to five hops.
                4. Format the summaries for each hop as follows:
                   * X-hop, Index Y: Original triple connections (e.g., [X, Y, Z]...[Z, M, N]) --> Summarized feature.
                5. Define the characteristics or behaviors of the threat:
                   - Identify specific attributes: including exploitation methods, targeted vulnerabilities, affected systems, attack vectors, payload delivery mechanisms, persistence techniques, and any unique signatures or patterns.
                6. Formatting requirements:
                   - Multi-hop summaries: Present multi-hop summaries clearly, with the format:
                     * X-hop, Serial Number Y: Original triple connections (e.g., [X, Y, Z]...[Z, M, N]) --> Summarized feature.
                   - Comprehensiveness: Ensure the final summary is cohesive and provides a complete picture of the threat without unnecessary repetition.
                7. Inference steps:
                   - Step 1:
                     * Edge Identification: Identify each edge in the graph that discusses the characteristic or behavior of the threat.
                     * Summarization: Summarize the characteristic or behavior represented by each identified edge.
                   - Step 2:
                     * Intermediate Node Construction: Use an additional intermediate node to construct sets with two connected edges that discuss the characteristics or behaviors of the threat.
                     * Set Summarization: Summarize the characteristic or behavior represented by each identified set.
                   - Step 3:
                     * Increase Nodes: Add more intermediate nodes as needed, up to four, to identify sets with more connected edges.
                     * Comprehensive Summarization: Summarize the characteristic or behavior represented by each identified set.
                8. Final step:
                   - Comprehensive Summary: Based on the summarized features and behaviors from the previous steps, write a detailed description of the threat.
                Please follow these rules and complete all steps before stopping output."""

    user_message = {}
    user_message["role"] = "user"
    user_message["content"] = text + lead_in + threatname + instructions
    return [user_message]

# Build one prompt per row of final_df, submit them as a batch job, and store
# the downloaded answers in final_df['summary'].
# final_df and prompt_maker must already be defined at this point.

# Knowledge-graph text and threat name for every row, in row order
all_text = final_df['result'].tolist()
all_threat_names = final_df['threat True Name'].tolist()

# One single-message prompt per (text, threat name) pair
prompts = [prompt_maker(text, threatname) for text, threatname in zip(all_text, all_threat_names)]
code_name = 'threat_summary'
jsonl_file = tools.create_jsonl(prompts, model='gpt4', temp=0.7, token=4*4096, jsonlname=code_name, possible_output=300)
# Upload JSONL file and run batch processing
ids = tools.upload_RUN_PAY_jsonl(jsonl_file, code_name)

# Look the batch id up in `ids`; previously the quoted text 'ids[batch_id]'
# itself was passed, which cannot identify a real batch.
# NOTE(review): assumes `ids` is a mapping with a 'batch_id' key - confirm
# against tools.upload_RUN_PAY_jsonl.
ans = tools.auto_down_ans(ids['batch_id'])
# NOTE(review): presumably `ans` has one entry per row of final_df, in the
# same order as `prompts` - verify against tools.auto_down_ans.
final_df['summary'] = ans

# Print every threat's GPT summaries as plain text, grouped by threat and then
# by mode ('raw' first, then 'de-duplicate_order').
# final_df must already exist with the columns 'mode', 'threat True Name',
# 'source' and a summary column (see summary_col below).

# Column holding the model output. 'GPT summary' is preferred (the name this
# report has always read); when it is absent, fall back to 'summary' so the
# report does not stop with a KeyError on a frame that only carries 'summary'.
summary_col = 'GPT summary' if 'GPT summary' in final_df.columns else 'summary'


def _summary_lines(rows, threat_name, mode_label):
    """Return one formatted report line per row of ``rows`` for ``mode_label``."""
    # Only two columns are needed, so zip them instead of building a Series
    # for every row with iterrows().
    return [
        f"threat name: {threat_name} Source: {source} mode: {mode_label} GPT Summary: {summary}"
        for source, summary in zip(rows['source'], rows[summary_col])
    ]


# Split the frame once by mode
raw_mode_df = final_df[final_df['mode'] == 'raw']
dedup_mode_df = final_df[final_df['mode'] == 'de-duplicate_order']

# Lines of the report, in output order
formatted_output = []

# Threats in order of first appearance
unique_threat_names = final_df['threat True Name'].unique()

for threat_name in unique_threat_names:
    # Every threat section opens with a separator
    formatted_output.append("========")

    raw_rows = raw_mode_df[raw_mode_df['threat True Name'] == threat_name]
    formatted_output.extend(_summary_lines(raw_rows, threat_name, 'raw'))

    # The de-duplicated section (and its separator) appears only when it has rows
    dedup_rows = dedup_mode_df[dedup_mode_df['threat True Name'] == threat_name]
    if not dedup_rows.empty:
        formatted_output.append("-----------------")
        formatted_output.extend(_summary_lines(dedup_rows, threat_name, 'de-duplicate_order'))

# Display the formatted results
for line in formatted_output:
    print(line)
