Chunking

In [None]:
import csv

def chunk_logs(input_file, output_file, chunk_size=5):
    """Group the data rows of a CSV log file into fixed-size chunks.

    The first row of ``input_file`` is treated as a header and skipped. The
    remaining rows are grouped, in order, into lists of ``chunk_size`` rows;
    a shorter final chunk is kept when the row count is not a multiple of
    ``chunk_size``.

    Each chunk is also written to ``output_file`` as one row of a
    single-column CSV (header ``Chunk``). Inside that cell the fields of a
    log row are joined with ``', '`` and the rows of a chunk with ``'; '``.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write (overwritten if present).
        chunk_size: Number of log rows per chunk; must be at least 1.

    Returns:
        A list of chunks, each chunk being a list of rows (lists of strings).
        An input with no data rows yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A non-positive size would never be reached by len(chunk) and would
    # silently put every row into one chunk, so reject it up front.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be at least 1, got {chunk_size!r}")

    log_chunks = []

    with open(input_file, 'r', newline='') as csv_in:
        reader = csv.reader(csv_in)
        # Skip the header row; the default keeps a completely empty file
        # from raising StopIteration.
        next(reader, None)

        chunk = []
        for row in reader:
            chunk.append(row)
            if len(chunk) >= chunk_size:
                log_chunks.append(chunk)
                chunk = []

        # Keep the trailing rows that did not fill a whole chunk
        if chunk:
            log_chunks.append(chunk)

    # Write one output row per chunk
    with open(output_file, 'w', newline='') as csv_out:
        writer = csv.writer(csv_out)
        writer.writerow(['Chunk'])
        for chunk in log_chunks:
            chunk_str = '; '.join(', '.join(event) for event in chunk)
            writer.writerow([chunk_str])

    return log_chunks

# Example run: chunk the full BGL log using the default chunk_size (5 rows)
input_csv, output_csv = '/content/BGL_full.csv', '/content/BGL_5.csv'
log_chunks = chunk_logs(input_file=input_csv, output_file=output_csv)


Unique chunk selection

In [None]:
import pandas as pd

# Source CSV and destination for the de-duplicated rows
# (replace the paths with your actual file locations)
file_path = "/content/BGL_full.csv"
output_path = "/content/BGL_full_no_duplicate.csv"

df = pd.read_csv(file_path)

# Keep only the first occurrence of each distinct 'Content' value;
# no limit is applied to how many unique values are kept.
unique_chunks = df['Content'].drop_duplicates(keep='first')

# Wrap the surviving values in a single-column DataFrame and write it out
unique_df = pd.DataFrame({'Content': unique_chunks})
unique_df.to_csv(output_path, index=False)

print(f"Saved unique log chunks to: {output_path}")


Saved unique log chunks to: /content/BGL_full_no_duplicate.csv


Train/test selection

In [None]:
import pandas as pd

# Load your CSV file
df = pd.read_csv("/content/BGL_5_no_duplicate.csv")  # Replace with the path to your CSV

# Size of the held-out test set and where each split is written
test_size = 1000
test_path = "BGL_test.csv"
train_path = "BGL_train.csv"

# Fail with a clear message when the file is too small to sample from
if len(df) < test_size:
    raise ValueError(
        f"Cannot sample {test_size} test rows from a file with only {len(df)} rows."
    )

# Randomly sample the test rows (fixed seed for reproducibility)
test_df = df.sample(n=test_size, random_state=42)

# Every row not chosen for the test set goes to the train set
train_df = df.drop(test_df.index)

# Save to new CSV files
test_df.to_csv(test_path, index=False)
train_df.to_csv(train_path, index=False)

# Report the real counts and file names rather than hard-coded ones
print(f"{len(test_df)} rows saved to {test_path}, remaining {len(train_df)} rows saved to {train_path}")


Train Shrink

In [None]:
import pandas as pd

# Load the training data
df = pd.read_csv("BGL_train.csv")

# Number of chunks to keep and where the reduced training set is written
sample_size = 2000
sampled_path = "BGL_train_2k.csv"

# Fail with a clear message when there are fewer rows than requested
if len(df) < sample_size:
    raise ValueError(
        f"Cannot sample {sample_size} chunks from a file with only {len(df)} rows."
    )

# Randomly select the chunks without replacement
sampled_df = df.sample(n=sample_size, random_state=42)  # You can change the seed for different random samples

# Save the selected chunks to a new CSV file
sampled_df.to_csv(sampled_path, index=False)

# Report the real count and file name rather than hard-coded ones
print(f"{len(sampled_df)} random chunks saved to {sampled_path}")


# Ground Truth Generation for Each Level of Chunking

In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.87.0
    Uninstalling openai-1.87.0:
      Successfully uninstalled openai-1.87.0
Successfully installed openai-0.28.0


Label Generation

In [None]:
import csv
import openai
import pandas as pd
import time

# Take the OpenAI API key from the OPENAI_API_KEY environment variable so the
# secret does not have to be stored in the notebook; "xxx" stays as the
# placeholder fallback when the variable is not set.
import os
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx")
def get_summary(chunk_text):
    """Ask GPT-4o for the summary, definitions and interpretation of a log chunk.

    Sends a fixed system prompt plus a user prompt (task description, one
    worked example, and ``chunk_text``) to ``openai.ChatCompletion.create``
    with ``max_tokens=700`` and ``temperature=0.0``.

    Args:
        chunk_text: The log chunk to analyze; interpolated into the prompt.

    Returns:
        The content of the first returned choice, stripped of surrounding
        whitespace.
    """
    system_prompt = "You are an advanced summarization and log analysis assistant. Your task is to analyze the given log chunk and generate a summary including the critical log events, then provide definitions for important concepts and an interpretation for the log events in summary."

    # The prompt text is flush-left on purpose: it is sent verbatim to the model.
    user_prompt = f"""
Task:

Analyze the log chunk step-by-step and provide summary, definitions, and interpretation:

1) Summary: Extract and return only critical log events indicating errors, failures, system malfunctions, or potential issues. Exclude routine events.

2) Definitions: For every concept or term relevant to understanding the critical events or resolving them, provide clear and concise definitions. Prioritize terms that are technical, operational, or diagnostic in nature (e.g., protocols, subsystems, mechanisms).

3) Interpretation: Provide a brief interpretation, focusing on critical events present in summary and their impact on system operations. Keep it concise, avoid redundancy, and exclude irrelevant details.

If no critical events are found, return: <start>normal</end>.

Example:

Log chunk:
LDAP: Built with OpenLDAP LDAP SDK;LDAP: SSL support unavailable;suEXEC mechanism enabled (wrapper: /usr/sbin/suexec);Digest: generating secret for digest authentication ...Digest: done


<start><Summary>LDAP: SSL support unavailable</Summary><Definitions>- LDAP (Lightweight Directory Access Protocol): A protocol used for accessing and managing distributed directory information services over a network.- OpenLDAP: An open-source implementation of the LDAP protocol, providing directory services.- SSL (Secure Sockets Layer): A cryptographic protocol used to provide secure communication over a network.- SSL support unavailable: Indicates that encrypted communication via SSL is not configured or not supported, potentially exposing sensitive data.- suEXEC: A feature of Apache HTTP Server that allows users to run CGI and SSI scripts under user IDs different from the web server user ID.- Digest authentication: A method of HTTP authentication that uses hashing to secure credentials over a network.</Definitions><Interpretation>LDAP communications are unencrypted, exposing sensitive data (e.g., user credentials) to interception, especially on untrusted networks. This could lead to security breaches, undermining system integrity.</Interpretation></end>

Now, analyze the following log chunk and provide its summary, definitions, and interpretation:
{chunk_text}
"""

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-4o",
        messages=conversation,
        max_tokens=700,
        temperature=0.0,
    )
    return completion.choices[0].message['content'].strip()

import os  # used only in this cell, to detect an existing checkpoint file

# Input chunks, final output, and the checkpoint written while labelling
input_file = '/content/Linux_train_2k.csv'
output_file = '/content/Linux_train_f4.csv'
checkpoint_file = '/content/Linux_train_f4_checkpoint.csv'

# Tunables for the labelling loop
checkpoint_every = 100      # save a checkpoint after this many handled rows
max_attempts = 3            # API attempts per chunk before giving up on it
retry_delay_seconds = 5     # pause between attempts for the same chunk

# Read the file into a DataFrame
df = pd.read_csv(input_file)

# The chunk text must be present
if 'Chunk' not in df.columns:
    raise ValueError("Expected column 'Chunk' not found in CSV file.")

# Restore earlier summaries from the checkpoint, but only when it was written
# for exactly the same chunks (same length, identical 'Chunk' column).
if os.path.exists(checkpoint_file):
    checkpoint_df = pd.read_csv(checkpoint_file)
    checkpoint_matches = (
        {'Chunk', 'GT'} <= set(checkpoint_df.columns)
        and len(checkpoint_df) == len(df)
        and checkpoint_df['Chunk'].equals(df['Chunk'])
    )
    if checkpoint_matches:
        df['GT'] = checkpoint_df['GT']
        print(f"Loaded existing summaries from {checkpoint_file}")

# If GT column already exists, keep the old ones; otherwise create it
if 'GT' not in df.columns:
    df['GT'] = ""

# read_csv loads empty cells as NaN, and NaN != "" is True, so normalise to
# strings before deciding which rows still need a summary.
df['GT'] = df['GT'].fillna("").astype(str)
pending_indexes = df.index[df['GT'].str.strip() == ""].tolist()

if len(pending_indexes) < len(df):
    print(f"Resuming: {len(df) - len(pending_indexes)} of {len(df)} chunks already have a summary")

# Process every chunk that has no summary yet (including earlier failures)
failed_indexes = []
for handled, index in enumerate(pending_indexes, start=1):
    chunk_text = df.at[index, 'Chunk']
    print(f"\nProcessing index {index}:\n{chunk_text}\n")

    for attempt in range(1, max_attempts + 1):
        try:
            summary = get_summary(chunk_text)
        except Exception as e:
            # Best-effort labelling: report the error and retry, so one bad
            # request does not abort the whole run.
            print(f"Error at index {index} (attempt {attempt}/{max_attempts}): {e}")
            if attempt < max_attempts:
                print(f"Waiting for {retry_delay_seconds} seconds before retrying...")
                time.sleep(retry_delay_seconds)
        else:
            df.at[index, 'GT'] = summary  # Store the summary in the 'GT' column
            print(f"Summary:\n{summary}")
            print(f"------------------------index= {index} -------------------------------")
            break
    else:
        # All attempts failed; GT stays empty so a later run picks the row up
        failed_indexes.append(index)

    # Save a checkpoint periodically, whether or not this row succeeded
    if handled % checkpoint_every == 0:
        df.to_csv(checkpoint_file, index=False)
        print(f"Checkpoint saved at index {index}.")

# Final save
df.to_csv(output_file, index=False)
print(f"Final summaries saved to {output_file}")
if failed_indexes:
    print(f"{len(failed_indexes)} chunks could not be summarized: {failed_indexes}")


**Post processing (remove newlines and Summary:)**

In [None]:
import csv

def _strip_newlines_and_summary(text):
    """Return ``text`` on one line, without 'Summary:' and outer whitespace."""
    # Drop a line break that directly follows '</end>' first; once every
    # newline has been turned into a space this pattern can no longer match.
    text = text.replace('</end>\r\n', '</end>').replace('</end>\n', '</end>')
    text = text.replace('\n', ' ').replace('\r', ' ')
    return text.replace('Summary:', '').strip()


def remove_newlines_and_summary_in_column(input_file, output_file, column_name):
    """Copy a CSV file while cleaning the text of one column.

    In ``column_name`` a line break directly after ``</end>`` is removed,
    every other newline or carriage return becomes a space, the literal text
    ``Summary:`` is removed, and surrounding whitespace is stripped. All
    other columns are copied unchanged. Rows that do not contain the column,
    or that are too short to have a value for it, are written as they are.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to write (overwritten).
        column_name: Header name of the column to clean.
    """
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
            open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.DictReader(infile)

        # An input without a header row has nothing to copy; the output file
        # is left empty instead of failing in writeheader().
        fieldnames = reader.fieldnames
        if fieldnames is None:
            return

        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            value = row.get(column_name)
            # DictReader fills missing trailing fields with None
            if value is not None:
                row[column_name] = _strip_newlines_and_summary(value)
            writer.writerow(row)

# Labelled CSV to read, cleaned copy to write, and the column to clean
input_csv = '/content/Linux_train_f4.csv'   # Replace with your input CSV file path
output_csv = '/content/Linux_train_f4_cleaned.csv' # Replace with your desired output CSV file path
column_to_clean = 'GT'

# Run the clean-up and report where the result was written
remove_newlines_and_summary_in_column(
    input_file=input_csv,
    output_file=output_csv,
    column_name=column_to_clean,
)

print(f"Newlines and 'Summary:' removed in column '{column_to_clean}' and saved to {output_csv}.")


Newlines and 'Summary:' removed in column 'GT' and saved to /content/Linux_train_f4_cleaned.csv.
