In [None]:
%%capture
# Install Unsloth and the packages it needs. `%%capture` swallows the cell's
# output, so the pip logs are not shown.
import os
# Environment detection: if any environment variable name contains "COLAB_",
# the notebook is treated as running on Google Colab.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # The first and last installs use --no-deps, so pip does not resolve their
    # dependencies (presumably to leave Colab's preinstalled packages alone —
    # TODO confirm). xformers and trl are pinned to exact versions.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options passed straight to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# Reference list only: nothing in the visible cells reads `fourbit_models`.
# The checkpoint that is actually loaded is the `model_name` given below.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

# Produces the notebook-global `model` and `tokenizer` that the prompt-formatting
# and inference cells further down rely on.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/mistral-7b-v0.3-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [None]:
# Prompt template for the log-interpretation task. It contains exactly one
# positional `{}` placeholder (under "### Input: / Log chunk:") that is filled
# with `alpaca_prompt.format(chunk)`; the rest of the text, including the worked
# example, is passed through unchanged. The template ends with "### Response:\n",
# which is the marker `get_summ` searches for when extracting the model's answer.
# NOTE(review): the instructions pair an opening `<start>` with a closing
# `</end>` tag. That looks unintentional, but anything that parses the model
# output presumably expects this exact pairing — confirm before changing it.
alpaca_prompt = """You are an advanced log analysis assistant. Your task is to analyze the given log chunk and generate a concise interpretation.

Interpretation Task:
Provide a brief interpretation of this log chunk, focusing on critical events (e.g., errors, warnings, performance issues, system malfunctions, potential issues) and their impact on system operations. Keep it concise, avoid redundancy, and exclude irrelevant details. Provide the result in <start></end> tags. If no critical events are found, return: <start>normal</end>.


Example:
Log chunk:
LDAP: Built with OpenLDAP LDAP SDK
LDAP: SSL support unavailable
suEXEC mechanism enabled (wrapper: /usr/sbin/suexec)
Digest: generating secret for digest authentication ...
Digest: done

Interpretation:
<start>LDAP is built with OpenLDAP SDK but lacks SSL support, posing a moderate security risk due to potential unencrypted communication. No other critical issues detected.</end>

Now, analyze the following log chunk and provide its interpretation:

### Input:
Log chunk:
{}

### Response:
"""


# End-of-sequence marker taken from the loaded tokenizer; formatting_prompts_func
# appends it to every formatted prompt.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Turn a batch of log chunks into prompt texts.

    Args:
        examples: Mapping whose "Chunk" entry is an iterable of log chunks
            (presumably a batched dataset slice — the call site is not in
            the visible cells).

    Returns:
        A dict with a single "text" key holding one string per chunk: the
        prompt template filled with that chunk, followed by EOS_TOKEN.
    """
    # The trailing EOS_TOKEN marks where each example ends; without it
    # generation has no stop signal.
    return {
        "text": [
            alpaca_prompt.format(chunk) + EOS_TOKEN
            for chunk in examples["Chunk"]
        ],
    }


# Inference

In [None]:
import re
# Put the loaded model into Unsloth's inference mode before get_response()
# calls model.generate().
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

def get_summ(response):
    """Return everything that follows the first "### Response:" line.

    Args:
        response: Decoded model output (prompt plus generated text).

    Returns:
        The text after the first "### Response:\\n" marker, newlines included,
        or None when the marker does not occur in `response`.
    """
    # The lookbehind anchors the match right after the marker; DOTALL lets
    # `.*` run across line breaks to the end of the string.
    found = re.search(r"(?<=### Response:\n)(.*)", response, flags=re.DOTALL)
    return found.group(1) if found is not None else None


def get_response(input):
    """Generate the model's interpretation for a single log chunk.

    The chunk is inserted into `alpaca_prompt`, tokenized, moved to the GPU
    and passed to `model.generate` (up to 400 new tokens). The decoded text is
    printed and the part after the "### Response:" marker is returned.

    Args:
        input: The log chunk to analyse. (The name shadows the builtin but is
            kept so existing keyword callers continue to work.)

    Returns:
        Whatever `get_summ` extracts from the decoded output: the text after
        the response marker, or None if the marker is missing.
    """
    # The template has exactly one `{}` slot, so only the chunk is supplied;
    # the former second "" argument was silently ignored by str.format.
    prompt = alpaca_prompt.format(input)
    inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=400, use_cache=True)

    # Decode without special tokens: otherwise the BOS/EOS markers stay in the
    # text and the EOS marker ends up at the tail of the stored summary.
    full_response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    print(f"Full response: {full_response}" )

    return get_summ(full_response[0])


In [None]:
import pandas as pd
import os
import time  # Import the time module


# File locations used by process_file(). All three live under /content
# (presumably the Colab working directory — adjust when running elsewhere).
# The input CSV must have a "Chunk" column; the output is the input plus an
# "Apache_base_f1" column holding the generated summaries.
input_file = "/content/test.csv"
output_file = "/content/test_base.csv"
# Intermediate save written every 100 rows and deleted once the run completes.
checkpoint_file = "/content/test_base_checkpoint.csv"  # Temp checkpoint file

def _load_progress(df, checkpoint_file, result_column):
    """Return ``(working_frame, processed_count)`` for a fresh or resumed run."""
    if not os.path.exists(checkpoint_file):
        working = df.copy()
        working[result_column] = ""  # Initialize column
        return working, 0  # Start from the beginning

    working = pd.read_csv(checkpoint_file)
    if len(working) != len(df):
        raise ValueError(
            f"Checkpoint has {len(working)} rows but the input has {len(df)}; "
            "delete the stale checkpoint file to start over."
        )

    # read_csv parses the empty placeholder cells as NaN, and `NaN != ""` is
    # True, so comparing against "" would mark every row as already done.
    # Resume right after the last row that actually holds a value instead;
    # this also tolerates gaps left by summaries that were None.
    results = working[result_column]
    last_done = results.last_valid_index()
    processed_count = 0 if last_done is None else int(last_done) + 1

    # A column with no values at all is parsed as float64; force object dtype
    # so string summaries can be written into it.
    working[result_column] = results.astype(object)

    print(f"Resuming from checkpoint... {processed_count} rows already processed.")
    return working, processed_count


def process_file(input_file, output_file, checkpoint_file,
                 result_column="Apache_base_f1", checkpoint_every=100):
    """Summarise every log chunk of a CSV file, with resumable checkpoints.

    Reads `input_file`, calls `get_response` on each row's "Chunk" value and
    stores the result in `result_column`. Progress is written to
    `checkpoint_file` every `checkpoint_every` rows; if that file already
    exists, processing continues after the last row that has a stored result.
    When all rows are done the frame is written to `output_file` and the
    checkpoint is removed.

    Args:
        input_file: Path of the CSV to read; must contain a "Chunk" column.
        output_file: Path of the CSV that receives the final results.
        checkpoint_file: Path of the intermediate CSV used for resuming.
        result_column: Name of the column that stores the summaries.
        checkpoint_every: Number of rows between checkpoint saves (>= 1).

    Raises:
        ValueError: If "Chunk" is missing, `checkpoint_every` is below 1, or
            an existing checkpoint has a different row count than the input.
    """
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be at least 1.")

    df = pd.read_csv(input_file)

    # Ensure required columns exist
    if 'Chunk' not in df.columns:
        raise ValueError("The input CSV must contain 'Chunk'.")

    df_checkpoint, processed_count = _load_progress(df, checkpoint_file, result_column)

    counter = processed_count
    start_time = time.time()  # Timer covers this session only, not earlier runs

    for index, row in df.iloc[processed_count:].iterrows():  # Resume from last processed row
        counter += 1
        log_entry = row['Chunk']

        print(f"Counter: {counter}")
        print(f"Log Chunk: {log_entry}")

        # Best effort by design: a failing row is recorded as "ERROR" so one
        # bad chunk does not abort a long batch.
        try:
            summary = get_response(log_entry)
        except Exception as e:
            print(f"Error processing log entry: {e}")
            summary = "ERROR"

        print(f"Extracted Summary: {summary}")
        df_checkpoint.at[index, result_column] = summary  # Store result
        print("--------------------------------------------------------\n")

        # Save progress and print cumulative execution time periodically
        if counter % checkpoint_every == 0:
            df_checkpoint.to_csv(checkpoint_file, index=False)
            elapsed = time.time() - start_time
            mins, secs = divmod(elapsed, 60)
            print(f"Checkpoint saved at row {counter}")
            print(f"Cumulative execution time: {int(mins)} min {int(secs)} sec")

    # Final save after all processing
    df_checkpoint.to_csv(output_file, index=False)
    print("Final results saved!")

    # Remove checkpoint file after completion
    if os.path.exists(checkpoint_file):
        os.remove(checkpoint_file)

    # Total execution time
    total_time = time.time() - start_time
    mins, secs = divmod(total_time, 60)
    print(f"\n Total execution time: {int(mins)} min {int(secs)} sec")

# Run the batch: read input_file, write the summaries to output_file, and use
# checkpoint_file for the intermediate saves.
process_file(input_file, output_file, checkpoint_file)
