In [None]:
!pip install transformers torch bitsandbytes pandas


In [None]:

from dotenv import SECRET_HF
secret_hf = SECRET_HF
!huggingface-cli login --token $secret_hf

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import bitsandbytes as bnb
import pandas as pd


# Hugging Face checkpoint shared by the tokenizer and the model weights
model_name = "mistralai/Mistral-7B-v0.1"

# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer and quantized causal LM both come from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

In [None]:
import re

# Function to process chunks of text
def process_chunk(prompt, model, tokenizer, max_len, do_sample=False, temperature=0.7):
    """Generate a continuation of ``prompt`` and return the decoded text.

    Args:
        prompt: Text to feed to the model.
        model: Object exposing ``device`` and ``generate`` (e.g. a causal LM).
        tokenizer: Callable tokenizer that also exposes ``decode``.
        max_len: Passed to ``generate`` as ``max_length``, i.e. the limit on
            the total sequence length in tokens (prompt plus generated tokens).
        do_sample: When False (default) decoding is greedy, which is what the
            previous implementation effectively did because ``temperature``
            was supplied without enabling sampling.
        temperature: Sampling temperature; only forwarded when ``do_sample``
            is True, since it has no effect on greedy decoding.

    Returns:
        The first generated sequence decoded with special tokens removed.
        The decoded text includes the prompt tokens as well.
    """
    # Tokenize and move the tensors to wherever the model lives; feeding CPU
    # tensors to a GPU-resident model fails with a device mismatch.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    generation_kwargs = {
        "max_length": max_len,
        "num_return_sequences": 1,
        "do_sample": do_sample,
    }
    if do_sample:
        generation_kwargs["temperature"] = temperature

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            # Pass the mask explicitly so generate() does not have to infer it.
            attention_mask=inputs.attention_mask,
            **generation_kwargs,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


    
    
    


In [None]:
import pandas as pd
import re
import gc


def get_gpu_free_memory(device=0):
    """Return the bytes of GPU memory not held by PyTorch's caching allocator.

    Computed as the device's total memory minus the memory currently reserved
    by PyTorch. Allocated memory is a subset of reserved memory, so it must not
    be subtracted a second time (doing so under-reports the free amount).

    Args:
        device: CUDA device index to inspect; defaults to 0.

    Returns:
        Free memory in bytes, or 0 when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return 0
    total_memory = torch.cuda.get_device_properties(device).total_memory
    reserved_memory = torch.cuda.memory_reserved(device)
    return total_memory - reserved_memory

# Few-shot selection settings
MAX_CANDIDATE_ROWS = 500     # only the first rows of the training set are scanned
MIN_EXAMPLE_CHARS = 300      # lower bound on example length, in characters
MAX_EXAMPLE_CHARS = 500      # upper bound on example length, in characters
NUM_FEW_SHOT_EXAMPLES = 3    # number of solved examples placed in the prompt

# Test set that will be evaluated by the inference loop
test_df = pd.read_csv('/kaggle/input/mistral-dataset/test.csv')
num_of_rows = test_df.shape[0]
print("Total testing datapoints: ",num_of_rows)

# Accumulators filled by the inference loop
predicted_json = []
testing_count = 0

# Training set used as the source of few-shot examples
df = pd.read_csv('/kaggle/input/few-shot-learning/train_modified.csv')
prompt = "Task: You are a cyber security specialist. Provide the list of MITRE techniques indicated in the context. Do not need to specify any reasoning, just provide a comma separated list. Choose the techniques among these 15 techniques: T1059 Command and Scripting interpreter, T1003 OS Credential Dumping, T1486 Data Encrypted for Impact, T1055 Process Injection, T1082 System Information Discovery, T1021 Remote Services, T1047 Windows Management Instrumentation, T1053 Scheduled Task/Job, T1497 Virtualization/Sandbox Evasion, T1018 Remote System Discovery, T1566 Phishing, T1027 Obfuscated Files or information, T1105 Ingress tool transfer, T1562 Impair defenses, T0814 Denial of Service.\n\n"

# Append the first few examples whose context length falls inside the window.
count = 0
# Bound the scan by the frame length so a short training file cannot raise IndexError.
for i in range(min(MAX_CANDIDATE_ROWS, len(df))):
    data = df.iloc[i]
    example_context = data['input']
    # Non-string cells (e.g. NaN from an empty CSV field) have no length; skip them.
    if isinstance(example_context, str) and MIN_EXAMPLE_CHARS <= len(example_context) <= MAX_EXAMPLE_CHARS:
        prompt +=  f"Context: {example_context}\nQuestion: What are the MITRE techniques indicated in the threat report?\nAnswer: {data['output']}\n\n"
        count += 1
    if count == NUM_FEW_SHOT_EXAMPLES:
        break

if count < NUM_FEW_SHOT_EXAMPLES:
    print(f"Warning: only {count} few-shot example(s) matched the length window.")

In [None]:

# generated_text = process_chunk(few_shot_prompt, model, tokenizer)
# print(f"Sample {index + 1}:\n{generated_text}\n")

In [None]:
# Run few-shot inference on every test row and checkpoint the predictions to CSV.
OUTPUT_CSV = '/kaggle/working/mistral_few_shot.csv'
MAX_PROMPT_WORDS = 2000   # rows whose question text has more words than this are skipped
MAX_NEW_TOKENS = 256      # token budget for the generated answer
ANSWER_PATTERN = re.compile(r'Answer:\s*(.*)')

# Start from a clean slate so re-running this cell does not append duplicate rows.
predicted_json = []
testing_count = 0

for i in range(0, num_of_rows):
    row_df = test_df.iloc[i]
    article = row_df['input']
    output = row_df['output']

    # Flatten the article to a single line. str() comes first so a non-string
    # cell (e.g. NaN) cannot raise, and '\r\n' is handled before '\n' so no
    # stray '\r' is left behind. Line breaks become spaces to keep words apart.
    context = str(article).replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    text = f"Context: {context}\nQuestion: What are the MITRE techniques indicated in the threat report?\nAnswer: "
    word_count = len(text.split(" "))

    print()
    print("GPU free memory available: ",get_gpu_free_memory()/(1024*1024*1024)," GB")
    print("Processing ",i+1,"th datapoint.Text size(in words): ",word_count)

    if word_count > MAX_PROMPT_WORDS:
        print("Ignored ",i+1,"th datapoint.Text size(in words): ",word_count)
        continue

    few_shot_prompt = prompt + text
    # process_chunk forwards max_len as generate()'s max_length, which counts
    # tokens (prompt + generated). Size it from the tokenized prompt rather
    # than from the character count of the prompt string.
    max_len = len(tokenizer(few_shot_prompt).input_ids) + MAX_NEW_TOKENS

    generated_text = process_chunk(few_shot_prompt, model, tokenizer, max_len)
    response = generated_text
    print(response)

    print()
    print()
    print()

    # The response echoes the prompt, so the answer to extract is the one that
    # follows the prompt's final "Answer:". Derive its index from the prompt
    # instead of assuming exactly three few-shot examples precede it.
    answer_index = len(ANSWER_PATTERN.findall(few_shot_prompt)) - 1
    matches = ANSWER_PATTERN.findall(response)
    if len(matches) <= answer_index:
        print("There are less than 4 answers in the context.")
        continue
    fourth_answer = matches[answer_index]
    print(f"4th Answer: {fourth_answer}")

    # Normalise the comma separated list by trimming whitespace around each item.
    predicted_techniques = ','.join(item.strip() for item in fourth_answer.split(","))

    predicted_json.append({
        "Article" : article,
        "Actual_output" : output,
        "Predicted_output" : predicted_techniques
    })
    print("Testing for ",i+1,"th datapoints done.")

    testing_count = testing_count + 1

    # Rewrite the CSV after every prediction so partial results survive an
    # interrupted session.
    predicted_df = pd.DataFrame(predicted_json)
    predicted_df.to_csv(OUTPUT_CSV, index=False)
    print("Storing ",i+1,"datapoints to csv.Text size(in words): ",word_count)
    gc.collect()


print("Tested datapoint count: ",testing_count)
# Report the file that is actually written, and only if something was written.
if testing_count > 0:
    print(f"{OUTPUT_CSV} is created successfully")
