In [None]:
import shutil, os, subprocess
from google.colab import drive
# Make Google Drive visible to this Colab runtime, then switch the working
# directory to the project folder on the mounted drive.
_drive_mount_point = '/content/drive'
drive.mount(_drive_mount_point)
os.chdir('/content/drive/MyDrive/LLM/')

In [None]:
%%capture
# Upgrade PyTorch to 2.3.0 with CUDA 12.1
!pip install torch==2.3.0+cu121 torchvision==0.15.2+cu121 torchaudio==2.3.0+cu121 -f https://download.pytorch.org/whl/torch_stable.html

# Install Xformers compatible with PyTorch 2.3.0 and CUDA 12.1
!pip install xformers -f https://download.pytorch.org/whl/cu121

# Install Unsloth with the colab-new setup
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Upgrade Triton
!pip install -U triton
!pip install bitsandbytes
!pip install trl
!pip install peft


In [None]:
from unsloth import FastLanguageModel
import torch
# Loading options, kept as module-level names so they can be tuned in one place.
max_seq_length = 2048  # Maximum sequence length handed to the loader
dtype = None           # None lets the loader auto-detect (float16 on T4/V100, bfloat16 on Ampere+)
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be set to False

# Gather the loader arguments first, then fetch the model and tokenizer in one call.
# (An alternative checkpoint would be "unsloth/Llama-3.2-1B-Instruct".)
_pretrained_options = {
    "model_name": "Ariadaikalam/Entity_extraction",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_pretrained_options)

In [None]:
# Wrap the loaded model with PEFT/LoRA settings (this configures adapters on the
# already-loaded model; it does not load a checkpoint).
# NOTE: `model` is rebound to the wrapped model, so this cell is meant to run
# once per freshly loaded model.
_lora_options = dict(
    r=16,                # Any value > 0; suggested: 8, 16, 32, 64, 128
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,      # Any value is supported, but 0 is the optimized path
    bias="none",         # Any value is supported, but "none" is the optimized path
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,    # Rank-stabilized LoRA switch
    loftq_config=None,   # LoftQ configuration (unused here)
)
model = FastLanguageModel.get_peft_model(model, **_lora_options)

In [None]:
# Prompt template sent to the model for every row.
# It has three positional `{}` slots, filled in this order by `str.format`:
#   1. the instruction,
#   2. the product description,
#   3. the response (the inference cell passes "" so the model writes it).
# NOTE(review): the wording below (including its typos) is runtime text that the
# model reads verbatim; presumably it matches the prompt used during fine-tuning,
# so do not "clean it up" without confirming that.
alpaca_prompt = """You are a entity value extraction assistant.. Your task is to extract specific entities from the product description using the provided regex pattern.you have to extract the entity value from the product description as per the entity name given. go through the description completely and find the appropriate entity value using the given regex pattern.
for example if the product description is "NAVITAS BARLEY GRASS POWDER HOW TOUSE 3. TAKEIBAGOF Pour about 30 ml of Thaw, add ice if you PRODUCT,POUR IT boiling water want and enjoy INTOA CUP" and you have to find the item volume, look into the regex pattern and extract the entity value accordingly. dont just assume the first number you encounter to be the entity value

### Instruction:
{}

### Product Description:
{}

### Response:
{}"""

# End-of-sequence token text taken from the loaded tokenizer.
# It is not referenced by the other cells visible in this notebook.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

In [None]:
import pandas as pd
import ast  # To safely evaluate the string into a dictionary


# Load the evaluation prompts. Each value in the 'prompt' column is expected to be
# the string form of a dict with 'instruction' and 'input' keys.
file_path = '/content/drive/MyDrive/test_instructions.csv'
df = pd.read_csv(file_path)

# Text after which the model may append an explanation that is not part of the answer.
EXPLANATION_MARKER = "\n\n### Explanation:\n"


def _parse_prompt(raw_prompt):
    """Turn one 'prompt' cell into an (instruction, input) pair.

    Args:
        raw_prompt: Cell value from the 'prompt' column; normally a string
            holding a Python dict literal.

    Returns:
        A tuple `(instruction, input)` (missing keys default to ''), or None
        when the cell is not a valid dict literal.
    """
    try:
        record = ast.literal_eval(raw_prompt)
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: not a literal (this includes NaN cells);
        # TypeError: e.g. a dict literal with an unhashable key.
        return None
    if not isinstance(record, dict):
        # A valid literal that is not a dict (list, number, ...) has no .get().
        return None
    return record.get('instruction', ''), record.get('input', '')


def _generate_entity(instruction, description):
    """Run the model on one example and return only the extracted entity text.

    Args:
        instruction: Text placed in the template's instruction slot.
        description: Text placed in the template's product-description slot.

    Returns:
        The generated answer with special tokens removed, any trailing
        explanation section cut off, and surrounding whitespace stripped.
    """
    # The response slot is left empty so the model fills it in.
    formatted_prompt = alpaca_prompt.format(instruction, description, "")
    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # generate() returns prompt + continuation; keep only the newly generated
    # tokens instead of splitting the decoded text on the "### Response:" header.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.batch_decode(
        outputs[:, prompt_length:],
        skip_special_tokens=True,  # keep EOS / special-token text out of the answer
    )[0]
    return completion.split(EXPLANATION_MARKER)[0].strip()


# Switch Unsloth to its inference mode before calling generate().
FastLanguageModel.for_inference(model)

# One output per input row, in row order; malformed rows get "" so the list
# stays aligned with the DataFrame.
identified_entities = []
for i, raw_prompt in enumerate(df['prompt']):
    parsed = _parse_prompt(raw_prompt)
    if parsed is None:
        print(f"Skipping row {i}: Invalid format")
        identified_entities.append("")
        continue
    identified_entities.append(_generate_entity(*parsed))

# Add the identified_entities list as a new column to the DataFrame
df['identified_entity'] = identified_entities

# Save the augmented DataFrame to a separate results file (the input CSV is left untouched).
output_path = '/content/drive/MyDrive/test_result.csv'
df.to_csv(output_path, index=False)

print("Output saved to the new CSV file successfully!")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Load the predictions written by the inference cell (same Drive location it saved to).
# Everything is read as text: `dtype=str` stops pandas from turning values such as
# "30" into numbers, and `keep_default_na=False` keeps empty predictions (skipped
# rows) as "" instead of NaN, which the sklearn metrics below cannot handle.
file_path = '/content/drive/MyDrive/test_result.csv'
df = pd.read_csv(file_path, dtype=str, keep_default_na=False)

# True values (actual entities) and predicted values (identified entities),
# compared as whitespace-trimmed strings so the score is an exact text match.
true_values = df['entity_value'].str.strip()
predicted_values = df['identified_entity'].str.strip()

# Generate the confusion matrix (one class per distinct entity string)
conf_matrix = confusion_matrix(true_values, predicted_values)

# Calculate the accuracy (fraction of rows whose prediction equals the true value)
accuracy = accuracy_score(true_values, predicted_values)

# Print the accuracy as a percentage
print(f"Accuracy: {100*accuracy}")