# In [None]:
import pandas as pd
import json
import os
import re
from langchain_groq import ChatGroq

# Initialize the Groq LLM.
# The API key is read from the GROQ_API_KEY environment variable so that the
# secret never has to be pasted into (and accidentally committed with) this
# file. Fail fast with a clear message instead of letting every request fail
# later with an authentication error.
groq_api_key = os.environ.get("GROQ_API_KEY", "")
if not groq_api_key:
    raise ValueError(
        "Groq API key not found: set the GROQ_API_KEY environment variable."
    )
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")

# Path to your original CSV file
csv_file_path = r"C:\Users\Pritish\Downloads\RRR.csv"

# Step 1: Read the input CSV file
df = pd.read_csv(csv_file_path)

# Ensure that 'Abstracts' column exists in the DataFrame
if 'Abstracts' not in df.columns:
    raise ValueError("The input CSV must contain an 'Abstracts' column.")

# Work on a copy so the DataFrame read from disk stays untouched
results_df = df.copy()

# Per-row accumulators filled by the classification loop:
#   categories     - category text for each row ("" for skipped rows)
#   justifications - the model's justification (or a status message)
#   error_rows     - positional indices of rows that could not be classified
categories = []
justifications = []
error_rows = []

# Step 2: Function to call Groq LLM and classify abstracts
def classify_abstract(abstract):
    """Ask the LLM to classify one patent abstract.

    The abstract is embedded in a prompt that lists the category codes
    (0 = Unrelated, 1 = Insecticides, 2 = Herbicides, 3 = Fungicides) and
    asks for a JSON object with a "categories" array and a "justification"
    string.

    Args:
        abstract: Text of the patent abstract to classify.

    Returns:
        The model's reply as a string with surrounding whitespace removed.
        The reply is NOT validated here; parsing is left to the caller.

    Raises:
        Whatever the underlying LLM client raises (network, auth, rate limit).
    """
    # The output-format line asks for a JSON *array* of categories: the task
    # allows several categories per text, and the downstream parser looks
    # for `"categories": [...]`.
    prompt = f"""
    Act as a chemistry expert and assign one or several of the following categories to the patent text below. 
    The categories are defined based on the following criteria:

    ### Category Definitions
    "0": "Unrelated",
    "1": "Insecticides",
    "2": "Herbicides",
    "3": "Fungicides",

    If the patent text is unrelated to any previous categories, assign category 0.

    ### Text:
    {abstract}

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": ["..."], "justification": "..." }}
    """

    # `invoke` replaces the deprecated `predict`. Chat models return a
    # message object whose text lives in `.content`; fall back to the raw
    # value in case the configured model returns a plain string.
    reply = llm.invoke(prompt)
    reply_text = getattr(reply, "content", reply)

    return str(reply_text).strip()

# Step 3: Classify abstracts and handle responses
MAX_RETRIES = 3  # attempts per abstract before the row is marked as an error


def _parse_classification(response):
    """Extract (categories_text, justification_text) from a raw LLM reply.

    The reply is first parsed as JSON (the outermost ``{...}`` span, so
    surrounding chatter or markdown fences are ignored). This handles
    escaped quotes inside the justification and arbitrary whitespace.
    If that fails, the legacy regular expressions are used as a fallback so
    that any reply that used to be accepted is still accepted.

    Categories are returned as the comma-separated JSON text of the items
    (e.g. ``"1", "2"``), matching the text previously captured from inside
    the brackets.

    Raises:
        ValueError: if neither strategy finds both fields.
    """
    json_span = re.search(r"\{.*\}", response, re.DOTALL)
    if json_span:
        try:
            # strict=False tolerates raw newlines inside string values.
            payload = json.loads(json_span.group(0), strict=False)
        except json.JSONDecodeError:
            payload = None
        if (
            isinstance(payload, dict)
            and "categories" in payload
            and "justification" in payload
        ):
            raw_categories = payload["categories"]
            # Accept a bare scalar as well as a list of categories.
            if not isinstance(raw_categories, list):
                raw_categories = [raw_categories]
            category_text = ", ".join(
                json.dumps(item, ensure_ascii=False) for item in raw_categories
            )
            return category_text, str(payload["justification"]).strip()

    # Fallback: tolerant version of the original regex-based extraction.
    categories_part = re.search(r'"categories":\s*\[(.*?)\]', response, re.DOTALL)
    justifications_part = re.search(
        r'"justification":\s*"(.*?)"', response, re.DOTALL
    )
    if categories_part and justifications_part:
        return (
            categories_part.group(1).strip(),
            justifications_part.group(1).strip(),
        )
    raise ValueError("Invalid response format")


for idx, abstract in enumerate(df['Abstracts']):
    # Skip empty abstracts; str() guards against non-string cells (e.g. numbers)
    if pd.isna(abstract) or not str(abstract).strip():
        categories.append("")
        justifications.append("Empty abstract, skipped")
        continue

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            # The API call is inside the try so that a transient network or
            # rate-limit error is retried instead of aborting the whole run.
            response = classify_abstract(abstract)
            category_text, justification_text = _parse_classification(response)
        except Exception as e:
            # Broad on purpose: one bad row must not lose the results
            # already collected for the other rows.
            print(f"Failed to parse row {idx}, attempt {attempt}: {e}")
        else:
            categories.append(category_text)
            justifications.append(justification_text)
            break  # Exit the retry loop once a reply has been parsed
    else:
        # All attempts failed: record the row and keep the lists aligned
        error_rows.append(idx)
        categories.append("error")
        justifications.append(
            f"error in response parsing after {MAX_RETRIES} retries"
        )

# Surface the rows that could not be classified instead of dropping the list
if error_rows:
    print(f"{len(error_rows)} row(s) could not be classified: {error_rows}")
# Step 4: Attach the collected outputs as two new columns and write the CSV
results_csv_file_path = r"C:\Users\Pritish\Downloads\results.csv"

# One value per input row is expected in each list; pandas raises if the
# lengths do not match the DataFrame.
results_df = results_df.assign(
    Categories=categories,
    Justifications=justifications,
)

# index=False keeps the positional index out of the output file
results_df.to_csv(results_csv_file_path, index=False)

print(f"Results saved to {results_csv_file_path}")
