In [18]:
import pandas as pd
import json
import os
from langchain_groq import ChatGroq

# Initialize the Groq LLM
groq_api_key = ""
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")

# Path to your CSV file
csv_file_path = r"C:\Users\Pritish\Downloads\r&d.csv"

# Read the input CSV file
df = pd.read_csv(csv_file_path)

# Ensure that 'Abstracts' column exists
if 'Abstracts' not in df.columns:
    raise ValueError("The input CSV must contain an 'Abstracts' column.")

# Create lists to store categories and justifications
categories = []
justifications = []

# Loop through each abstract and classify it
for abstract in df['Abstracts']:
    # Check if abstract is empty or invalid, skip if so
    if pd.isna(abstract) or not abstract.strip():  # Skip empty abstracts
        categories.append("0")
        justifications.append("Empty abstract, skipped")
        continue

    # Define the initial prompt template
    prompt = f"""
    Act as a chemistry expert and assign one or several of the following categories to the patent text below. 
    The categories are defined based on the following criteria:

    ### Category Definitions
    "0": "Unrelated",
    "1": "Insecticides",
    "2": "Herbicides",
    "3": "Fungicides",

    If the patent text is unrelated to any previous categories, assign category 0.

    ### Text:
    {abstract}

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "...", "justification": "..." }}
    """

    # Use the Groq LLM to generate a response based on the prompt
    response = llm.predict(prompt)

    # Clean the response in case it contains formatting
    cleaned_response = response.strip('```json').strip('```').strip()

    try:
        # Parse the cleaned response as JSON
        result = json.loads(cleaned_response)

        # Ensure that the response has the necessary keys
        if 'categories' in result and 'justification' in result:
            categories.append(result['categories'])
            justifications.append(result['justification'])
        else:
            categories.append("error")
            justifications.append("error in response structure")
    except json.JSONDecodeError:
        print(f"Error parsing response: {cleaned_response}")
        with open("error_responses.txt", "a") as file:
            file.write(f"Error parsing response: {cleaned_response}\n")
        categories.append("error")
        justifications.append("error in response parsing")

# Step 1: Extract data from the .txt file
def extract_data_from_txt(txt_file):
    extracted_data = []
    with open(txt_file, 'r') as file:
        for line in file:
            if "Error parsing response:" in line:
                json_str = line.split("Error parsing response: ")[1]
                try:
                    response_data = json.loads(json_str)
                    categories = response_data.get('categories', [])
                    justification = response_data.get('justification', '')
                    extracted_data.append((categories, justification))
                except json.JSONDecodeError:
                    print(f"Failed to parse line: {line}")
    return extracted_data

# Step 2: Update the DataFrame with extracted data from the .txt file
txt_file_path = 'error_responses.txt'
extracted_data = extract_data_from_txt(txt_file_path)

# Assuming that the extracted data needs to replace the rows with errors
error_indices = [i for i, cat in enumerate(categories) if cat == "error"]

if len(error_indices) == len(extracted_data):
    for idx, (category, justification) in zip(error_indices, extracted_data):
        categories[idx] = category
        justifications[idx] = justification
else:
    print(f"Warning: Mismatch between extracted data and error indices.")

# Add categories and justifications to the DataFrame
df['Categories'] = categories
df['Justifications'] = justifications

# Save the updated DataFrame back to CSV
df.to_csv(csv_file_path, index=False)

# Step 3: Delete the text file after processing
def delete_txt_file(txt_file):
    if os.path.exists(txt_file):
        os.remove(txt_file)
        print(f"{txt_file} deleted.")
    else:
        print(f"{txt_file} does not exist.")

# Delete the txt file
delete_txt_file(txt_file_path)


Error parsing response: {"categories": ["1"], "justification": "The patent text describes a mosquito repellent device, clearly falling under the category of insecticides."}
```
Error parsing response: {"categories": ["3"], "justification": "The patent explicitly states that the compound (I) is a fungicide and describes its use as an antifungal agent for treating fungal infections."}
```
Error parsing response: {"categories": ["3"], "justification": "The patent explicitly states that the invention is a 'fungicidal composition' and details its activity against a plant pathogen in soybean plants. "}
```
error_responses.txt deleted.


In [23]:
import pandas as pd
import json
import os
from langchain_groq import ChatGroq

# Initialize the Groq LLM
groq_api_key = ""
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")

# Path to your original CSV file
csv_file_path = r"C:\Users\Pritish\Downloads\RRR.csv"

# Read the input CSV file
df = pd.read_csv(csv_file_path)

# Ensure that 'Abstracts' column exists
if 'Abstracts' not in df.columns:
    raise ValueError("The input CSV must contain an 'Abstracts' column.")

# Create a copy of the original DataFrame
results_df = df.copy()

# Create lists to store categories and justifications
categories = []
justifications = []

# Loop through each abstract and classify it
for abstract in df['Abstracts']:
    # Check if abstract is empty or invalid, skip if so
    if pd.isna(abstract) or not abstract.strip():  # Skip empty abstracts
        categories.append("0")
        justifications.append("Empty abstract, skipped")
        continue

    # Define the initial prompt template
    prompt = f"""
    Act as a chemistry expert and assign one or several of the following categories to the patent text below. 
    The categories are defined based on the following criteria:

    ### Category Definitions
    "0": "Unrelated",
    "1": "Insecticides",
    "2": "Herbicides",
    "3": "Fungicides",

    If the patent text is unrelated to any previous categories, assign category 0.

    ### Text:
    {abstract}

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "...", "justification": "..." }}
    """

    # Use the Groq LLM to generate a response based on the prompt
    response = llm.predict(prompt)

    # Clean the response in case it contains formatting
    cleaned_response = response.strip('```json').strip('```').strip()

    try:
        # Parse the cleaned response as JSON
        result = json.loads(cleaned_response)

        # Ensure that the response has the necessary keys
        if 'categories' in result and 'justification' in result:
            categories.append(result['categories'])
            justifications.append(result['justification'])
        else:
            categories.append("error")
            justifications.append("error in response structure")
    except json.JSONDecodeError:
        print(f"Error parsing response: {cleaned_response}")
        with open("error_responses.txt", "a") as file:
            file.write(f"Error parsing response: {cleaned_response}\n")
        categories.append("error")
        justifications.append("error in response parsing")

# Step 1: Extract data from the .txt file
def extract_data_from_txt(txt_file):
    extracted_data = []
    with open(txt_file, 'r') as file:
        for line in file:
            if "Error parsing response:" in line:
                json_str = line.split("Error parsing response: ")[1]
                try:
                    response_data = json.loads(json_str)
                    categories = response_data.get('categories', [])
                    justification = response_data.get('justification', '')
                    extracted_data.append((categories, justification))
                except json.JSONDecodeError:
                    print(f"Failed to parse line: {line}")
    return extracted_data

# Step 2: Update the DataFrame with extracted data from the .txt file
txt_file_path = 'error_responses.txt'
extracted_data = extract_data_from_txt(txt_file_path)

# Assuming that the extracted data needs to replace the rows with errors
error_indices = [i for i, cat in enumerate(categories) if cat == "error"]

if len(error_indices) == len(extracted_data):
    for idx, (category, justification) in zip(error_indices, extracted_data):
        categories[idx] = category
        justifications[idx] = justification
else:
    print(f"Warning: Mismatch between extracted data and error indices.")

# Add categories and justifications to the results DataFrame (copy)
results_df['Categories'] = categories
results_df['Justifications'] = justifications

# Path to save the results CSV file
results_csv_file_path = r"C:\Users\Pritish\Downloads\results.csv"

# Save the updated DataFrame to the new CSV file
results_df.to_csv(results_csv_file_path, index=False)

# Step 3: Delete the text file after processing
def delete_txt_file(txt_file):
    if os.path.exists(txt_file):
        os.remove(txt_file)
        print(f"{txt_file} deleted.")
    else:
        print(f"{txt_file} does not exist.")

# Delete the txt file
delete_txt_file(txt_file_path)

print(f"Results saved to {results_csv_file_path}")

Error parsing response: {"categories": ["1"], "justification": "The patent describes a mosquito repellent device using icaridin as a primary repellent. This clearly falls under the category of insecticides."}
```
Error parsing response: {"categories": ["3"], "justification": "The patent explicitly states that the compound (I) is a fungicide and describes its use as an antifungal agent for treating fungal infections."}
```
Error parsing response: {
 "categories": ["3"],
 "justification": "The patent text explicitly states that the invention is a 'Fungicidal composition' and details its activity as a fungicide against plant pathogens."
}
```
Error parsing response: {"categories": ["2", "3"], "justification": "The patent describes a system for producing hypochlorous acid water, which is known to have both herbicidal and fungicidal properties.  The text specifically mentions using this water to: \n\n*  'reduce stress during plant growth by removing harmful bacteria that inhibit plant growt

In [3]:
import pandas as pd
import json
import os
from langchain_groq import ChatGroq

# Initialize the Groq LLM
groq_api_key = ""
llm = ChatGroq(groq_api_key=groq_api_key, model_name="Gemma2-9b-It")

# Path to your original CSV file
csv_file_path = r"C:\Users\Pritish\Downloads\RRR.csv"

# Read the input CSV file
df = pd.read_csv(csv_file_path)

# Ensure that 'Abstracts' column exists
if 'Abstracts' not in df.columns:
    raise ValueError("The input CSV must contain an 'Abstracts' column.")

# Create a copy of the original DataFrame
results_df = df.copy()

# Create lists to store categories and justifications
categories = []
justifications = []

# Loop through each abstract and classify it
for abstract in df['Abstracts']:
    # Check if abstract is empty or invalid, skip if so
    if pd.isna(abstract) or not abstract.strip():  # Skip empty abstracts
        categories.append("0")
        justifications.append("Empty abstract, skipped")
        continue

    # Define the initial prompt template
    prompt = f"""
    Act as a chemistry expert and assign one or several of the following categories to the patent text below. 
    The categories are defined based on the following criteria:

    ### Category Definitions
    "0": "Unrelated",
    "1": "Insecticides",
    "2": "Herbicides",
    "3": "Fungicides",

    If the patent text is unrelated to any previous categories, assign category 0.

    ### Text:
    {abstract}

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "...", "justification": "..." }}
    """

    # Use the Groq LLM to generate a response based on the prompt
    response = llm.predict(prompt)

    # Clean the response in case it contains formatting
    cleaned_response = response.strip('```json').strip('```').strip()

    try:
        # Parse the cleaned response as JSON
        result = json.loads(cleaned_response)

        # Ensure that the response has the necessary keys
        if 'categories' in result and 'justification' in result:
            categories.append(result['categories'])
            justifications.append(result['justification'])
        else:
            categories.append("error")
            justifications.append("error in response structure")
    except json.JSONDecodeError:
        print(f"Error parsing response: {cleaned_response}")
        with open("error_responses.txt", "a") as file:
            file.write(f"Error parsing response: {cleaned_response}\n")
        categories.append("error")
        justifications.append("error in response parsing")

# Step 1: Extract data from the .txt file and handle errors carefully
def extract_data_from_txt(txt_file):
    extracted_data = []
    with open(txt_file, 'r') as file:
        for line in file:
            # Check if the line contains a valid "Error parsing response:"
            if "Error parsing response:" in line:
                json_str = line.split("Error parsing response: ", 1)[1].strip()
                
                # Skip empty or incomplete JSON strings
                if not json_str or json_str == "{":
                    continue
                
                try:
                    response_data = json.loads(json_str)
                    category = response_data.get('categories', [])
                    justification = response_data.get('justification', '')
                    extracted_data.append((category, justification))
                except json.JSONDecodeError:
                    print(f"Failed to parse line: {line}")
    return extracted_data

# Step 2: Update the DataFrame with extracted data from the .txt file
txt_file_path = 'error_responses.txt'
extracted_data = extract_data_from_txt(txt_file_path)

# Check if error rows and extracted data match
error_indices = [i for i, cat in enumerate(categories) if cat == "error"]
if len(error_indices) != len(extracted_data):
    print(f"Warning: Mismatch between extracted data and error indices. Error rows: {len(error_indices)}, Extracted: {len(extracted_data)}")
else:
    for idx, (category, justification) in zip(error_indices, extracted_data):
        categories[idx] = category
        justifications[idx] = justification

# Add categories and justifications to the results DataFrame (copy)
results_df['Categories'] = categories
results_df['Justifications'] = justifications

# Path to save the results CSV file
results_csv_file_path = r"C:\Users\Pritish\Downloads\results.csv"

# Save the updated DataFrame to the new CSV file
results_df.to_csv(results_csv_file_path, index=False)

# Step 3: Delete the text file after processing
def delete_txt_file(txt_file):
    if os.path.exists(txt_file):
        os.remove(txt_file)
        print(f"{txt_file} deleted.")
    else:
        print(f"{txt_file} does not exist.")

# Delete the txt file
delete_txt_file(txt_file_path)

print(f"Results saved to {results_csv_file_path}")


Error parsing response: {"categories": ["3"], "justification": "The patent explicitly states that the compound (I) is a 'Fungicide' and describes its use as an antifungal agent for treating fungal infections."}
```
Error parsing response: {
"categories": ["3"],
"justification": "The patent explicitly states that the composition is a fungicide and its activity is tested against a plant pathogen. The use section also mentions protection against various fungal phyla."
}
```
Error parsing response: {"categories": ["2", "3"], "justification": "The patent describes a system that produces hypochlorous acid water, which has both herbicidal and fungicidal properties.  The text states the system promotes plant growth by removing harmful bacteria that inhibit plant growth, indicating a herbicidal effect. Additionally, the system's ability to sterilize the electrolysis tank points towards its fungicidal properties."}
```
Error parsing response: {"categories": ["1"], "justification": "The patent de