In [None]:
import openai
import pandas as pd
import numpy as np
import pickle
import os
from transformers import GPT2TokenizerFast
from typing import List
from langchain.document_loaders import CSVLoader
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models.azure_openai import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate

# Credentials: 'XXXX' is a placeholder and must be swapped for a real key.
# The same value is handed to the openai module and exported via the environment.
openai_api_key = 'XXXX'  # Replace with your actual API key
openai.api_key = openai_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

# Azure OpenAI deployment settings (placeholders where marked 'XXXX')
deployment_name = 'XXXX'
openai_api_version = '2023-03-15-preview'
EMBEDDING_MODEL = "text-embedding-ada-002"

# Model family used for classification; classify_text embeds it in the
# chat deployment name.
model_name = "GPT-3.5"  # Or 'GPT-4'

# Category code (as a string) -> human-readable label shown to the model
category_definitions = {
    str(code): label
    for code, label in enumerate(
        ("Unrelated", "Insecticides", "Herbicides", "Fungicides")
    )
}

def text_word_splitter(text, num_words=2000):
    """Truncate ``text`` to its first ``num_words`` space-delimited tokens.

    Tokens are produced by splitting on the single space character only, so
    consecutive spaces yield empty tokens and other whitespace is not a
    separator. The kept tokens are re-joined with single spaces.
    """
    return " ".join(text.split(" ")[:num_words])

def classify_text(text, rag_model):
    """
    Classifies a given text using the Azure chat deployment.

    The prompt asks the model to assign one or more of the categories in the
    module-level ``category_definitions`` and to justify each decision.

    Args:
        text (str): The text to classify. Only its first 2000 space-separated
            words are sent to the model.
        rag_model (RetrievalQA): Accepted for interface compatibility; this
            function does not query it.

    Returns:
        dict: A dictionary with the keys "categories" and "justification".
        If the model reply cannot be parsed, the values are "ERROR" and
        "Error parsing response" respectively.
    """
    import json  # local import: only needed for the plain-JSON fallback below

    # NOTE: this must NOT be an f-string. Interpolating the dict here would put
    # its literal braces ({'0': 'Unrelated', ...}) into the template, where
    # ChatPromptTemplate reads them as a template variable and formatting
    # fails with a KeyError. The definitions are supplied as a regular
    # template variable instead, so their braces are never parsed.
    initial_prompt = """Act as a chemistry expert and assign one or several of the following categories (0, 1, 2, 3) to the patent text below. The categories are defined based on the following criteria:

    ### Category Definitions
    {category_definitions}

    If the patent text is unrelated to any previous categories, assign category 0.
    """

    template_string = initial_prompt + """

    ### Text:
    {text}
    ###

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "..",
     "justification": ".." }}

    For the value 'categories' put all applicable categories separated with a semicolon.
    For the value of "justification" put one short sentence regarding ALL defined categories above why it is assigned or not assigned. The structure should be category 0: one sentence for justification why it was assigned or not assigned; category 1: one sentence for justification why it was assigned or not assigned; ...
    """

    prompt = ChatPromptTemplate.from_template(template=template_string)
    chat = AzureChatOpenAI(deployment_name=f"XX-{model_name}-16k")

    messages = prompt.format_messages(
        category_definitions=str(category_definitions),
        text=text_word_splitter(text, num_words=2000),
    )
    response = chat(messages)

    category_schema = ResponseSchema(name="categories", description="Assigned categories")
    justification_schema = ResponseSchema(name="justification", description="Justification for categories")
    response_schemas = [category_schema, justification_schema]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    try:
        return output_parser.parse(response.content)
    except Exception as parse_error:
        # The prompt requests a bare JSON object, whereas some LangChain
        # releases appear to accept only a ```json fenced block here
        # (TODO confirm for the installed version). Try plain JSON before
        # reporting a failure.
        try:
            candidate = json.loads(response.content)
        except (TypeError, ValueError):
            candidate = None
        if isinstance(candidate, dict) and all(
            schema.name in candidate for schema in response_schemas
        ):
            return candidate
        print(f"Error parsing LLM response: {parse_error}")
        return {"categories": "ERROR", "justification": "Error parsing response"}

# Load reference dataset and create embeddings
reference_data_path = r"path_to_reference_data.csv"  # Path to reference data file
loader = CSVLoader(file_path=reference_data_path)
reference_docs = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=1)
reference_texts = text_splitter.split_documents(reference_docs)

embeddings = OpenAIEmbeddings(
    openai_api_base='XXXX',  # Replace with your actual OpenAI API base
    openai_api_type='azure',
    deployment='XXXX',  # Replace with your actual deployment name
    openai_api_key=openai_api_key,
    chunk_size=1,
)
# Index the reference chunks in a Chroma vector store
doc_search = Chroma.from_documents(reference_texts, embeddings)

# RetrievalQA cannot be instantiated with `llm=` directly: the chain needs a
# combine-documents chain, which the from_chain_type factory builds from the LLM.
rag_model = RetrievalQA.from_chain_type(
    llm=AzureOpenAI(deployment_name=deployment_name),
    chain_type="stuff",
    retriever=doc_search.as_retriever(),
)

# Path to the data file for classification and output path
data_path = r"path_to_data_for_classification.csv"  # Adjust this path as needed
output_path = r"path_to_save_output_csv_file.csv"  # Adjust this path as needed

def classify_data(data_path, output_path, rag_model, text_column='text_column_name'):
    """
    Classifies text data from a CSV file and saves the results to another CSV file.

    Each row's text is passed to ``classify_text``; the returned dictionaries
    become extra columns (named after their keys) appended to the input data.
    Exceptions raised by ``classify_text`` propagate to the caller.

    Args:
        data_path (str): The path to the CSV file containing text data.
        output_path (str): The path to save the output CSV file with classifications.
        rag_model (RetrievalQA): The retrieval-augmented generation model,
            forwarded unchanged to ``classify_text``.
        text_column (str): Name of the column holding the text to classify.
            The default is a placeholder; pass the real column name.

    Raises:
        KeyError: If ``text_column`` is not a column of the input CSV.
    """
    data = pd.read_csv(data_path)
    if text_column not in data.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {data_path!r}; "
            f"available columns: {list(data.columns)}"
        )

    classifications = []
    for text in data[text_column]:
        if pd.isna(text):
            # Empty CSV cells are read as NaN (a float), which would crash the
            # word splitter; record a placeholder instead of calling the model.
            classifications.append(
                {"categories": "ERROR", "justification": "Missing text"}
            )
            continue
        classifications.append(classify_text(str(text), rag_model))

    # Reuse the input index so the side-by-side concat aligns row for row
    classified_data = pd.DataFrame(classifications, index=data.index)
    classified_data = pd.concat([data, classified_data], axis=1)
    classified_data.to_csv(output_path, index=False)

# Classify every row of the input CSV and write the annotated copy to output_path
classify_data(data_path=data_path, output_path=output_path, rag_model=rag_model)


In [None]:
import openai
import pandas as pd
import numpy as np
import pickle
import os
from transformers import GPT2TokenizerFast
from typing import List
from langchain.document_loaders import CSVLoader
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models.azure_openai import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate

# API key ('XXXX' is a placeholder): shared with the openai module and the
# OPENAI_API_KEY environment variable.
openai_api_key = 'XXXX'  # Replace with your actual API key
openai.api_key = openai_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

# Azure OpenAI deployment details
EMBEDDING_MODEL = "text-embedding-ada-002"
deployment_name = 'XXXX'
openai_api_version = '2023-03-15-preview'

# Classification model; part of the chat deployment name used by classify_text
model_name = "GPT-3.5"  # Or 'GPT-4'

# Classification categories: string code paired with its label
category_definitions = dict(
    zip(
        ("0", "1", "2", "3"),
        ("Unrelated", "Insecticides", "Herbicides", "Fungicides"),
    )
)

def text_word_splitter(text, num_words=2000):
    """Keep at most the first ``num_words`` tokens of ``text``.

    The text is tokenised on single spaces and the retained tokens are joined
    back together with single spaces.
    """
    keep = slice(num_words)
    tokens = text.split(" ")
    return " ".join(tokens[keep])

def classify_text(text, rag_model):
    """
    Classifies a given text using the Azure chat deployment.

    The model is asked to pick one or more categories from the module-level
    ``category_definitions`` and to give a one-sentence justification per
    category.

    Args:
        text (str): The text to classify. Only its first 2000 space-separated
            words are sent to the model.
        rag_model (RetrievalQA): Accepted for interface compatibility; this
            function does not query it.

    Returns:
        dict: A dictionary with the keys "categories" and "justification".
        If the model reply cannot be parsed, the values are "ERROR" and
        "Error parsing response" respectively.
    """
    import json  # local import: only needed for the plain-JSON fallback below

    # Deliberately a plain string rather than an f-string: the dict repr
    # contains literal braces, and once embedded in the template they are
    # parsed as a template variable, making format_messages raise KeyError.
    # The definitions are injected as a normal template variable instead.
    initial_prompt = """Act as a chemistry expert and assign one or several of the following categories (0, 1, 2, 3) to the patent text below. The categories are defined based on the following criteria:

    ### Category Definitions
    {category_definitions}

    If the patent text is unrelated to any previous categories, assign category 0.
    """

    template_string = initial_prompt + """

    ### Text:
    {text}
    ###

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "..",
     "justification": ".." }}

    For the value 'categories' put all applicable categories separated with a semicolon.
    For the value of "justification" put one short sentence regarding ALL defined categories above why it is assigned or not assigned. The structure should be category 0: one sentence for justification why it was assigned or not assigned; category 1: one sentence for justification why it was assigned or not assigned; ...
    """

    prompt = ChatPromptTemplate.from_template(template=template_string)
    chat = AzureChatOpenAI(deployment_name=f"XX-{model_name}-16k")

    messages = prompt.format_messages(
        category_definitions=str(category_definitions),
        text=text_word_splitter(text, num_words=2000),
    )
    response = chat(messages)

    category_schema = ResponseSchema(name="categories", description="Assigned categories")
    justification_schema = ResponseSchema(name="justification", description="Justification for categories")
    response_schemas = [category_schema, justification_schema]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    try:
        return output_parser.parse(response.content)
    except Exception as parse_error:
        # The prompt asks for a bare JSON object, but some LangChain releases
        # appear to accept only a ```json fenced block in this parser
        # (TODO confirm for the installed version). Fall back to plain JSON
        # before declaring the reply unparseable.
        try:
            candidate = json.loads(response.content)
        except (TypeError, ValueError):
            candidate = None
        if isinstance(candidate, dict) and all(
            schema.name in candidate for schema in response_schemas
        ):
            return candidate
        print(f"Error parsing LLM response: {parse_error}")
        return {"categories": "ERROR", "justification": "Error parsing response"}

# Load reference dataset and create embeddings
reference_data_path = r"path_to_reference_data.csv"  # Path to reference data file
loader = CSVLoader(file_path=reference_data_path)
reference_docs = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=1)
reference_texts = text_splitter.split_documents(reference_docs)

embeddings = OpenAIEmbeddings(
    openai_api_base='XXXX',  # Replace with your actual OpenAI API base
    openai_api_type='azure',
    deployment='XXXX',  # Replace with your actual deployment name
    openai_api_key=openai_api_key,
    chunk_size=1,
)
# Build the Chroma vector store over the split reference documents
doc_search = Chroma.from_documents(reference_texts, embeddings)

# Use the from_chain_type factory: the RetrievalQA constructor takes a
# combine-documents chain rather than an `llm` argument, so passing `llm=`
# to it directly fails validation.
rag_model = RetrievalQA.from_chain_type(
    llm=AzureOpenAI(deployment_name=deployment_name),
    chain_type="stuff",
    retriever=doc_search.as_retriever(),
)

# Path to the data file for classification and output path
data_path = r"path_to_data_for_classification.csv"  # Adjust this path as needed
output_path = r"path_to_save_output_csv_file.csv"  # Adjust this path as needed

def classify_data(data_path, output_path, rag_model, text_column='text_column_name'):
    """
    Classifies text data from a CSV file and saves the results to another CSV file.

    Two columns are added to the input data before it is written out:
    'Category Predictions' and 'Justification of Category Predictions'.
    Exceptions raised by ``classify_text`` propagate to the caller.

    Args:
        data_path (str): The path to the CSV file containing text data.
        output_path (str): The path to save the output CSV file with classifications.
        rag_model (RetrievalQA): The retrieval-augmented generation model,
            forwarded unchanged to ``classify_text``.
        text_column (str): Name of the column holding the text to classify.
            The default is a placeholder; pass the real column name.

    Raises:
        KeyError: If ``text_column`` is not a column of the input CSV.
    """
    data = pd.read_csv(data_path)
    if text_column not in data.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {data_path!r}; "
            f"available columns: {list(data.columns)}"
        )

    category_list = []
    justification_list = []

    for text in data[text_column]:
        if pd.isna(text):
            # Empty CSV cells are read as NaN (a float), which would crash the
            # word splitter; record a placeholder instead of calling the model.
            classification_result = {
                "categories": "ERROR",
                "justification": "Missing text",
            }
        else:
            classification_result = classify_text(str(text), rag_model)
        category_list.append(classification_result['categories'])
        justification_list.append(classification_result['justification'])

    data['Category Predictions'] = category_list
    data['Justification of Category Predictions'] = justification_list
    data.to_csv(output_path, index=False)

# Execute the batch job: read data_path, classify each row, write output_path
classify_data(data_path=data_path, output_path=output_path, rag_model=rag_model)


In [None]:
# Final version of the classification pipeline
import openai
import pandas as pd
import numpy as np
import pickle
import os
from transformers import GPT2TokenizerFast
from typing import List
from langchain.document_loaders import CSVLoader
from langchain.chains import RetrievalQA
from langchain.llms import AzureOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models.azure_openai import AzureChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate

# OpenAI API key ('XXXX' is a placeholder). It is assigned to the openai
# module and exported as OPENAI_API_KEY.
openai_api_key = 'XXXX'  # Replace with your actual API key
openai.api_key = openai_api_key
os.environ['OPENAI_API_KEY'] = openai_api_key

# Azure OpenAI deployment details
openai_api_version = '2023-03-15-preview'
EMBEDDING_MODEL = "text-embedding-ada-002"
deployment_name = 'XXXX'

# Classification model; interpolated into the chat deployment name in classify_text
model_name = "GPT-3.5"  # Or 'GPT-4'

# Classification categories as (code, label) pairs
category_definitions = dict(
    [
        ("0", "Unrelated"),
        ("1", "Insecticides"),
        ("2", "Herbicides"),
        ("3", "Fungicides"),
    ]
)

def text_word_splitter(text, num_words=2000):
    """Cap ``text`` at ``num_words`` words, where words are separated by one space."""
    separator = " "
    head = text.split(separator)[:num_words]
    return separator.join(head)

def classify_text(text, rag_model):
    """
    Classifies a given text using the Azure chat deployment.

    The model receives the module-level ``category_definitions`` together with
    the (truncated) text and is asked for the applicable categories plus a
    short justification for every category.

    Args:
        text (str): The text to classify. Only its first 2000 space-separated
            words are sent to the model.
        rag_model (RetrievalQA): Accepted for interface compatibility; this
            function does not query it.

    Returns:
        dict: A dictionary with the keys "categories" and "justification".
        If the model reply cannot be parsed, the values are "ERROR" and
        "Error parsing response" respectively.
    """
    import json  # local import: only needed for the plain-JSON fallback below

    # Kept as a plain (non-f) string on purpose. Interpolating the dict with an
    # f-string leaves its literal braces in the template, which the prompt
    # template then treats as a variable, so formatting raises KeyError.
    # Passing the definitions as a template variable avoids re-parsing them.
    initial_prompt = """Act as a chemistry expert and assign one or several of the following categories (0, 1, 2, 3) to the patent text below. The categories are defined based on the following criteria:

    ### Category Definitions
    {category_definitions}

    If the patent text is unrelated to any previous categories, assign category 0.
    """

    template_string = initial_prompt + """

    ### Text:
    {text}
    ###

    ### Output Format Instruction:
    Output the results in the following JSON structure:
    {{"categories": "..",
     "justification": ".." }}

    For the value 'categories' put all applicable categories separated with a semicolon.
    For the value of "justification" put one short sentence regarding ALL defined categories above why it is assigned or not assigned. The structure should be category 0: one sentence for justification why it was assigned or not assigned; category 1: one sentence for justification why it was assigned or not assigned; ...
    """

    prompt = ChatPromptTemplate.from_template(template=template_string)
    chat = AzureChatOpenAI(deployment_name=f"XX-{model_name}-16k")

    # A single truncated string, not a list of chunks
    truncated_text = text_word_splitter(text, num_words=2000)
    messages = prompt.format_messages(
        category_definitions=str(category_definitions),
        text=truncated_text,
    )
    response = chat(messages)

    category_schema = ResponseSchema(name="categories", description="Assigned categories")
    justification_schema = ResponseSchema(name="justification", description="Justification for categories")
    response_schemas = [category_schema, justification_schema]

    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    try:
        return output_parser.parse(response.content)
    except Exception as parse_error:
        # The prompt requests a bare JSON object, while some LangChain releases
        # appear to accept only a ```json fenced block in this parser
        # (TODO confirm for the installed version). Attempt plain JSON first
        # and only then report the failure.
        try:
            candidate = json.loads(response.content)
        except (TypeError, ValueError):
            candidate = None
        if isinstance(candidate, dict) and all(
            schema.name in candidate for schema in response_schemas
        ):
            return candidate
        print(f"Error parsing LLM response: {parse_error}")
        return {"categories": "ERROR", "justification": "Error parsing response"}

# Load reference dataset and create embeddings
reference_data_path = r"path_to_reference_data.csv"  # Path to reference data file
loader = CSVLoader(file_path=reference_data_path)
reference_docs = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=1)
reference_texts = text_splitter.split_documents(reference_docs)

embeddings = OpenAIEmbeddings(
    openai_api_base='XXXX',  # Replace with your actual OpenAI API base
    openai_api_type='azure',
    deployment='XXXX',  # Replace with your actual deployment name
    openai_api_key=openai_api_key,
    chunk_size=1,
)
# Embed the reference chunks and store them in Chroma
doc_search = Chroma.from_documents(reference_texts, embeddings)

# The chain is created through from_chain_type because RetrievalQA itself has
# no `llm` field; the factory wraps the LLM in the required
# combine-documents chain.
rag_model = RetrievalQA.from_chain_type(
    llm=AzureOpenAI(deployment_name=deployment_name),
    chain_type="stuff",
    retriever=doc_search.as_retriever(),
)

# Path to the data file for classification and output path
data_path = r"path_to_data_for_classification.csv"  # Adjust this path as needed
output_path = r"path_to_save_output_csv_file.csv"  # Adjust this path as needed

def classify_data(data_path, output_path, rag_model, text_column='text'):
    """
    Classifies text data from a CSV file and saves the results to another CSV file.

    Two columns are added to the input data before it is written out:
    'Category Predictions' and 'Justification of Category Predictions'.
    Exceptions raised by ``classify_text`` propagate to the caller.

    Args:
        data_path (str): The path to the CSV file containing text data.
        output_path (str): The path to save the output CSV file with classifications.
        rag_model (RetrievalQA): The retrieval-augmented generation model,
            forwarded unchanged to ``classify_text``.
        text_column (str): Name of the column holding the text to classify.
            Defaults to 'text'.

    Raises:
        KeyError: If ``text_column`` is not a column of the input CSV.
    """
    data = pd.read_csv(data_path)
    if text_column not in data.columns:
        raise KeyError(
            f"Column {text_column!r} not found in {data_path!r}; "
            f"available columns: {list(data.columns)}"
        )

    category_list = []
    justification_list = []

    for text in data[text_column]:
        if pd.isna(text):
            # Empty CSV cells are read as NaN (a float), which would crash the
            # word splitter; record a placeholder instead of calling the model.
            classification_result = {
                "categories": "ERROR",
                "justification": "Missing text",
            }
        else:
            classification_result = classify_text(str(text), rag_model)
        category_list.append(classification_result['categories'])
        justification_list.append(classification_result['justification'])

    data['Category Predictions'] = category_list
    data['Justification of Category Predictions'] = justification_list
    data.to_csv(output_path, index=False)

# Start the classification run for the CSV at data_path; results go to output_path
classify_data(data_path=data_path, output_path=output_path, rag_model=rag_model)
