In [None]:
import os
import pandas as pd
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from some_llm_library import (
    llm,
)  # Replace with your actual LLM interface (e.g., OpenAI, HuggingFace, etc.)

# 1. Define the folder path and list all CSV files.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# make the processing order (and the printed report) reproducible across runs.
data_folder = "data/data_usda_crop"
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith(".csv"))

# 2. Build the LangChain prompt.
# The template has two placeholders that are filled in on every call:
#   {data_content}       - textual preview of the CSV currently being processed
#   {recall_definitions} - the definitions collected so far for that CSV
template = """
You are a data science expert specializing in defining and explaining different types of "recall" based on data analysis. Using the data content provided below, you need to summarize why this type is defined as recall and generate a corresponding list of definitions.

1. Start by summarizing the first recall definition based on the data content and add it to the list.
2. Then, based on subsequent data, continue iteratively adding other types of recall definitions, gradually expanding this list with each iteration.
3. After all iterations are complete, return the final list of recall definitions.

Data content:
{data_content}

Current recall definitions list:
{recall_definitions}

Now summarize the next recall definition:
"""

# Wrap the raw text in a PromptTemplate, declaring the placeholder names it expects.
recall_prompt = PromptTemplate(
    template=template,
    input_variables=["data_content", "recall_definitions"],
)


# 3. Helper that appends a definition only when the list does not already hold it
def add_if_unique(recall_list, new_definition):
    """Append ``new_definition`` to ``recall_list`` unless it is already present.

    The list is modified in place and the same list object is returned, so the
    caller may either use the return value or keep its own reference.
    """
    already_present = new_definition in recall_list
    if already_present:
        return recall_list

    recall_list.append(new_definition)
    return recall_list


# 4. Iterate over all CSV files, load them as DataFrame, and apply the LangChain model
all_recall_definitions = {}

# Upper bound on LLM calls per file. An LLM rarely repeats a previous answer
# verbatim, so relying only on "the model repeated itself" could loop (and
# bill) indefinitely.
MAX_DEFINITIONS_PER_FILE = 20

# The chain depends only on the model and the prompt, not on the file being
# processed, so it is built once and reused for every CSV.
chain = LLMChain(llm=llm, prompt=recall_prompt)

for csv_file in csv_files:
    # Load each CSV file as a DataFrame
    file_path = os.path.join(data_folder, csv_file)
    df = pd.read_csv(file_path)

    # Convert the DataFrame content to a descriptive string for data_content input
    data_content = df.head().to_string()  # Use the first few rows as a summary
    recall_definitions = []  # Definitions collected so far for this file

    for _ in range(MAX_DEFINITIONS_PER_FILE):
        # Ask for the next definition, showing the model what it produced so far
        response = chain.run(
            data_content=data_content, recall_definitions=recall_definitions
        )
        new_definition = response.strip()

        # An empty answer carries no definition; stop instead of storing "".
        if not new_definition:
            break

        # The termination test must compare the list before and after the add.
        # Checking membership *after* adding is always true and would end the
        # loop after a single call.
        count_before = len(recall_definitions)
        recall_definitions = add_if_unique(recall_definitions, new_definition)

        # Terminate the loop if no new unique definition was added
        if len(recall_definitions) == count_before:
            break

    # Add the generated recall definitions to the results dictionary
    all_recall_definitions[csv_file] = recall_definitions

# 5. Report the collected recall definitions, one entry per processed file
for file_name in all_recall_definitions:
    recall_list = all_recall_definitions[file_name]
    print(f"File: {file_name}")
    print(f"Recall Definitions: {recall_list}\n")