In [2]:
import os
from getpass import getpass
import dotenv
from dotenv import load_dotenv

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Text-mining-for-taxonomy"
load_dotenv()

True

In [2]:
import logging
import operator
from typing import Annotated, List, Optional, TypedDict

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("tnt-llm")

class Doc(TypedDict):
    id: str
    content: str
    summary: Optional[str]
    explanation: Optional[str]
    category: Optional[str]

class TaxonomyGenerationState(TypedDict):
    # The raw docs; we inject summaries within them in the first step
    documents: List[Doc]
    # Indices to be concise
    minibatches: List[List[int]]
    # Candidate Taxonomes (full trajectory)
    clusters: Annotated[List[List[dict]], operator.add]

# Summarize docs

Chat logs can get quite long. Our taxonomy generation step needs to see large, diverse minibatches to be able to adequately capture the distribution of categories. To ensure they can all fit efficiently into the context window, we first summarize each chat log. Downstream steps will use these summaries instead of the raw doc content.

In [3]:
import re
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableConfig, RunnableLambda, RunnablePassthrough

summary_prompt = hub.pull("wfh/tnt-llm-summary-generation").partial(
    summary_prompt=20, explanation_length=30
)

def parse_summary(xml_string: str) -> dict:
    summary_pattern = r"<summary>(.*?)</summary>"
    explanation_pattern = r"<explanation>(.*?)</explanation>"

    summary_match = re.search(summary_pattern, xml_string, re.DOTALL)
    explanation_match = re.search(explanation_pattern, xml_string, re.DOTALL)

    summary = summary_match.group(1).strip() if summary_match else ""
    explanation = explanation = explanation_match.group(1).strip() if explanation_match else ""

    return {"summary": summary, "explanation": explanation}

summary_llm_chain = (
    summary_prompt
    | ChatOpenAI(model="gpt-4o-mini")
    | StrOutputParser()
    # Customise the tracking name for easier organization
).with_config(run_name="GenerateSummary")
summary_chain = summary_llm_chain | parse_summary

# Now combine as a "map" operation in a map-reduce chain 
# Input: state
# Output: state U summaries
# Processes docs in parallel
def get_content(state: TaxonomyGenerationState):
    docs = state['documents']
    return [{"content": doc['content']} for doc in docs]

map_step = RunnablePassthrough.assign(
    summaries=get_content
    # This effectively creates a "map" operation
    # Note you can make this more robust by handling individual errors
    | RunnableLambda(func=summary_chain.batch, afunc=summary_chain.abatch)
)

def reduce_summaries(combined: dict) -> TaxonomyGenerationState:
    summaries = combined['summaries']
    documents = combined['documents']
    return {
        "documents": [
            {
                "id": doc['id'],
                "content": doc['content'],
                "summary": summ_info['summary'],
                "explanation": summ_info['explanation'],
            }
            for doc, summ_info in zip(documents, summaries)
        ]
    }
map_reduce_chain = map_step | reduce_summaries

# Split into Minibatches

Each minibatch contains a random sample of docs. This lets the flow identify inadequacies in the current taxonomy using new data.

In [4]:
import random

def get_minibatches(state: TaxonomyGenerationState, config: RunnableConfig):
    batch_size = config['configurable'].get("batch_size", 200)
    original = state['documents']
    indices = list(range(len(original)))
    random.shuffle(indices)
    if len(indices) < batch_size:
        # Don't pad needlessly if we can't fill a single batch
        return [indices]
    
    num_full_batches = len(indices) // batch_size

    batches = [
        indices[i * batch_size: (i+1) * batch_size] for i in range(num_full_batches)
    ]
    leftovers = len(indices) % batch_size
    if leftovers:
        last_batch = indices[num_full_batches * batch_size :]
        elements_to_add = batch_size - leftovers
        last_batch += random.sample(indices, elements_to_add)
        batches.append(last_batch)

    return {
        "minibatches": batches,
    }

# Taxonomy Generation Utilities

In [5]:
from typing import Dict
from langchain_core.runnables import Runnable

def parse_taxa(output_text: str) -> Dict:
    """Extract the taxonomy from the generated output."""
    cluster_matches = re.findall(
        r"\s*<id>(.*?)</id>\s*<name>(.*?)</name>\s*<description>(.*?)</description>\s*",
        output_text,
        re.DOTALL,
    )
    clusters = [
        {'id': id.strip(), "name": name.strip(), "description": description.strip()}
        for id, name, description in cluster_matches
    ]
    # We don't parse the explanation since it isn't used downstream
    return {'clusters': clusters}

def format_docs(docs: List[Doc]) -> str:
    xml_table = "<conversation>\n"
    for doc in docs:
        xml_table += f'<conv_summ id={doc["id"]}>{doc["summary"]}</conv_summ>\n'
    xml_table +="</conversation>"
    return xml_table

def format_taxonomy(clusters):
    xml = "<cluster_table>\n"
    for label in clusters:
        xml += "  <cluster>\n"
        xml += f'    <id>{label["id"]}</id>\n'
        xml += f'    <name>{label["name"]}</name>\n'
        xml += f'    <description>{label["description"]}</description>\n'
        xml += "  </cluster>\n"
    xml += "</cluster_table>"
    return xml

def invoke_taxonomy_chain(
        chain: Runnable,
        state: TaxonomyGenerationState,
        config: RunnableConfig,
        mb_indices: List[int],
) -> TaxonomyGenerationState:
    configurable = config['configurable']
    docs = state['documents']
    minibatch = [docs[idx] for idx in mb_indices]
    data_table_xml = format_docs(minibatch)

    previous_taxonomy = state['clusters'][-1] if state['clusters'] else []
    cluster_table_xml = format_taxonomy(previous_taxonomy)

    updated_taxonomy = chain.invoke(
        {"data_xml": data_table_xml,
         "use_case": configurable['use_case'],
         "cluster_table_xml": cluster_table_xml,
         "suggestion_length": configurable.get("suggestion_length", 30),
         "cluster_name_length": configurable.get("cluster_name_length", 10),
         "cluster_description_length": configurable.get("cluster_description_length", 30),
         "explanation_length": configurable.get("explanation_length", 20),
         "max_num_clusters": configurable.get("max_num_clusters", 25)
         }
    )

    return {
        "clusters": [updated_taxonomy['clusters']],
    }



# Generate initial taxonomy

In [6]:
# We will share an LLM for each step of the generate -> update -> review cycle
taxonomy_generation_llm = ChatOpenAI(model="gpt-4o", max_tokens_to_sample=2000)

# Initial generation
taxonomy_generation_prompt = hub.pull("wfh/tnt-llm-taxonomy-generation").partial(
    use_case='Generate the taxonomy that can be used to label the user intent in the conversation.',
)

taxa_gen_llm_chain = (
    taxonomy_generation_prompt | taxonomy_generation_llm | StrOutputParser()
).with_config(run_name='GenerateTaxonomy')

generate_taxonomy_chain = taxa_gen_llm_chain | parse_taxa

def generate_taxonomy(
        state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    return invoke_taxonomy_chain(
        generate_taxonomy_chain, state, config=state['minibatches'][0]
    )

                max_tokens_to_sample was transferred to model_kwargs.
                Please confirm that max_tokens_to_sample is what you intended.


# Update Taxonomy

In [7]:
taxonomy_update_prompt = hub.pull("wfh/tnt-llm-taxonomy-update")

taxa_update_llm_chain = (
    taxonomy_update_prompt | taxonomy_generation_llm | StrOutputParser()
).with_config(run_name='UpdateTaxonomy')

update_taxonomy_chain = taxa_update_llm_chain | parse_taxa

def update_taxonomy(
        state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    which_mb = len(state['clusters']) % len(state['minibatches'])
    return invoke_taxonomy_chain(
        update_taxonomy_chain, state, config, state['minibatches'][which_mb]
)

# Review Taxonomy

In [8]:
taxonomy_review_prompt = hub.pull("wfh/tnt-llm-taxonomy-review")
taxa_review_llm_chain = (
    taxonomy_review_prompt | taxonomy_generation_llm | StrOutputParser()
).with_config(run_name='ReviewTaxonomy')

review_taxonomy_chain = taxa_review_llm_chain | parse_taxa

def review_taxonomy(
        state: TaxonomyGenerationState, config: RunnableConfig
) -> TaxonomyGenerationState:
    batch_size = config['configurable'].get("batch_size", 200)
    original = state['documents']
    indices = list(range(len(original)))
    random.shuffle(indices)
    return invoke_taxonomy_chain(
        review_taxonomy_chain, state, config, indices[:batch_size]
    )

# Define the Graph

In [9]:
from langgraph.graph import StateGraph, START, END

graph = StateGraph(TaxonomyGenerationState)
graph.add_node("summarize", map_reduce_chain)
graph.add_node("get_minibatches", get_minibatches)
graph.add_node("generate_taxonomy", generate_taxonomy)
graph.add_node("update_taxonomy", update_taxonomy)
graph.add_node("review_taxonomy", review_taxonomy)

graph.add_edge('summarize', 'get_minibatches')
graph.add_edge("get_minibatches", "generate_taxonomy")
graph.add_edge("generate_taxonomy", 'update_taxonomy')

def should_review(state: TaxonomyGenerationState) -> str:
    num_minibatches = len(state['minibatches'])
    num_revisions = len(state['clusters'])
    if num_revisions < num_minibatches:
        return "update_taxonomy"
    return "review_taxonomy"
graph.add_conditional_edges(
    "update_taxonomy",
    should_review,
    # Optional (but required for the diagram to be drawn correctly below)
    {"update_taxonomy": "update_taxonomy", 'review_taxonomy': "review_taxonomy"},
)
graph.add_edge('review_taxonomy', END)
graph.add_edge(START, 'summarize')

app = graph.compile()

# Download customer reviews for a products

# Kaggle Credentials

I don't want to use google drive here, so i decided to use Kaggle API to directly  download dataset from Kaggle competition page.
For this:

* First, you need to install the Kaggle API by running the following command in a code cell:   


```
!pip install kaggle
```

* Next, you need to download your Kaggle API key. To do this, go to your Kaggle account settings page, scroll down to the "API" section, and click on the "Create New API Token" button. This will download a JSON file with your Kaggle API credentials.  
link: https://www.machinelearningmindset.com/kaggle-dataset-in-google-colab/


* Upload your Kaggle API key to Jupyter Notebook by clicking on the "Files" tab on the left-hand side of the Colab interface, then click on "Upload" and select the JSON file containing your Kaggle API credentials.

* Run the following commands in a code cell to authenticate the Kaggle API:


```
import os
os.environ['KAGGLE_USERNAME'] = "your_kaggle_username"
os.environ['KAGGLE_KEY'] = "your_kaggle_api_key"

```


In [3]:
# Downloading dataset from kaggle
import os
import random
import shutil
import requests
import zipfile
from pathlib import Path

!kaggle datasets download -d nicapotato/womens-ecommerce-clothing-reviews



# Unzip journey-springfield.zip
with zipfile.ZipFile("womens-ecommerce-clothing-reviews.zip", "r") as zip_ref:
    print("Unzipping  data...") 
    zip_ref.extractall()
    print("Unzipping complete.")

Dataset URL: https://www.kaggle.com/datasets/nicapotato/womens-ecommerce-clothing-reviews
License(s): CC0-1.0
Downloading womens-ecommerce-clothing-reviews.zip to /Users/azamatkuzdibayev/train/llm/projects/Text-mining-at-Scale-for-taxonomy
 72%|███████████████████████████▎          | 2.00M/2.79M [00:00<00:00, 3.18MB/s]
100%|██████████████████████████████████████| 2.79M/2.79M [00:00<00:00, 3.74MB/s]


In [7]:
import pandas as pd

df  = pd.read_csv("./Womens Clothing E-Commerce Reviews.csv")

In [8]:
df

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
...,...,...,...,...,...,...,...,...,...,...,...
23481,23481,1104,34,Great dress for many occasions,I was very happy to snag this dress at such a ...,5,1,0,General Petite,Dresses,Dresses
23482,23482,862,48,Wish it was made of cotton,"It reminds me of maternity clothes. soft, stre...",3,1,0,General Petite,Tops,Knits
23483,23483,1104,31,"Cute, but see through","This fit well, but the top was very see throug...",3,0,1,General Petite,Dresses,Dresses
23484,23484,1084,28,"Very cute dress, perfect for summer parties an...",I bought this dress for a wedding i have this ...,3,1,2,General,Dresses,Dresses
