In [None]:
from langchain import PromptTemplate
from langchain.chat_models import PromptLayerChatOpenAI
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain, StuffDocumentsChain
from langchain.chains import LLMChain
from langchain.output_parsers import CommaSeparatedListOutputParser
from langchain.docstore.document import Document
import os
import promptlayer
import pandas as pd


# Chat model name handed to PromptLayerChatOpenAI when the shared LLM is built.
gpt_model = "gpt-4"
# Both API keys are read from the environment; a missing variable raises KeyError here.
promptlayer.api_key = os.environ["PROMPTLAYER_API_KEY"]
# NOTE(review): presumably PromptLayer's wrapper around the OpenAI module — `openai` is
# not referenced again in the visible cells; confirm whether this binding is still needed.
openai = promptlayer.openai
openai.api_key = os.environ["OPENAI_API_KEY"]





# Parameters

In [None]:
# CSV of Instagram reels; the loading cell reads its 'Caption' column.
filename = "instagram-reel-archive.csv"
# CSV holding the running topic list (column "topics"); it is read near the
# start of the notebook and overwritten by the final cell.
topic_filename = "topics.csv"

# Read Data in From Instagram Reels CSV

The Instagram posts that we are attempting to classify are contained within a CSV file. These will be read into an array of post captions.

In [None]:
# Load the reel archive and wrap every caption in a LangChain Document so the
# map-reduce chain can consume it.
df = pd.read_csv(filename)
# NOTE(review): read_csv already consumes the header line by default, so
# iloc[1:] drops the first data row — confirm that row is not a real caption.
captions = df['Caption'].iloc[1:]
doc_captions = []
for caption_text in captions:
    doc_captions.append(Document(page_content=caption_text))


# Read Existing Topics from CSV

This notebook is designed to take as input the global set of topics that have already been put together, most likely as the output of a previous run of this notebook. In this step we use Pandas to read the existing set of topics into the notebook.

In [None]:
# Load the topic list produced by earlier runs of this notebook.
df = pd.read_csv(topic_filename)
existing_topics = df["topics"]

# Build the blank-line-separated string that is later substituted into the
# de-duplication prompt. Only a genuinely empty topic file yields an empty
# string: a file holding exactly one topic must still contribute that topic,
# otherwise the model never sees it and may propose a near-duplicate.
if not existing_topics.empty:
    topics_combined = '\n\n'.join(existing_topics.astype(str))
else:
    topics_combined = ''

print(f"Topics List: {topics_combined}")

# Map Prompt and Chain

The following prompt is used to develop the "map" step of the Map-Reduce chain. This prompt is run on each individual post, and is used to extract a set of "topics" local to that post. 

In [None]:
# Chat model shared by every chain in this notebook; "InstagramClassifier" is
# passed as the PromptLayer tag for its requests.
llm = PromptLayerChatOpenAI(model=gpt_model,pl_tags=["InstagramClassifier"])
# Map-step prompt. {captions} is its only placeholder; the map-reduce chain
# is configured with document_variable_name="captions" to fill it.
map_template = """The following is a set of captions taken from Instagram posts written by a Reproductive Endocrinologist, they are delimeted by ``` . 

Based on these captions please create a comprehensive list of topics relating to fertility, reproduction and women's health. 

If a post does not relate to any of those broad themes, please do not include them in the list of generated topics.
```
{captions}
```
"""

# Build the prompt object from the template string and bind it to the shared LLM.
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm,prompt=map_prompt)

# Reduce Prompt and Chain

The following prompt is for the "reduce" step of the algorithm. It operates on the entire output produced by the "map" chain. In our example, the map chain yields a set of topics for each caption it was run on. These are then grouped together and passed to this reduce chain, which is designed to take the global output of the map step and reduce it down to a final set of unique fertility topics that minimizes contextual overlap.

In [None]:
# Reduce-step prompt. {topics} receives the topic lists produced by the map
# step; {format_instructions} is pre-filled below from the output parser.
reduce_template = """The following is a set of fertility, reproduction and women's health topics which are delimeted by ``` .
Take these and organize these into a final, consolidated list of unique topics. 

You should combine topics into one that are similar but use different variations of the same words. 

Also, you should combine topics that use an acronym versus the full spelling. For example, 'TTC Myths' and 'Trying to Conceive Myths' should be a single topic 'Trying to Concieve Myths'.

{format_instructions}

```
{topics}
```
"""

# The parser supplies the formatting instructions embedded in the prompt and
# is reused later to split the model's reply into a Python list.
output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

reduce_prompt = PromptTemplate(
    template=reduce_template,
    input_variables=["topics"],
    partial_variables={"format_instructions": format_instructions},
)
reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

# Places the incoming documents into the {topics} slot of the reduce prompt.
combine_documents_chain = StuffDocumentsChain(
    llm_chain=reduce_chain,
    document_variable_name="topics",
)

# The same stuff chain is supplied both as the final combine step and as the
# collapse step; token_max is set to 4000.
reduce_documents_chain = ReduceDocumentsChain(
    combine_documents_chain=combine_documents_chain,
    collapse_documents_chain=combine_documents_chain,
    token_max=4000,
)

# Full pipeline: map_chain on each caption document, then the reduce chain.
map_reduce_chain = MapReduceDocumentsChain(
    llm_chain=map_chain,
    reduce_documents_chain=reduce_documents_chain,
    document_variable_name="captions",
    return_intermediate_steps=False,
)





# Run Map-Reduce Chain

In this step we run the actual MapReduceDocumentsChain. It processes all of the captions that were read in from the CSV file and performs the "map" step on each of them. Once the "map" step has completed on all of them, it runs the final "reduce" step and outputs a final list of topics.

In [None]:
# Run the map-reduce chain over the caption documents; `output` is the raw reply text.
output = map_reduce_chain.run(doc_captions)
# Split the comma-separated reply into individual topic strings.
new_topics = output_parser.parse(output)
topic_count = len(new_topics)
print(f"Generated {topic_count} new topics")
print(new_topics)

# Merge New Topic List with Existing One

In the following step, we use a new separate chain to eliminate any new topics which are already covered or similar to topics that were read in at the start of processing. 

In [None]:
# Prompt asking the model to drop proposed topics that duplicate existing
# ones. {new_topics} sits inside the ``` fence, {existing_topics} inside <>.
eliminate_duplicates_template = """The items delimeted by ``` are a list of newly proposed reproductive health topics.

The items delimeted by <> are a list of existing topics. 

Your job is to return a set of newly proposed topics that are different than any of the existing topics.

You should treat any two topics that use different variations and conjugations of the same word roots as being the same.

You should treat any two topics that use an acronym versus the full spelling of that acronym as being the same.

For example, if a newly proposed topic is "IVF Journey", and there exists an existing topic called "Fertility Journey", these two topics are the same and you should not return "IVF Journey"

{format_instructions}

```
{new_topics}
```

<{existing_topics}>

"""

prompt_template = PromptTemplate(
    template=eliminate_duplicates_template,
    input_variables=["new_topics", "existing_topics"],
    partial_variables={"format_instructions": format_instructions},
)
llm_chain = LLMChain(llm=llm, prompt=prompt_template)

# The raw comma-separated reply from the map-reduce run (not the parsed list)
# is passed as the proposed topics; `output` is then rebound to the new reply.
output = llm_chain.predict(
    new_topics=output,
    existing_topics=topics_combined,
)
new_topics = output_parser.parse(output)
final_count = len(new_topics)
print(f"Generated final set of {final_count} topics.")
print(new_topics)



In [None]:
# Display the current contents of new_topics again for inspection.
print(new_topics)

# Write back the new Topics List

In this step, we merge the topics list that was read in earlier with the set of newly defined topics and write the result back to the CSV file.

In [None]:
def _merge_topics(existing, proposed):
    """Return the case-insensitive union of two topic collections.

    Topics are compared by their lower-cased text. When the same topic occurs
    more than once, the first spelling encountered wins, and ``existing`` is
    scanned before ``proposed`` so established spellings are preserved. The
    result keeps first-seen order, which makes the written CSV stable from
    run to run instead of depending on set iteration order.

    Entries that are not strings (for example NaN read back from an empty CSV
    cell) or that are blank after stripping whitespace are skipped rather
    than crashing on ``.lower()`` or being stored as empty topics.

    Args:
        existing: Iterable of topics already on file (list or pandas Series).
        proposed: Iterable of newly generated topics.

    Returns:
        A list of unique topic strings.
    """
    merged = {}
    for topic in [*existing, *proposed]:
        if not isinstance(topic, str):
            continue
        cleaned = topic.strip()
        if cleaned:
            # setdefault keeps the first spelling seen for each lower-cased key.
            merged.setdefault(cleaned.lower(), cleaned)
    return list(merged.values())


# Combine the topics read at the start of the notebook with the new ones.
final_topics_list = _merge_topics(existing_topics, new_topics)

print(final_topics_list)

# Overwrite the topics file so the next run starts from the merged list.
df = pd.DataFrame({'topics':final_topics_list})
df.to_csv(topic_filename,index=False)