# Dependencies

## Requirements

In [None]:
#!pip install sentence_transformers langchain openai tqdm datasets asyncio scikit-learn cohere tiktoken umap altair

In [None]:
import numpy as np
import re
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import List
import enum

from langchain_community.llms import Ollama
from langchain.output_parsers.regex_dict import RegexDictParser
from langchain.output_parsers import PydanticOutputParser
from langchain_core.messages import HumanMessage, SystemMessage, ChatMessage
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from pydantic import BaseModel, Field, validator, create_model
from openai import AsyncOpenAI, OpenAI
#import asyncio
import os

import requests

from pydantic import BaseModel, ValidationInfo, model_validator


import json

import itertools
from copy import deepcopy
from tqdm.notebook import tqdm, trange
from sklearn.cluster import KMeans

import umap.umap_ as umap
#import umap
import hdbscan

In [None]:
from src.bubble import *
from src.models import *
from src.utilities import *

In [None]:
prompts_path = "Prompts/fr/"

## Bubble API

In [None]:
feedbacks_df = get("Feedback", max_objects=100)

In [None]:
types_df = get("Type", constraints=[])
categories_df = get("Category")
original_subcategories_df = get("SubCategory")


In [None]:
company_infos = bubble_client.get(
    "Company",
    bubble_id=COMPANY_ID,
)
project_infos = bubble_client.get(
    "Project",
    bubble_id=PROJECT_ID,
)

In [None]:
types_df

In [None]:
TypeInsight = enum.Enum("Type de l'insight", [(convert_text_to_constants(x), x) for x in types_df.Name])
TypeInsight("Point positif")

In [None]:
TypeInsight.POINT_POSITIF

In [None]:
TypeInsight = enum.Enum("Type de l'insight", [(convert_text_to_constants(x), x) for x in types_df.Name])
CategoryInsight = enum.Enum("Categories de l'insight", [(convert_text_to_constants(x), x) for x in categories_df.Name])
dict_SubCategoriesInsight = {
    row["Name"]:enum.Enum("Categories de l'insight", [(convert_text_to_constants(x), x) for x in original_subcategories_df[original_subcategories_df["Parent_category"] == row["_id"]].Name])
    for _,row in categories_df.iterrows()
}

In [None]:
categories_df

In [None]:
# Build a flat sub-category table that also carries the name and id of the
# parent category. Both join keys are cast to str so the merge compares
# values of the same type.
original_subcategories_df['Parent_category'] = original_subcategories_df['Parent_category'].astype(str)
categories_keyed = categories_df.assign(Parent_category=categories_df['_id'].astype(str))
df = original_subcategories_df.merge(categories_keyed, on=["Parent_category", "Company"])
# Columns present in both frames carry the `_x` (sub-category side) and
# `_y` (category side) suffixes after the merge.
df = df[["Name_x", "Name_y", "Company", "Description", "_id_x", "_id_y"]].rename(
    columns={
        "Name_x": "Name",
        "Name_y": "Category",
        "_id_x": "_id",
        "_id_y": "Category_id",
    }
)
subcategories_df = df
subcategories_df.head()

In [None]:
# One enum member per "<category> : <sub-category>" label. As for the other
# enums built in this notebook, the member *name* is the constant-style
# identifier and the member *value* is the human-readable label (the pair was
# previously given in the opposite order).
SubCategoriyInsight = enum.Enum(
    "Categories de l'insight",
    [
        (convert_text_to_constants(category + " : " + name), category + " : " + name)
        for category, name in zip(subcategories_df["Category"], subcategories_df["Name"])
    ],
)

In [None]:
types_descr = columns_to_string(types_df, "Name", "Description")
print(types_descr)


In [None]:
tags_descr = columns_to_string(subcategories_df, "Name", "Description")
print(tags_descr)

In [None]:
#example_insight = "Manque de clarté de l'affichage des prix en magasin"
#exemple_commentaire = "je suis exclusif metro je n ai aucun representant j achetais jusqu a present tout metro par facilite mais je suis tres souvent décue par la reponse ha non on n en a pas cela arrive demain je pense que depuis le covid tout le monde ou presque s en fou!!!"
#examples_insights_df = pd.DataFrame([
#    {"Insights qui devraient en découler": "Déceptions face aux retards de livraison"},
#    {"Insights qui devraient en découler": "Impression d'une baisse de qualité du service depuis le Covid"},
#])

feedback_context = {
            "entreprise": company_infos["Name"],
            "context": company_infos['Context'],
            "role": company_infos['Role'],
            "cible": project_infos['Target'],
            "insight_types": types_descr,
            "insight_categories": tags_descr,
            #"question": project_infos['Study_question'],
            #"exemple_commentaire": exemple_commentaire,
            #"example_insights": '\n- '.join(list(examples_insights_df['Insights qui devraient en découler'])),
        }

feedback_context

# Insights extraction

### Aspects and Insights creation

In [None]:
#FeedbackIndex = enum.Enum("Indice du retour associé", [(str(i), i) for i in range(BATCH_SIZE)])

# Reference to a sub-category: an integer index together with its name.
# When a validation context is supplied, both fields are checked against the
# sub-categories listed under the context key "sous_categories".
# (Kept as a comment rather than a class docstring so the model's generated
# schema description is left untouched.)
class SousCategorie(BaseModel):
    indice: int = Field(description="Indice de la sous-catégorie. Doit être un entier.")
    nom: str = Field(description="Nom de la sous-catégorie.")

    def __str__(self):
        """Return the sub-category formatted as ``"<nom> (<indice>)"``."""
        return self.nom + ' ('+str(self.indice)+')'

    @model_validator(mode="after")
    def validate_ids(self, info: ValidationInfo):
        """Check ``indice`` and ``nom`` against the allowed sub-categories.

        The check is skipped when no validation context is given, or when the
        context has no ``"sous_categories"`` entry.

        Raises:
            ValueError: if ``indice`` or ``nom`` is not among the
                sub-categories of the context (surfaced by pydantic as a
                validation error).
        """
        context = info.context
        if not context:
            return self

        tags: List[SousCategorie] = context.get("sous_categories")
        if tags is None:
            # Nothing to validate against.
            return self

        # Explicit raises instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently disable this validation.
        if self.indice not in {tag.indice for tag in tags}:
            raise ValueError(f"sous_categories ID {self.indice} not found in context")
        if self.nom not in {tag.nom for tag in tags}:
            raise ValueError(f"sous_categories name {self.nom} not found in context")
        return self
    
# Sub-category enriched with its parent category and a description.
# Inherits `indice`, `nom` and the context validation from SousCategorie.
class SousCategorieAvecDescription(SousCategorie):
    categorie : str  # parent category of the sub-category
    description: str  # free-text description of the sub-category


# One aspect mentioned in a feedback: the sub-category concerned, a
# satisfaction rating and an optional explanation.
# (Kept as a comment rather than a class docstring so the model's generated
# schema description is left untouched.)
class Aspect(BaseModel):
    sous_categorie : SousCategorie = Field(description="Sous-catégorie concernée.")
    note_satisfaction : int = Field(description="Note de satisfaction du client concernant cette sous-catégorie, de 1 (pas content) à 5 (très content).")
    explication: str = Field(description="Eventuelle explication du ressenti du client, si celle-ci parait importante à faire remonter au sein de l'entreprise. Doit être aussi claire et concise que possible.")

    def __str__(self):
        """Return a multi-line summary: sub-category, rating out of 5, explanation."""
        return '\n' + str(self.sous_categorie) + '\nSatisfaction: ' + str(self.note_satisfaction) + "/5\nExplication: " + self.explication

    @model_validator(mode="after")
    def validate_ids(self, info: ValidationInfo):
        """Reject ratings outside the 1-5 scale announced in the field description.

        Raises:
            ValueError: if ``note_satisfaction`` is not between 1 and 5
                (surfaced by pydantic as a validation error).
        """
        # Explicit raise with a message instead of a bare `assert`: assertions
        # are stripped under -O and gave no hint about what was wrong.
        if not 1 <= self.note_satisfaction <= 5:
            raise ValueError(
                f"note_satisfaction must be between 1 and 5, got {self.note_satisfaction}"
            )
        return self

# Container for all the aspects extracted from one feedback.
class ListAspects(BaseModel):
    list_aspects: List[Aspect] = Field(description="Liste des différents aspects évoqués dans le feedback.")

    def __str__(self):
        """Render every aspect with ``str`` and join the results with newlines."""
        rendered_aspects = map(str, self.list_aspects)
        return '\n'.join(rendered_aspects)
    

# Input taken by `tag_request`: the texts to analyse plus the sub-categories
# (with descriptions) available for tagging.
class AspectsRequest(BaseModel):
    texts: List[str]  # texts to analyse
    sous_categories: List[SousCategorieAvecDescription]  # candidate sub-categories


# Output returned by `tag_request`: the analysed texts and the predictions
# obtained for them.
class AspectsResponse(BaseModel):
    texts: List[str]
    # `tag_single_request` asks the client for `response_model=ListAspects`,
    # so each prediction is a ListAspects (all aspects of one text), not a
    # single Aspect.
    predictions: List[ListAspects]

In [None]:
# One SousCategorieAvecDescription per row of subcategories_df. The row's
# index label is used as `indice`, and `nom` is "<Category> : <Name>".
sous_categories = [
    SousCategorieAvecDescription(
        indice=entry.Index,
        nom=entry.Category + " : " + entry.Name,
        categorie=entry.Category,
        description=entry.Description,
    )
    for entry in subcategories_df.itertuples()
]
sous_categories

In [None]:
with open(prompts_path+'prompt_aspects.txt') as f:
    prompt_aspects = PromptTemplate.from_template(f.read())

print(prompt_aspects.template)

In [None]:
", ".join([f"`{tag}`" for tag in sous_categories])

In [None]:
context

In [None]:
feedback_context

In [None]:
def tag_single_request(prompt: str, sous_categories: List[SousCategorie]) -> ListAspects:
    """Ask the chat model for the aspects mentioned in one feedback prompt.

    Args:
        prompt: Fully rendered user prompt.
        sous_categories: Sub-categories the model may pick from. Their
            ``(indice, nom)`` pairs are listed in the conversation and the
            objects themselves are passed as validation context.

    Returns:
        What the client returns for ``response_model=ListAspects``.
    """
    allowed_tags = [(tag.indice, tag.nom) for tag in sous_categories]
    allowed_tags_str = ", ".join([f"`{tag}`" for tag in allowed_tags])

    # The original literal contained the invalid escape "\E"; a blank line
    # ("\n\n") before "En tant que" is what was intended.
    system_prompt = (
        f"Tu es {feedback_context['role']} au sein de l'entreprise {feedback_context['entreprise']}. "
        f"Voici un bref rappel sur cette entreprise: \n'{feedback_context['context']}'\n\n"
        f"En tant que  {feedback_context['role']}, tu est spécialisé dans l'analyse de commentaire."
    )

    return client.chat.completions.create(
        model="mixtral",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
            {
                "role": "user",
                "content": f"Voici les sous-catégories: {allowed_tags_str}",
            },
        ],
        response_model=ListAspects,  # Minimizes the hallucination of tags that are not in the allowed tags.
        validation_context={"sous_categories": sous_categories},
    )

def tag_request(request: AspectsRequest) -> AspectsResponse:
    """Run the aspect extraction on every text of ``request``.

    Args:
        request: Texts to analyse and the sub-categories available for them.

    Returns:
        An ``AspectsResponse`` holding the input texts and one prediction per
        text, in the same order.
    """
    # AspectsRequest declares `sous_categories`; it has no `tags` attribute.
    # NOTE(review): each prediction is whatever `tag_single_request` returns;
    # make sure `AspectsResponse.predictions` accepts that type.
    predictions = [
        tag_single_request(text, request.sous_categories)
        for text in request.texts
    ]

    return AspectsResponse(texts=request.texts, predictions=predictions)

In [None]:
#feedback ="J'ai commandé une paire de chaussures sur votre site. Le site était facile à utiliser mais j'ai galéré à trouver ma taille. La livraison a été super rapide, mais les chaussures étaient trop petites. J'ai contacté le service client pour les renvoyer et ils m'ont dit que je devais payer les frais de retour. Du coup, j'ai décidé de les garder et de les donner à ma sœur. Elles sont bien mais un peu trop serrées pour moi."
feedback ="I ordered a pair of shoes on your site. The site was easy to use but I had a hard time finding my size. The delivery was super fast, but the shoes were too small. I contacted the customer service to return them and they told me I had to pay the return shipping. So I decided to keep them and give them to my sister. They are good but a little too tight for me."
prompt = prompt_aspects.invoke({"feedback": feedback})
response = tag_single_request(prompt.text, sous_categories=sous_categories)
print(response)

In [None]:
#feedback ="J'ai commandé une paire de chaussures sur votre site. Le site était facile à utiliser mais j'ai galéré à trouver ma taille. La livraison a été super rapide, mais les chaussures étaient trop petites. J'ai contacté le service client pour les renvoyer et ils m'ont dit que je devais payer les frais de retour. Du coup, j'ai décidé de les garder et de les donner à ma sœur. Elles sont bien mais un peu trop serrées pour moi."
feedback ="I ordered a pair of shoes on your site. The site was easy to use but I had a hard time finding my size. The delivery was super fast, but the shoes were too small. I contacted the customer service to return them and they told me I had to pay the return shipping. So I decided to keep them and give them to my sister. They are good but a little too tight for me."
prompt = prompt_aspects.invoke({"feedback": feedback})
response = tag_single_request(prompt.text, sous_categories=sous_categories)
print(response)

In [None]:
#feedback ="J'ai commandé une paire de chaussures sur votre site. Le site était facile à utiliser mais j'ai galéré à trouver ma taille. La livraison a été super rapide, mais les chaussures étaient trop petites. J'ai contacté le service client pour les renvoyer et ils m'ont dit que je devais payer les frais de retour. Du coup, j'ai décidé de les garder et de les donner à ma sœur. Elles sont bien mais un peu trop serrées pour moi."
feedback = feedbacks_df.loc[0, "Content"]
prompt = prompt_aspects.invoke({"feedback": feedback})
response = tag_single_request(prompt.text, sous_categories=sous_categories)
print(response)

In [None]:
feedbacks_df.loc[0, "Content"]

In [None]:
# Extract the aspects of every feedback, one model call per row.
# `responses` is filled in iteration order of feedbacks_df.
responses = []
for i, feedback in tqdm(feedbacks_df.iterrows()):
    # Render the aspects prompt for this feedback's text.
    prompt = prompt_aspects.invoke({"feedback": feedback['Content']})
    response = tag_single_request(prompt.text, sous_categories=sous_categories)
    responses.append(response)

In [None]:
# Create one Bubble "Aspect" object per extracted aspect and attach the
# created objects to the feedback they come from.
#
# Responses are paired with feedbacks by position: `responses` is a plain list
# filled in iteration order, whereas the DataFrame index labels are not
# guaranteed to be 0..n-1.
if len(responses) != len(feedbacks_df):
    raise ValueError(
        f"Expected one response per feedback, got {len(responses)} responses "
        f"for {len(feedbacks_df)} feedbacks"
    )

for (_, feedback), response in tqdm(zip(feedbacks_df.iterrows(), responses), total=len(responses)):
    results = bubble_client.create(
        "Aspect",
        [
            {
                "Company": COMPANY_ID,
                "Project": PROJECT_ID,
                # `indice` is the subcategories_df index label of the sub-category.
                "Category": subcategories_df.loc[aspect.sous_categorie.indice, "Category_id"],
                "Consequence": "",
                "Explanation": aspect.explication,
                "Rating": aspect.note_satisfaction,
                "Sub_category": subcategories_df.loc[aspect.sous_categorie.indice, "_id"],
                "Associated_feedback": feedback["_id"],
            }
            for aspect in response.list_aspects
        ],
    )

    bubble_client.update_object(bubble_type="Feedback", bubble_id=feedback["_id"], fields={"Aspects": [res['id'] for res in results]})


In [None]:
with open(prompts_path+'prompt_insights_creation.txt') as f:
    prompt_insights = PromptTemplate.from_template(f.read())

In [None]:
# Build one insights-creation prompt per batch of feedbacks.
prompts = []
BATCH_SIZE = 10  # batch size passed to batchify; also reused by later cells

for batch_df in batchify(feedbacks_df, size=BATCH_SIZE):
    # Start from a copy of the shared context so each batch has its own dict.
    context = deepcopy(feedback_context)
    # Feedbacks are listed as "<index label> : <text>", separated by blank lines.
    context["feedbacks"] = '\n\n'.join([str(i)+" : "+x for i, x in zip(batch_df.index, batch_df["Content"])])  
    #"- "+"\n- ".join(batch_df['content'])
    #context["insights"] = "- "+"\n- ".join(batch_df['content'])
    prompts.append(prompt_insights.invoke(context))

print(len(prompts))

In [None]:
print(prompts[0].text)

In [None]:
responses = apply_analysis(prompts, InsightsList, bar=True)
list_batch_insights_df = [pd.DataFrame(enum_to_str(response.insights_list)) for response in responses]

print(len(list_batch_insights_df), "batch have been processed")

In [None]:
responses[0].insights_list


In [None]:
[len(df) for df in list_batch_insights_df]

In [None]:
list_batch_insights_df[0]

In [None]:
list(list_batch_insights_df[0]['contenu'])

## Associate newly created insights to feedbacks

In [None]:
with open(prompts_path+'prompt_feedbacks.txt') as f:
    prompt_feedbacks = PromptTemplate.from_template(f.read())

In [None]:
# Closed set of sentiment labels used for `Feedback.sentiment`.
# Mixing in `str` makes every member a string equal to its value.
class Sentiment(str, enum.Enum):
    POSITIF = "Positif"
    NEUTRE = "Neutre"
    NEGATIF = "Négatif"


In [None]:
# Enum of the allowed insight indices: members named "0".."BATCH_SIZE-1" with
# the matching integer as value.
# NOTE(review): this caps the index at BATCH_SIZE, which is the feedback batch
# size — confirm a batch can never yield more insights than that.
InsightsIndex = enum.Enum("Indice de l'insight associé", [(str(i), i) for i in range(BATCH_SIZE)])

# Information attached to one feedback: the indices of its related insights
# and the sentiment it expresses.
class Feedback(BaseModel):
        insights_list: List[InsightsIndex] = Field(description="Indices des insights associés à ce retour")
        sentiment: Sentiment = Field(description="Sentiment exprimé")

# Wrapper holding a list of Feedback entries.
class FeedbackInfosList(BaseModel):
        feedbacks_list: List[Feedback] = Field(description="Liste des informations associées aux feedbacks.")

In [None]:
# Build one association prompt per batch: the batch's feedbacks together with
# the insights that were created from that same batch.
prompts = []
for batch_insights_df, batch_feedbacks_df in zip(list_batch_insights_df, batchify(feedbacks_df, size=BATCH_SIZE)):
    context = deepcopy(feedback_context)
    # Feedbacks are numbered with their own index and read from the "Content"
    # column (the column name used everywhere else for the feedback text).
    # Zipping them with the insights index would drop feedbacks whenever a
    # batch produced fewer insights than it has feedbacks.
    context["feedbacks"] = '\n'.join([str(i)+" : "+x for i, x in zip(batch_feedbacks_df.index, batch_feedbacks_df["Content"])])
    context["insights"] = '\n'.join([str(i)+" : "+x for i, x in zip(batch_insights_df.index, batch_insights_df["contenu"])])
    prompts.append(prompt_feedbacks.invoke(context))



In [None]:
print(prompts[0].text)

In [None]:

responses = apply_async_analysis(prompts, FeedbackInfosList)

list_enriched_feedbacks_df = [pd.DataFrame(enum_to_str(response.feedbacks_list)) for response in responses]

In [None]:
[len(df) for df in list_enriched_feedbacks_df]

In [None]:
len(pd.concat(list_enriched_feedbacks_df))

In [None]:
# Copy the per-batch model outputs onto the matching rows of feedbacks_df.
# NOTE(review): the right-hand sides are Series, and pandas aligns them on
# index labels when assigning through .loc. Confirm that enriched_feedbacks_df
# carries the same labels as batch_index_feedbacks; otherwise rows of every
# batch after the first may receive NaN.
for batch_insights_df, batch_index_feedbacks, enriched_feedbacks_df in zip(list_batch_insights_df, batchify(feedbacks_df.index, size=BATCH_SIZE), list_enriched_feedbacks_df):
    feedbacks_df.loc[batch_index_feedbacks, 'sentiment'] = enriched_feedbacks_df['sentiment']
    feedbacks_df.loc[batch_index_feedbacks, 'insights_index'] = enriched_feedbacks_df['insights_list']

In [None]:
batch_insights_df

In [None]:
list_batch_feedbacks_df = [pd.DataFrame(enum_to_str(response.feedbacks_list)) for response in responses]
list_batch_feedbacks_df

In [None]:
feedbacks_df

In [None]:
list_batch_feedbacks_df[0]

In [None]:
list_batch_insights_df[-1]

In [None]:
[x for x in batchify(feedbacks_df, size=BATCH_SIZE)][-1]

In [None]:
[len(df) for df in list_batch_feedbacks_df]

In [None]:

l = [response.feedbacks_list for response in responses]
l = list(itertools.chain.from_iterable(l))
feedbacks_infos_df = pd.DataFrame(enum_to_str(l))
feedbacks_infos_df

In [None]:
feedbacks_infos_df

In [None]:
feedbacks_infos_df

In [None]:
feedbacks_df['sentiment'] = feedbacks_infos_df['sentiment']
feedbacks_df['insights_list'] = feedbacks_infos_df['insights_list']
feedbacks_df

## Feedbacks attribution

In [None]:
insights_enum = enum.Enum("Insight associé", [(convert_text_to_constants(x), i) for i, x in zip(batch_insights_df.index, batch_insights_df["content"])])

In [None]:
with open(prompts_path+'prompt_feedbacks.txt') as f:
    prompt_feedbacks = PromptTemplate.from_template(f.read())

In [None]:
feedback_parser = PydanticOutputParser(pydantic_object=Feedback)

prompt_feedback = PromptTemplate.from_template(
    template= prompt_template_feedback,
    partial_variables= {"format_instructions": feedback_parser.get_format_instructions()},
)

prompts = []
for feedback in feedbacks_df[feedbacks_column]:
    context = deepcopy(feedback_context)
    context["feedback"] = feedback
    prompts.append(prompt_feedback.invoke(context))

#print(prompts[0].text)

In [None]:
parsed_responses = safe_async_analysis(prompts, feedback_parser)

feedbacks_df["sentiment"] = [rep.sentiment for rep in parsed_responses]
feedbacks_df["insights"] = [[] for rep in parsed_responses]

k=0
insights = []
for i, rep in enumerate(parsed_responses):
    for j, insight in enumerate(rep.insights_list):
        insights.append(insight)
        feedbacks_df["insights"].iloc[i].append(str(k))
        k += 1

In [None]:
feedbacks_df.head()

In [None]:
insights_df = pd.DataFrame({
    "content":insights,
    "feedback_count": 1,
    })

In [None]:
feedbacks_df

In [None]:
# Link every insight back to the feedback that references it.
insights_df["related_feedback"] = [[] for _ in range(len(insights_df))]

# Position of the column, so the write below stays positional like the
# original `.iloc` access.
related_col = insights_df.columns.get_loc("related_feedback")

for _, row in feedbacks_df.iterrows():
    for j in row["insights"]:
        # Write straight into the frame. The previous chained form
        # `insights_df[col].iloc[k] = ...` assigns through an intermediate
        # Series and is not guaranteed to update insights_df.
        insights_df.iat[int(j), related_col] = row['_id']

insights_df["childrens"] = [[] for _ in range(len(insights_df))]

insights_df.head()

# Insights categorisation

### Tagging

In [None]:


# Build the textual list of filters and their tags: each filter name on its
# own paragraph, followed by one "- <tag name> (<tag id>)" line per tag.
# The accumulator is initialised here so the cell does not depend on (or keep
# appending to) a value left over from a previous run.
prompt_tags = ""

# `filter_row` rather than `filter`, so the builtin is not shadowed for the
# rest of the notebook.
for _, filter_row in filters_df.iterrows():
    prompt_tags += '\n\n'+filter_row["Name"]
    tags = tags_df[tags_df["Filter"] == filter_row["_id"]]
    for _, tag in tags.iterrows():
        prompt_tags += '\n'+"- "+tag["Name"]+' ('+tag["_id"] +')'

print(prompt_tags)


In [None]:
with open(prompts_path+'prompt_categorsiation.txt') as f:
    prompt_categorsiation = PromptTemplate.from_template(f.read())

In [None]:
# Result of the categorisation step for one insight: the identifiers of its
# tags and, optionally, its text.
# (Kept as a comment rather than a class docstring so the model's generated
# schema description is left untouched.)
class FirstInsight(BaseModel):
    tags_id: List[str] = Field(description="Identifiants des tags de l'insight")
    content: str = ""

    def __str__(self):
        """Return the content as a bullet followed by the comma-separated tag ids."""
        # The model declares `tags_id`; it has no `insight_types` attribute,
        # so the previous implementation raised AttributeError.
        return '- ' + self.content + "\nTypes: " + ', '.join(self.tags_id)

In [None]:
categorsiation_parser = PydanticOutputParser(pydantic_object=FirstInsight)

prompt_categorsiation = PromptTemplate.from_template(
    template= prompt_template_categorsiation,
    partial_variables= {"format_instructions": categorsiation_parser.get_format_instructions()},
)

prompts = []
for insight in insights_df["content"]:
    context = deepcopy(feedback_context)
    context["insight"] = insight
    prompts.append(prompt_categorsiation.invoke(context))

#print(prompts[0].text)

In [None]:
parsed_responses = safe_async_analysis(prompts, categorsiation_parser)


In [None]:

insights_df["tag"] = [rep.tags_id for rep in parsed_responses]
#insights_df["Insights"] = [[] for rep in parsed_responses]


### Types affectation

In [None]:
# Textual list of the insight types, one "- <Title> (<_id>) : <Definition>"
# line per row of types_df, each line preceded by a newline.
prompt_types = "".join(
    '\n'+"- "+tag["Title"]+' ('+tag["_id"] +') : ' + tag["Definition"]
    for _, tag in types_df.iterrows()
)

print(prompt_types)

In [None]:
categorsiation_parser = PydanticOutputParser(pydantic_object=FirstInsight)

prompt_categorsiation = PromptTemplate.from_template(
    template= prompt_template_types,
    partial_variables= {"format_instructions": categorsiation_parser.get_format_instructions()},
)

prompts = []
for insight in insights_df["content"]:
    context = deepcopy(feedback_context)
    context["insight"] = insight
    prompts.append(prompt_categorsiation.invoke(context))

#print(prompts[0].text)

In [None]:
parsed_responses = safe_async_analysis(prompts, categorsiation_parser)


In [None]:
insights_df["type"] = [rep.insight_type for rep in parsed_responses]

In [None]:
feedbacks_df.to_csv(project_path+'/feedbacks.csv', index_label='Index')
insights_df.to_csv(project_path+'/insights.csv', index_label='Index')

# Data cleaning

# Visualisation

In [None]:
insight_layers = [
    pd.DataFrame(bubble_client.get_objects(
        "python_insight",
        [
            BubbleField("step") == i+1,
            BubbleField("company") == company_id,
            ],
    )) for i in range(n_layers)
]

In [None]:
insight_layers[0].tail()

In [None]:
sentences = insight_layers[0]["content"]
sentence_embeddings = embedding_model.encode(sentences)
sentence_embeddings.shape

In [None]:
insight_layers[0]['parent']

In [None]:
insight_layers[0]

In [None]:
def to_int(i):
    """Convert ``i`` to ``int``, returning ``-1`` when it cannot be converted.

    Args:
        i: Any value accepted by ``int()`` (number, numeric string, ...).

    Returns:
        ``int(i)``, or ``-1`` if the conversion fails (``None``, non-numeric
        text, NaN, infinity).
    """
    try:
        return int(i)
    # Only conversion failures are mapped to -1. A bare `except` would also
    # swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return -1

for layer in insight_layers:
    layer['parent'] = layer['parent'].apply(to_int)


In [None]:
list(insight_layers[1]["content"])

In [None]:
# For every layer, print the content of the insights that have no parent.
for layer in insight_layers:
    # Use the layer being iterated (the previous code always read layer 0).
    # A missing parent is the raw string 'None', or -1 once the column has
    # been converted with `to_int`, so both markers are accepted.
    without_parent = layer['parent'].isin(['None', -1])
    print(list(layer[without_parent]["content"]))

In [None]:
sum(insight_layers[0]['parent']<0)

In [None]:
insight_layers[1].iloc[insight_layers[0]['parent'], "content"]

In [None]:
insight_layers[0].loc[0, "cluster"] == 0

In [None]:
map_to_parent(0, insight_layers[1])

In [None]:
insight_layers[1].loc[0, 'parent']

In [None]:
#@Insight Plot the archive {display-mode: "form"}

# UMAP reduces the dimensions from 1024 to 2 dimensions that we can plot
reducer = umap.UMAP(n_neighbors=15)
umap_embeds = reducer.fit_transform(sentence_embeddings)

def map_to_parent(i, parents_df):
    """Return the ``content`` of row ``i`` of ``parents_df``, or ``""`` if absent.

    Args:
        i: Index label of the parent row.
        parents_df: DataFrame expected to have a ``content`` column.

    Returns:
        ``parents_df.loc[i, 'content']``, or an empty string when the lookup
        fails (unknown label, missing column, unusable key).
    """
    try:
        return parents_df.loc[i, 'content']
    # Lookup failures only. A bare `except` would also hide
    # KeyboardInterrupt and SystemExit.
    except (KeyError, IndexError, TypeError, ValueError):
        return ""
    
# Prepare the data to plot and interactive visualization
# using Altair
df_explore = pd.DataFrame(data={
    'content': insight_layers[0]['content'], 
    'parent': insight_layers[0]['parent'].apply(lambda x: map_to_parent(x, insight_layers[1])),
    'cluster': insight_layers[0]['cluster'].astype(str),
    })
df_explore['x'] = umap_embeds[:,0]
df_explore['y'] = umap_embeds[:,1]
df_explore


In [None]:

# Plot
chart = alt.Chart(df_explore).mark_circle(size=60).encode(
    x=#'x',
    alt.X('x',
        scale=alt.Scale(zero=False)
    ),
    y=
    alt.Y('y',
        scale=alt.Scale(zero=False)
    ),
    color='cluster',
    tooltip=['content', "parent"]
).properties(
    width=700,
    height=400
)
chart.interactive()

TF-IDF

In [None]:
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
# `stopwords.words('french')` (used by the TF-IDF helper) reads the
# "stopwords" corpus, which is a separate download from "punkt".
nltk.download('stopwords')

In [None]:
def td_idf(documents):
    """Compute the TF-IDF weights of ``documents``, French stop words removed.

    Args:
        documents: Iterable of raw text documents.

    Returns:
        A DataFrame with one row per document and one column per vocabulary
        term that is not an NLTK French stop word, holding the TF-IDF weights.
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    df = pd.DataFrame(vectors.toarray(), columns=feature_names)
    # Keep only the columns whose term is not a French stop word. The result
    # is returned: callers assign it (`df = td_idf(...)`).
    return df[df.columns.difference(stopwords.words('french'))]


In [None]:
df = td_idf(feedbacks_df['content'])
#print('\n'.join(df.columns))

In [None]:
#print('\n'.join(df.columns))

In [None]:

def get_top_two_columns(row, n=5):
    """Return the labels of the ``n`` largest values of ``row``.

    Despite its historical name, the function has always returned the top
    five labels; that remains the default.

    Args:
        row: A pandas Series (e.g. one row of a TF-IDF matrix).
        n: Number of labels to return. Defaults to 5.

    Returns:
        The index labels of the ``n`` largest values, largest first.
    """
    return row.nlargest(n).index.tolist()

top_two_columns_df = df.apply(get_top_two_columns, axis=1)

print(top_two_columns_df)

In [None]:
#print('\n'.join(insights_df['content']))