# Dependencies

## Requirements

In [None]:
#!pip install sentence_transformers langchain langchain-community openai tqdm datasets scikit-learn cohere tiktoken umap-learn altair hdbscan instructor pandas

In [None]:
import numpy as np
import re
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import List
import enum

from langchain_community.llms import Ollama
from langchain.output_parsers.regex_dict import RegexDictParser
from langchain.output_parsers import PydanticOutputParser
from langchain_core.messages import HumanMessage, SystemMessage, ChatMessage
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from pydantic import BaseModel, Field, validator, create_model
from openai import AsyncOpenAI, OpenAI
import asyncio
import os

import requests
import json

import itertools
from copy import deepcopy
from tqdm.notebook import tqdm, trange
from sklearn.cluster import KMeans

import umap.umap_ as umap
#import umap
import hdbscan

from typing import Literal, Union, Optional
from pydantic.config import ConfigDict

import openai
import instructor

from src.bubble import *
from src.models import *
from src.utilities import *


In [None]:
#PROJECT =  "Metro" #"Cheerz"
#project_path = 'Results/'+PROJECT
#os.makedirs(project_path, exist_ok=True)

In [None]:
# Load the "Aspect" records via the `get` helper (not defined in this notebook;
# presumably provided by the `src.*` star imports). Later cells use the result
# as a pandas DataFrame.
aspects_df = get("Aspect")

In [None]:
# Quick visual check of the first rows that were loaded.
aspects_df.head()

# Insights extraction

In [None]:
# Closed list of insight type labels (kept in French, exactly as stored).
TYPES_LIST = [
    'Point positif',
    'Nouvelle fonctionnalité',
    'Point de douleur',
    'Bug',
]

# Reference tables fetched through the `get` helper.
tags_df = get("Tag", constraints=[])
#types_df = get("Type", constraints=[])
categories_df, subcategories_df = get("Category"), get("SubCategory")

In [None]:
# Fetch the company and project records describing the current study.
company_infos = bubble_client.get("Company", bubble_id=COMPANY_ID)
project_infos = bubble_client.get("Project", bubble_id=PROJECT_ID)

# Variables injected into the prompt templates. The "types" and "tags" entries
# are rendered as "- item" bullet lists.
feedback_context = dict(
    entreprise=company_infos["Name"],
    context=company_infos['Context'],
    role=company_infos['Role'],
    cible=project_infos['Target'],
    types='- ' + ' \n- '.join(TYPES_LIST),
    tags='- ' + ' \n- '.join(
        [tag["Name"] + ' : ' + tag["Description"] for _, tag in tags_df.iterrows()]
    ),
    #"types": '- '+' \n- '.join([row["Name"]+' : '+row["Description"] for _,row in types_df.iterrows()]),
    #"insight_types": types_descr,
    #"insight_categories": tags_descr,
    #"question": project_infos['Study_question'],
    #"exemple_commentaire": exemple_commentaire,
    #"example_insights": '\n- '.join(list(examples_insights_df['Insights qui devraient en découler'])),
)

# Display the assembled context as the cell output.
feedback_context

In [None]:
# Id of the first category whose "Name" is missing; later cells use it as the
# "no category" placeholder.
ID_CATEG_NONE = categories_df.loc[categories_df["Name"].isna(), "_id"].iloc[0]
# All sub-categories whose "Name" is missing (several rows may match).
SUBCATEG_NONE = subcategories_df.loc[subcategories_df["Name"].isna()]
ID_CATEG_NONE, SUBCATEG_NONE

In [None]:
# Enum of insight types: the member name is the constant-style form of the
# label, the member value is the label itself.
TypeInsight = enum.Enum(
    "Type de l'insight",
    [(convert_text_to_constants(label), label) for label in TYPES_LIST],
)
#types_to_id = {convert_text_to_constants(row.Name): row._id for _, row in types_df.iterrows()}

#TypeInsight = enum.Enum("Type de l'insight", [(convert_text_to_constants(row.Name), row.Name) for _, row in types_df.iterrows()])
# Same construction for tags, driven by the "Name" column of tags_df.
TagInsight = enum.Enum(
    "Tag de l'insight",
    [(convert_text_to_constants(tag_name), tag_name) for tag_name in tags_df["Name"]],
)
# Maps each tag's enum member name to the "_id" of its row in tags_df.
tags_to_id = {
    convert_text_to_constants(tag_name): tag_id
    for tag_name, tag_id in zip(tags_df["Name"], tags_df["_id"])
}
#type_to_id = {convert_text_to_constants(row.Name): row._id for _, row in types_df.iterrows()}
#type_to_id[convert_text_to_constants('Point de douleur')]

In [None]:
# Show the generated enum members to check their names and values.
list(TypeInsight)

In [None]:
#FeedbackIndex = enum.Enum("Indice du retour associé", [(str(i), i) for i in range(BATCH_SIZE)])

class Insight(BaseModel):
    # Structured description of a single insight.
    # NOTE: documented with comments rather than a class docstring on purpose,
    # since pydantic exposes a model's docstring in its JSON schema.
    # The French `description` strings are runtime data and are kept verbatim.
    insight: str = Field(
        description="Insight, c'est a dire infirmation importante que révèle cette étude à l'entreprise, et lui permettera d'améliorer son experience utilisateur, sa stratégie ou son produit."
    )
    insight_type: TypeInsight = Field(
        description="Type de l'insight, parmis " + ', '.join(TYPES_LIST)
    )
    insight_tags: List[TagInsight] = Field(
        description="Tags de l'insight. Peut eventuellement être une liste vide."
    )
    associated_indexes: List[int] = Field(
        description="Indices des retours associés."
    )
    details: List[str] = Field(
        description="Détails de l'insights. Peut eventuellement être une liste vide."
    )
    consequences: List[str] = Field(
        description="Conséquences pour l'entrerpise. Peut eventuellement être une liste vide."
    )
    recommandations: List[str] = Field(
        description="Recommandations pour l'entrerpise. Peut eventuellement être une liste vide."
    )

    def __str__(self):
        """Render the insight as an indented, human-readable text report."""
        bullet = '\n    - '
        feedback_refs = [str(idx) for idx in self.associated_indexes]
        tag_labels = [tag.value for tag in self.insight_tags]
        # Explicit "\n" pieces make the trailing spaces of the layout visible.
        return (
            f"{self.insight}\n"
            "\n"
            "Détails:\n"
            f"    - {bullet.join(self.details)}\n"
            "Conséquences:\n"
            f"    - {bullet.join(self.consequences)}\n"
            "Recommandations:\n"
            f"    - {bullet.join(self.recommandations)}\n"
            "Retours: \n"
            f"    - {bullet.join(feedback_refs)}\n"
            "Type: \n"
            f"    {self.insight_type.value}\n"
            "Tags: \n"
            f"    {', '.join(tag_labels)}\n"
            "        "
        )

class ListInsights(BaseModel):
    # Container for the insights deduced in one analysis call.
    # NOTE: comments instead of a class docstring, since pydantic exposes a
    # model's docstring in its JSON schema.
    insights: List[Insight] = Field(description="Liste des insights qui ont été déduits.")

    def __str__(self):
        """Join the text report of every insight, separated by a blank line."""
        return '\n\n'.join(map(str, self.insights))

#ListInsights.model_json_schema() 
    
# Hand-written sample used to eyeball the __str__ rendering.
# It relies on TagInsight having MAGASIN and SERVICE_CLIENT members.
test = ListInsights(
    insights=[
        Insight(
            insight='Accueil chaleureux et personnel serviable',
            insight_type=TypeInsight.POINT_POSITIF,
            insight_tags=[TagInsight.MAGASIN, TagInsight.SERVICE_CLIENT],
            associated_indexes=[96, 123, 475, 1249, 1267, 1372, 1695, 1965],
            details=[],
            consequences=[],
            recommandations=[],
        ),
        Insight(
            insight='Ecoute attentive et conseils pertinents',
            insight_type=TypeInsight.POINT_POSITIF,
            insight_tags=[TagInsight.MAGASIN, TagInsight.SERVICE_CLIENT],
            associated_indexes=[20, 891, 1372],
            details=[],
            consequences=[],
            recommandations=[],
        ),
    ]
)
print(test)

In [None]:
# Load the grouping prompt template from disk.
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which is not UTF-8 everywhere and would garble non-ASCII
# characters (the file is presumably French text, given the Prompts/fr path).
with open('Prompts/fr/prompt_regroupement.txt', encoding='utf-8') as f:
    prompt_regroupement = PromptTemplate.from_template(f.read())


# Print the raw template to check it was read correctly.
print(prompt_regroupement.template)

In [None]:

# Load the prompt used to generate one example answer.
# The encoding is given explicitly: without it, open() falls back to the
# platform default, which is not UTF-8 everywhere and would garble non-ASCII
# characters (the file is presumably French text, given the Prompts/fr path).
with open('Prompts/fr/prompt_regroupement_create_example.txt', encoding='utf-8') as f:
    prompt_regroupement_create_example = PromptTemplate.from_template(f.read())

# Ask the model for an example shaped like ListInsights.
# NOTE(review): a PromptTemplate object is passed here, whereas the later
# grouping call passes plain strings; confirm apply_async_analysis accepts both.
example = apply_async_analysis([prompt_regroupement_create_example], ListInsights)

In [None]:
# Inspect what the example-generation call returned.
print(example)

In [None]:
from pprint import pprint

# Serialise the first generated example to a JSON string; it is later passed
# to the grouping prompt as its "example" variable.
# `model_dump_json()` is the pydantic v2 replacement for the deprecated
# `.json()` and produces the same output.
example_clustering_json = example[0].model_dump_json()
pprint(example_clustering_json)


In [None]:
# Build one grouping prompt per sub-category, using only the aspects that have
# an explanation. `subcat_ids` stays aligned with `prompts` (same order).
prompts, subcat_ids = [], []
explained_aspects = aspects_df[aspects_df['Explanation'].notna()]
for subcat_id, group in explained_aspects.groupby('SubCategory'):
    subcat_row = subcategories_df.loc[subcategories_df['_id'] == subcat_id].iloc[0]
    cat_row = categories_df.loc[categories_df['_id'] == subcat_row['Category']].iloc[0]

    # Each feedback is prefixed with its aspects_df index label.
    numbered = [str(label) + ' : ' + text for label, text in group['Explanation'].items()]

    prompt_value = prompt_regroupement.invoke({
        "feedbacks": '\n'.join(numbered),
        "category": cat_row["Name"] + " : " + subcat_row['Name'],
        "example": example_clustering_json,
        **feedback_context,
    })
    prompts.append(prompt_value.text)
    subcat_ids.append(subcat_id)

#print(prompts[0])
print("Traitement synchronisé de", len(prompts), "prompts.")
list_insights = apply_async_analysis(prompts, ListSubCategory)

In [None]:
def send_insights(insights_group, cat_id, subcat_id):
    """Create one insights group in Bubble with its insights and consequences.

    Insights and consequences are created first so that the ids returned by
    Bubble can be attached to the "Insights Group" record, created last.

    Args:
        insights_group: Object exposing ``title``, ``insights`` and
            ``consequences``. Each insight provides ``insight``,
            ``insight_type``, ``insight_tags`` and ``associated_indexes``;
            each consequence provides ``title`` and ``detail``.
        cat_id: Value stored in the "Category" field of the created records.
        subcat_id: Value stored in the "SubCategory" field.

    Returns:
        Whatever ``bubble_client.create`` returns for the "Insights Group"
        record (previously computed but discarded).
    """
    insights_id = []
    if insights_group.insights:
        insight_records = []
        for insight in insights_group.insights:
            # The prompt numbers feedbacks with aspects_df index *labels*, so
            # rows are fetched by label (.loc), not by position (.iloc). The
            # lookup is done once and reused for the three dependent fields.
            linked_aspects = aspects_df.loc[insight.associated_indexes]
            feedback_ids = list(linked_aspects.Associated_feedback)
            insight_records.append({
                "Company": COMPANY_ID,
                "Project": PROJECT_ID,
                "Name": insight.insight,
                "Category": cat_id,
                "SubCategory": subcat_id,
                "Type": insight.insight_type.value,
                "Tags": [
                    tags_to_id[convert_text_to_constants(tag.name)]
                    for tag in insight.insight_tags
                ],
                "Aspects": list(linked_aspects._id),
                "Feedbacks": feedback_ids,
                "Nb Feedbacks": len(feedback_ids),
            })
        res = bubble_client.create("Insight", insight_records)
        insights_id = [x['id'] for x in res]

    consequences_id = []
    if insights_group.consequences:
        res = bubble_client.create(
            "Consequence",
            [
                {
                    "Company": COMPANY_ID,
                    "Project": PROJECT_ID,
                    "Description": conseq.detail,
                    "Name": conseq.title,
                }
                for conseq in insights_group.consequences
            ],
        )
        consequences_id = [x['id'] for x in res]

    # Create the group itself, linked to the records created above.
    bubble_id = bubble_client.create("Insights Group", {
        "Company": COMPANY_ID,
        "Project": PROJECT_ID,
        "Name": insights_group.title,
        "Category": cat_id,
        "SubCategory": subcat_id,
        "Insights": insights_id,
        "Consequences": consequences_id,
        #"Tags": [tags_to_id[tag._name_] for tag in insight.insight_tags],
    })
    return bubble_id

    

# Publish every insights group three times: under its own category and
# sub-category, under the unnamed sub-category of the same category, and under
# the unnamed category with its unnamed sub-category.
for subcat_groups, subcat_id in tqdm(zip(list_insights, subcat_ids)):
    owning_rows = subcategories_df[subcategories_df['_id'] == subcat_id]
    cat_id = owning_rows.iloc[0].Category
    for insights_group in subcat_groups.sub_categories_list:
        # 1) real category / real sub-category
        send_insights(insights_group, cat_id, subcat_id)

        # 2) real category / unnamed sub-category of that category
        empty_subcat = SUBCATEG_NONE.loc[SUBCATEG_NONE["Category"] == cat_id].iloc[0]._id
        send_insights(insights_group, cat_id, empty_subcat)

        # 3) unnamed category / its unnamed sub-category
        empty_subcat = SUBCATEG_NONE.loc[SUBCATEG_NONE["Category"] == ID_CATEG_NONE].iloc[0]._id
        send_insights(insights_group, ID_CATEG_NONE, empty_subcat)
