# Dependencies

## Requirements

In [None]:
#!pip install sentence_transformers langchain openai tqdm datasets asyncio scikit-learn cohere tiktoken umap altair

In [None]:
import numpy as np
import re
import pandas as pd
from tqdm.notebook import tqdm
from datasets import load_dataset
import umap
import altair as alt
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from typing import List, Optional
import enum

from langchain_community.llms import Ollama
from langchain.output_parsers.regex_dict import RegexDictParser
from langchain.output_parsers import PydanticOutputParser
from langchain_core.messages import HumanMessage, SystemMessage, ChatMessage
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from pydantic import BaseModel, Field, validator, create_model
from openai import AsyncOpenAI, OpenAI
#import asyncio
import os
from typing import Tuple, List, Optional

import requests

from pydantic import BaseModel, ValidationInfo, model_validator

import json

import itertools
from copy import deepcopy
from tqdm.notebook import tqdm, trange
from sklearn.cluster import KMeans

import umap.umap_ as umap
#import umap
import hdbscan

In [None]:
from src.bubble import *
from src.models import *
from src.utilities import *

In [None]:
# Directory prefix of the prompt template files opened later in this notebook.
prompts_path = "Prompts/fr/"

## Bubble API

In [None]:
# Fetch the "Feedback" objects through the `get` helper (defined outside this file).
# `max_objects=None` presumably removes the fetch limit — TODO confirm against the helper.
feedbacks_df = get("Feedback", max_objects=None)

In [None]:
# Fetch the "Category" and "SubCategory" tables through the `get` helper.
# The sub-category table is kept under an "original_" name; a merged variant is built later.
categories_df = get("Category")
original_subcategories_df = get("SubCategory")


In [None]:
# Notebook display of the category table for inspection.
categories_df

In [None]:
# `_id` of the first category row whose "Name" is missing; used later as the
# placeholder category for statistics computed over all categories.
# Raises IndexError if every category has a name.
ID_CATEG_NONE = categories_df[categories_df["Name"].isna()].iloc[0]["_id"]

In [None]:
# Load the Bubble records of the current company and project; their fields feed
# the prompt context built further down.
company_infos = bubble_client.get("Company", bubble_id=COMPANY_ID)
project_infos = bubble_client.get("Project", bubble_id=PROJECT_ID)

In [None]:
# Notebook display of the category table for inspection.
categories_df

In [None]:
# Only categories that actually have a name take part in the enums.
named_categories_df = categories_df[categories_df["Name"].notna()]

# Enum of the named categories: member name is the constant-ised label, member value the raw label.
CategoryInsight = enum.Enum(
    "Categories de l'insight",
    [(convert_text_to_constants(x), x) for x in list(named_categories_df.Name)],
)

# One enum of sub-category names per named category, keyed by the category name.
# The previous loop assigned the enum to `row["Name"]`; `iterrows()` yields copies, so
# nothing was stored and this dict stayed empty.
# NOTE(review): keyed by category name — switch to row["_id"] if consumers expect the id.
dict_SubCategoriesInsight = {}
for _, row in named_categories_df.iterrows():
    concerned_subcat_df = original_subcategories_df[original_subcategories_df["Category"] == row["_id"]]
    concerned_subcat_df = concerned_subcat_df[concerned_subcat_df["Name"].notna()]
    dict_SubCategoriesInsight[row["Name"]] = enum.Enum(
        "Categories de l'insight",
        [(convert_text_to_constants(x), x) for x in concerned_subcat_df.Name],
    )


In [None]:
# Build a flat sub-category table that also carries the parent category's name and id.
# The join key is cast to str on both sides so the merge compares like with like
# (note: this cast is written back into `original_subcategories_df`).
original_subcategories_df['Category'] = original_subcategories_df['Category'].astype(str)
df = categories_df.copy()
df['Category'] = df['_id'].astype(str)

df = pd.merge(original_subcategories_df, df, on=["Category", "Company"])
# After the merge, `_x` columns come from the sub-category side and `_y` from the category side.
df = df[["Name_x", "Name_y", "Company", "_id_x", "_id_y"]].rename(
    columns={
        "Name_x": "Name",
        "Name_y": "Category",
        "_id_x": "_id",
        "_id_y": "Category_id",
    }
)
subcategories_df = df
subcategories_df.head()

In [None]:
# Text description of the named sub-categories, produced by the project helper
# `columns_to_string` from the "Category" and "Name" columns (exact format is defined
# by that helper, outside this file). Printed for a visual check.
types_descr = columns_to_string(subcategories_df[subcategories_df["Name"].notna()], "Category", "Name")
print(types_descr)


In [None]:
#example_insight = "Manque de clarté de l'affichage des prix en magasin"
#exemple_commentaire = "je suis exclusif metro je n ai aucun representant j achetais jusqu a present tout metro par facilite mais je suis tres souvent décue par la reponse ha non on n en a pas cela arrive demain je pense que depuis le covid tout le monde ou presque s en fou!!!"
#examples_insights_df = pd.DataFrame([
#    {"Insights qui devraient en découler": "Déceptions face aux retards de livraison"},
#    {"Insights qui devraient en découler": "Impression d'une baisse de qualité du service depuis le Covid"},
#])


# Context values taken from the company and project records plus the sub-category
# description. The commented-out keys are currently disabled.
# NOTE(review): nothing in the visible notebook reads this dict afterwards — confirm it is still needed.
feedback_context = {
    "entreprise": company_infos["Name"],
    "context": company_infos['Context'],
    "role": company_infos['Role'],
    "cible": project_infos['Target'],
    "insight_types": types_descr,
    #"insight_categories": tags_descr,
    #"question": project_infos['Study_question'],
    #"exemple_commentaire": exemple_commentaire,
    #"example_insights": '\n- '.join(list(examples_insights_df['Insights qui devraient en découler'])),
}

# Notebook display of the resulting context.
feedback_context

# Insights extraction

### Aspects and Insights creation

In [None]:
#FeedbackIndex = enum.Enum("Indice du retour associé", [(str(i), i) for i in range(BATCH_SIZE)])

# Structured-output model for a (sub-)category reference: its depth in the category
# tree, its index (as a string) and its name.
class Categorie(BaseModel):
    nb_parents: int = Field(description="Nombre de  parents dans l'arbdre des catégories.")
    indice: str = Field(description="Indice de la catégorie. Doit être un string.")
    nom: str = Field(description="Nom de cette catégorie.")

    def __str__(self):
        """Render as "<name> (<index>)"."""
        return f"{self.nom} ({self.indice})"

    @model_validator(mode="after")
    def validate_ids(self, info: ValidationInfo):
        """Check index and name against the "sous_categories" list of the validation context.

        Without a validation context nothing is checked.
        """
        context = info.context
        if not context:
            return self

        tags = context.get("sous_categories")
        known_indices = {tag.indice for tag in tags}
        assert self.indice in known_indices, f"sous_categories ID {self.indice} not found in context"
        known_names = {tag.nom for tag in tags}
        assert self.nom in known_names, f"sous_categories name {self.nom} not found in context"
        return self
    


# Structured-output model for one aspect of a feedback: the sub-category it concerns,
# a satisfaction rating and an optional explanation.
class Aspect(BaseModel):
    categorie : Categorie = Field(description="Sous-catégorie concernée.")
    note_satisfaction : int = Field(description="Note de satisfaction du client concernant cette sous-catégorie, de 1 (pas content) à 5 (très content).")
    explication: Optional[str] = Field(description="Eventuel insight qui permetterait d'améliorer l'experience client, les produits ou la stratégie de l'entreprise. Ne doit être ajouté que ni réellement intéressant, et doit alors être aussi claire et concise que possible.") #Field(description="Point intéressant a retenir du commentaire.")

    def __str__(self):
        """Render the aspect as text: category, satisfaction rating and, if any, the explanation."""
        # Fixed: the field is `categorie` (there is no `sous_categorie` attribute), and the
        # method must return the string it builds (it used to return MAX_RETRIES).
        res = '\n' + str(self.categorie) + '\nSatisfaction: ' + str(self.note_satisfaction)
        if self.explication is not None:
            res += "\nExplication: " + self.explication
        return res

    @model_validator(mode="after")
    def validate_ids(self, info: ValidationInfo):
        """Reject ratings outside the accepted range."""
        # NOTE(review): the field description announces 1..5 but 0 is accepted here;
        # the lower bound is left unchanged on purpose — confirm which one is intended.
        assert 0 <= self.note_satisfaction <= 5, (
            f"note_satisfaction {self.note_satisfaction} is outside the 0..5 range"
        )
        return self

# Structured-output container for the (possibly absent) list of aspects of one feedback.
class ListAspects(BaseModel):
    list_aspects:  Optional[List[Aspect]] = Field(description="Eventielle liste des différents aspects évoqués dans le feedback.")

    def __str__(self):
        """Join the string form of every aspect with newlines; empty string when there is no list."""
        if self.list_aspects is None:
            return ''
        return '\n'.join(str(aspect) for aspect in self.list_aspects)
    

In [None]:
# Load the aspect-extraction prompt template.
# The encoding is pinned to UTF-8: the platform default differs between systems and the
# template is presumably French text with accented characters (TODO confirm file encoding).
with open(prompts_path+'prompt_aspects.txt', encoding="utf-8") as f:
    prompt_aspects = PromptTemplate.from_template(f.read())

In [None]:
#feedback ="I ordered a pair of shoes on your site. The site was easy to use but I had a hard time finding my size. The delivery was super fast, but the shoes were too small. I contacted the customer service to return them and they told me I had to pay the return shipping. So I decided to keep them and give them to my sister. They are good but a little too tight for me."
from time import sleep

batch_size = 100

# Numbered "<category> : <sub-category>" listing injected into every prompt. The number is
# the row label in `subcategories_df`, which the upload step uses to look the row up again.
# It does not depend on the batch, so it is built once instead of once per batch.
named_subcategories_df = subcategories_df[subcategories_df["Name"].notna()]
subcategories = "\n".join(
    f"{i} : '" + row["Category"] + " : " + row["Name"] + "'"
    for i, row in named_subcategories_df.iterrows()
)

# One ListAspects result per feedback, accumulated in feedback order.
aspects = []
for batch_df in tqdm(batchify(feedbacks_df, batch_size)):
    batch_feedbacks = list(batch_df["Content"])

    prompts = [
        prompt_aspects.invoke({"feedback": feedback, "subcategories": subcategories}).text
        for feedback in batch_feedbacks
    ]

    aspects += apply_async_analysis(prompts, ListAspects)
    # Pause between batches — presumably to stay under an API rate limit (TODO confirm).
    sleep(60)

In [None]:
# Number of results whose `list_aspects` is None.
sum(aspect.list_aspects is None for aspect in aspects)

In [None]:
# Create the extracted aspects in Bubble and link them to their feedback.
# `aspects[i]` is indexed with the DataFrame row label, so this relies on `feedbacks_df`
# having a default 0..n-1 index — NOTE(review): confirm.
for i, feedback in tqdm(feedbacks_df.iterrows(), total=len(feedbacks_df)):
    feedback_aspects = aspects[i].list_aspects
    if not feedback_aspects:
        # Nothing was extracted for this feedback: skip it. Previously the update below
        # still ran and reused `results` from an earlier feedback (or raised NameError
        # on the first iteration), linking the wrong aspects.
        continue

    results = bubble_client.create(
        "Aspect",
        [
            {
                "Company": COMPANY_ID,
                "Project": PROJECT_ID,
                # `indice` is the row label of the sub-category in `subcategories_df`.
                "Category": subcategories_df.loc[int(aspect.categorie.indice), "Category_id"],
                "Consequence": "",
                "Date": str(feedback["Date"]),
                "Explanation": aspect.explication,
                "Rating": aspect.note_satisfaction,
                "SubCategory": subcategories_df.loc[int(aspect.categorie.indice), "_id"],
                "Associated_feedback": feedback["_id"],
            }
            for aspect in feedback_aspects
        ],
    )

    bubble_client.update_object(
        bubble_type="Feedback",
        bubble_id=feedback["_id"],
        fields={"Aspects": [res['id'] for res in results]},
    )


### Preparation for visualization

In [None]:
# Read the "Aspect" objects back through the `get` helper.
aspects_df = get("Aspect")
# `_id` of the first category without a name (same expression as earlier in the notebook).
ID_CATEG_NONE = categories_df[categories_df["Name"].isna()].iloc[0]["_id"]


In [None]:
import pandas as pd

def global_stats_and_rating_counts(df, interval="M"):
    """
    Compute rating statistics and per-rating counts for each time period, all categories together.

    Args:
        df (pd.DataFrame): Input with a "Date" column (anything `pd.to_datetime` accepts)
            and a "Rating" column. It is not modified.
        interval (str, optional): Period alias used to bucket the dates. For "W", "M" and
            "Y" the reported date is moved back to the first day of the period; other
            aliases keep the date pandas prints for the period. Defaults to "M".

    Returns:
        pd.DataFrame: One row per period with "Date" (string, "%m/%d/%Y"), mean/max/min/
        median/q1/q3 rating columns, "count", one float column per observed rating value
        (how many times it occurs in the period), and the constant columns "Period"
        (= interval), "Category" (= ID_CATEG_NONE), "SubCategory" and "Grouped by" (None).
    """
    # Dates are converted into a separate Series so the caller's DataFrame is left
    # untouched (the previous version overwrote df["Date"] in place).
    periods = pd.to_datetime(df["Date"]).dt.to_period(interval)

    # Summary statistics per period.
    statistics = df.groupby(periods).agg(
        mean_rating=("Rating", "mean"),
        max_rating=("Rating", "max"),
        min_rating=("Rating", "min"),
        median_rating=("Rating", "median"),
        q1_rating=("Rating", lambda x: np.quantile(x, 0.25)),
        q3_rating=("Rating", lambda x: np.quantile(x, 0.75)),
        count=("Rating", "count"),
    )

    # Occurrences of each rating value per period, one column per rating value.
    rating_counts_df = (
        df.groupby([periods, "Rating"]).size().unstack("Rating", fill_value=0).astype(float)
    )

    merged_df = statistics.join(rating_counts_df)
    # A period without any non-null rating has no row in the counts table: report zeros.
    merged_df = merged_df.fillna({rating: 0.0 for rating in rating_counts_df.columns})
    merged_df = merged_df.rename_axis('Date').reset_index()

    display_format = "%m/%d/%Y"
    # Period.strftime renders the period with its end date (see pandas docs), so the
    # string is parsed back and rewound to the first day of the week / month / year.
    dates = pd.to_datetime(
        merged_df["Date"].apply(lambda period: period.strftime(display_format)),
        format=display_format,
    )
    if interval == 'W':
        dates = dates - pd.to_timedelta(dates.dt.day_of_week, unit='d')
    elif interval == 'M':
        dates = dates - pd.to_timedelta(dates.dt.day - 1, unit='d')
    elif interval == 'Y':
        dates = dates - pd.to_timedelta(dates.dt.day_of_year - 1, unit='d')
    merged_df["Date"] = dates.dt.strftime(display_format)

    merged_df["Period"] = interval
    # Global statistics are attached to the placeholder (unnamed) category.
    merged_df["Category"] = ID_CATEG_NONE
    merged_df["SubCategory"] = None
    merged_df["Grouped by"] = None
    return merged_df

# Example usage
df = aspects_df  # Load your DataFrame
#statistics, rating_counts_df = group_stats_and_rating_counts(df)
statistics_and_counts = global_stats_and_rating_counts(df, interval="M")
statistics_and_counts #.head()

In [None]:
import pandas as pd

def group_stats_and_rating_counts(df, interval="M", groupby="Category"):
    """
    Compute rating statistics and per-rating counts for each (period, group) pair.

    Args:
        df (pd.DataFrame): Input with a "Date" column (anything `pd.to_datetime` accepts),
            a "Rating" column and the grouping column(s). It is not modified.
        interval (str, optional): Period alias used to bucket the dates. For "W", "M" and
            "Y" the reported date is moved back to the first day of the period; other
            aliases keep the date pandas prints for the period. Defaults to "M".
        groupby (str, optional): Column to group by in addition to the period. With
            "SubCategory" the "Category" column is grouped on as well. With None the call
            is delegated to `global_stats_and_rating_counts`. Defaults to "Category".

    Returns:
        pd.DataFrame: One row per (period, group) with "Date" (string, "%m/%d/%Y"), the
        grouping column(s), mean/max/min/median/q1/q3 rating columns, "count", one float
        column per observed rating value (how many times it occurs), "Period" (= interval),
        "Grouped by" (= groupby) and "SubCategory" (None unless it is a grouping column).
    """
    if groupby is None:
        return global_stats_and_rating_counts(df, interval=interval)

    # Dates are converted into a separate Series so the caller's DataFrame is left
    # untouched (the previous version overwrote df["Date"] in place).
    periods = pd.to_datetime(df["Date"]).dt.to_period(interval)

    # Sub-category rows keep their parent category so both ids appear in the output.
    added_columns = [groupby]
    if groupby == "SubCategory":
        added_columns.append("Category")
    keys = [periods] + added_columns

    # Summary statistics per (period, group).
    statistics = df.groupby(keys).agg(
        mean_rating=("Rating", "mean"),
        max_rating=("Rating", "max"),
        min_rating=("Rating", "min"),
        median_rating=("Rating", "median"),
        q1_rating=("Rating", lambda x: np.quantile(x, 0.25)),
        q3_rating=("Rating", lambda x: np.quantile(x, 0.75)),
        count=("Rating", "count"),
    )

    # Occurrences of each rating value per (period, group), one column per rating value.
    rating_counts_df = (
        df.groupby(keys + ["Rating"]).size().unstack("Rating", fill_value=0).astype(float)
    )

    merged_df = statistics.join(rating_counts_df)
    # A group without any non-null rating has no row in the counts table: report zeros.
    merged_df = merged_df.fillna({rating: 0.0 for rating in rating_counts_df.columns})
    merged_df = merged_df.reset_index()

    display_format = "%m/%d/%Y"
    # Period.strftime renders the period with its end date (see pandas docs), so the
    # string is parsed back and rewound to the first day of the week / month / year.
    dates = pd.to_datetime(
        merged_df["Date"].apply(lambda period: period.strftime(display_format)),
        format=display_format,
    )
    if interval == 'W':
        dates = dates - pd.to_timedelta(dates.dt.day_of_week, unit='d')
    elif interval == 'M':
        dates = dates - pd.to_timedelta(dates.dt.day - 1, unit='d')
    elif interval == 'Y':
        dates = dates - pd.to_timedelta(dates.dt.day_of_year - 1, unit='d')
    merged_df["Date"] = dates.dt.strftime(display_format)

    merged_df["Period"] = interval
    merged_df["Grouped by"] = groupby
    if "SubCategory" not in merged_df:
        merged_df["SubCategory"] = None

    return merged_df

# Example usage
df = aspects_df  # Load your DataFrame
#statistics, rating_counts_df = group_stats_and_rating_counts(df)
statistics_and_counts = group_stats_and_rating_counts(df, interval="M", groupby="Category")
statistics_and_counts #.head()

In [None]:
def find_empty_subcat(cat_id):
    """Return the `_id` of the first sub-category of `cat_id` whose "Name" is missing.

    Reads the module-level `original_subcategories_df`. Raises IndexError (from
    `.iloc[0]`) when the category has no unnamed sub-category.
    """
    subcats = original_subcategories_df
    unnamed_in_category = subcats[(subcats["Category"] == cat_id) & subcats["Name"].isna()]
    return unnamed_in_category.iloc[0]["_id"]

# Manual check of the helper on one hard-coded category id.
find_empty_subcat("1709253065849x444427432726514300")




In [None]:
def send_aspects(df, batch_size=1000):
    """
    Upload one "Aspect Evol" object per row of a statistics table.

    Args:
        df (pd.DataFrame): Table produced by the statistics helpers. It must contain the
            columns "Grouped by", "Category", "SubCategory", "Date", "Period", the
            *_rating columns and one column per rating value 1..5.
        batch_size (int, optional): Maximum number of objects per `bubble_client.create`
            call. Defaults to 1000.

    Returns:
        list: The creation results returned by `bubble_client.create`, in upload order.
    """
    # Fixed: the previous body read an undefined `row` (it never iterated over `df`)
    # and sent max_rating for the Q1, median and Q3 fields.
    records = [
        {
            "Company": COMPANY_ID,
            "Project": PROJECT_ID,
            "Grouped by": row["Grouped by"],
            "Category": row['Category'],
            "SubCategory": row['SubCategory'],
            "Date": row["Date"],
            "Period": row["Period"],
            "Mean Rating": row["mean_rating"],
            "Min Rating": row["min_rating"],
            "Max Rating": row["max_rating"],
            "Q1 Rating": row["q1_rating"],
            "Median Rating": row["median_rating"],
            "Q3 Rating": row["q3_rating"],
            "Count": sum([row[i] for i in range(1, 6)]),
            "Count of 1s": row[1],
            "Count of 2s": row[2],
            "Count of 3s": row[3],
            "Count of 4s": row[4],
            "Count of 5s": row[5],
        }
        for _, row in df.iterrows()
    ]

    # `bubble_client.create` is called with a list of objects, as the "Aspect" upload
    # earlier in this notebook does; rows are sent in chunks of `batch_size`.
    created = []
    for start in range(0, len(records), batch_size):
        created.extend(bubble_client.create("Aspect Evol", records[start:start + batch_size]))
    return created

# Compute the statistics for every combination of grouping level (global, per category,
# per sub-category) and period (year, month, week, day), then stack them in one table.
all_statistics = []
for groupby, interval in itertools.product(
    [None, "Category", "SubCategory"],
    ["Y", "M", "W", "D"],
):
    statistics = group_stats_and_rating_counts(aspects_df, interval=interval, groupby=groupby)

    # Every row must carry a category (the global rows use the placeholder category).
    assert statistics["Category"].isna().sum() == 0

    all_statistics.append(statistics)

all_statistics_df = pd.concat(all_statistics).reset_index()
all_statistics_df

In [None]:
# Monthly statistics of one hard-coded category id, indexed by date.
df = all_statistics_df.copy()
#df["Date"] = pd.to_datetime(df["Date"])
# `drop` expects a bool; the previous drop="True" only worked because the string is truthy.
df = df.set_index("Date", drop=True)
df = df[df["Category"]=='1709322143530x849396050152903400']

df = df[df["Period"]=='M']
df

In [None]:

# Keep only the per-rating count columns (labels 1 to 5) and draw them as stacked bars.
df = df[[1, 2, 3, 4, 5]]
df.plot(kind='bar', stacked=True)

In [None]:
# Monthly counts of every category except one hard-coded category id, using only the
# rows that have no sub-category; indexed by date.
df = all_statistics_df.copy()
df["Date"] = pd.to_datetime(df["Date"])
df = df[df["Category"]!='1709322143530x849396050152903400']
df = df[df["SubCategory"].isna()]

df = df[df["Period"]=='M']
# `drop` expects a bool; the previous drop="True" only worked because the string is truthy.
df = df.set_index("Date", drop=True)
df = df[["count", "Category"]]
# Scratch export of the first rows to a file named "brouillon" in the working directory.
df.head().to_csv("brouillon")

In [None]:
# Notebook display of the category table for inspection.
categories_df

In [None]:
# NOTE(review): `ids` is not defined anywhere in the visible notebook; unless one of the
# star-imports provides it, this cell raises NameError — confirm or remove.
ids

In [None]:
# Category ids as a plain list (notebook display).
list(categories_df["_id"])

In [None]:
import matplotlib.pyplot as plt

# Reshape to one row per date and one column per category, holding the summed counts.
df_pivoted = df.pivot_table(index='Date', columns='Category', values='count', aggfunc='sum')

# Label the columns with the category names instead of their ids.
df_pivoted.columns = [categories_df[categories_df["_id"]==c].iloc[0]["Name"] for c in list(df_pivoted.columns)]

# DataFrame.plot opens its own figure unless it is given an Axes, so the size is passed
# to it directly; the former plt.figure(figsize=...) call only produced an extra empty
# figure and did not size the chart.
df_pivoted.plot(kind='bar', stacked=True, colormap='Set2', figsize=(10, 6))

# Customize plot elements (optional):
plt.xlabel('Date')
plt.ylabel('Count')
plt.title('Stacked Bar Plot')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
plt.legend(title='Category')
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# Distinct dates present in `df`. When the cells are run in order, "Date" is the index
# of `df` at this point, so df["Date"] raised KeyError; reset_index() exposes it as a
# column again (and is harmless when "Date" is already a plain column).
X = df.reset_index()["Date"].drop_duplicates()

In [None]:


import matplotlib.pyplot as plt
import numpy as np

# data from https://allisonhorst.github.io/palmerpenguins/

# Stand-alone stacked-bar demo on the palmerpenguins figures (unrelated to the feedback data).
species = (
    "Adelie\n $\\mu=$3700.66g",
    "Chinstrap\n $\\mu=$3733.09g",
    "Gentoo\n $\\mu=5076.02g$",
)
weight_counts = {
    "Below": np.array([70, 31, 58]),
    "Above": np.array([82, 37, 66]),
}
width = 0.5

fig, ax = plt.subplots()
# Running height of the bars already drawn, one entry per species.
bottom = np.zeros(3)

# Draw each series on top of the ones before it.
for boolean in weight_counts:
    weight_count = weight_counts[boolean]
    p = ax.bar(species, weight_count, width, label=boolean, bottom=bottom)
    bottom += weight_count

ax.set_title("Number of penguins with above average body mass")
ax.legend(loc="upper right")

plt.show()

In [None]:
# Notebook display of the statistics rows that have no sub-category.
all_statistics_df[all_statistics_df["SubCategory"].isna()]

In [None]:
# Rows without a sub-category receive the id of their category's unnamed sub-category.
# Done with a single .loc assignment on a copy: the previous version wrote into a
# filtered slice (SettingWithCopyWarning) and then overwrote whole rows through a
# boolean-mask assignment.
res_df = all_statistics_df.copy()
missing_subcat = res_df["SubCategory"].isna()
res_df.loc[missing_subcat, "SubCategory"] = res_df.loc[missing_subcat, "Category"].apply(find_empty_subcat)
all_statistics_df = res_df
all_statistics_df

In [None]:
# Upload the complete statistics table.
send_aspects(all_statistics_df)