# Disclaimer:
##### A lot of the code and plots in this notebook are meant for the analysis of close to 2000 pilot prompts by more than 100 users.

##### The data within this repository serves demo purposes only and will result in less interesting or even confusing plots.

### First, set up your OpenAI connection

In [None]:
# Azure OpenAI connection settings. Replace the placeholders with your own
# values before running the notebook; do not commit a real key.
AZURE_OPENAI_API_KEY = "YOUR_API_KEY"
# API version string; the placeholder text records that the authors used 2023-05-15.
AZURE_OPENAI_API_VERSION = "YOUR_API_VERSION_WE_USED_2023-05-15"
# Base URL of the Azure OpenAI resource.
AZURE_OPENAI_ENDPOINT = "YOUR_ENDPOINT"

In [None]:
import json
from openai import AzureOpenAI
import time
from tqdm import tqdm


# Shared Azure OpenAI client used for every request in this notebook.
# The API version is taken from the configuration cell so that changing
# AZURE_OPENAI_API_VERSION actually takes effect (set it to e.g. "2023-05-15").
client = AzureOpenAI(
    api_key=AZURE_OPENAI_API_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)


def prompt_gpt(prompt, context=None, system=None):
    """Send a single chat-completion request and return the reply text.

    Args:
        prompt: Text sent as the user message.
        context: Optional text added as a system message after ``system``.
        system: Optional system message placed first in the conversation.

    Returns:
        The message content of the first choice in the response.
    """
    # Both optional parts are sent with the "system" role; falsy values are skipped.
    messages = [
        {"role": "system", "content": text}
        for text in (system, context)
        if text
    ]
    messages.append({"role": "user", "content": prompt})

    completion = client.chat.completions.create(
        model="gpt-35-turbo",
        messages=messages,
        temperature=0.3,
        # max_tokens=200,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        # response_format={ "type": "json_object" }
    )

    first_choice = completion.choices[0]
    # Print any finish reason other than a regular "stop".
    if first_choice.finish_reason != "stop":
        print(first_choice.finish_reason)

    return first_choice.message.content

### Set up configs, paths, etc

In [None]:
# Folder that holds the input logs; all outputs are written next to the input file.
data_folder = "../data"

input_file= f"{data_folder}/example_data_pilot_analysis.csv"
# The doubled braces leave a literal "{level}" placeholder that is filled in
# later via .format(level=...), e.g. level="prompt" or level="session".
output_file_html = f"{input_file}-analyzed-{{level}}.html"
output_file_csv = f"{input_file}-analyzed-{{level}}.csv"
# Single workbook that receives both levels as separate sheets.
output_file_xlsx = f"{input_file}-analyzed.xlsx"

In [None]:
import csv
from collections import defaultdict
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
import re
from tqdm import tqdm

# Register the progress_map / progress_apply helpers on pandas objects.
tqdm.pandas()

# Timestamp layout used for the LoggedAt column of the log export.
LOGGED_AT_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"

logs = pd.read_csv(
    input_file,
    sep=",",
    quotechar='"',
    on_bad_lines="warn",
    # Remove double newlines from prompts while reading.
    converters={"Prompt": lambda x: x.replace("\n\n", "")},
    quoting=csv.QUOTE_MINIMAL,
    parse_dates=True,
    date_format=LOGGED_AT_FORMAT,
)

# errors="coerce" turns unparseable timestamps into NaT so the column is always
# datetime-typed. The previous errors="ignore" is deprecated in pandas and, on
# any bad value, returned the raw strings, which broke the Timestamp
# comparisons and .dt accessors used further down.
logs["LoggedAt"] = pd.to_datetime(logs["LoggedAt"], format=LOGGED_AT_FORMAT, errors="coerce")

# Lengths are measured in NLTK tokens, not characters.
logs["promptLength"] = logs.Prompt.map(lambda x: len(word_tokenize(x)))
if "Response" in logs:
    # Missing (non-string) responses get NaN instead of a length.
    logs["responseLength"] = logs.Response.progress_map(
        lambda x: len(word_tokenize(x)) if isinstance(x, str) else np.nan
    )
else:
    logs["responseLength"] = np.nan

In [None]:
# Display the loaded log table, including the derived length columns.
logs

### Filter only relevant dates

In [None]:
# Keep only rows logged strictly between the two dates (both bounds exclusive).
period_start = pd.Timestamp(2024, 2, 25)
period_end = pd.Timestamp(2024, 6, 30)
in_period = logs["LoggedAt"].gt(period_start) & logs["LoggedAt"].lt(period_end)
logs = logs[in_period]
logs

# Group User Sessions

In [None]:
# One row per (session id, user): number of prompts plus first and last timestamp.
session_bounds = logs.groupby(["cid", "uhash"])["LoggedAt"].agg(["count", "min", "max"])
full_sessions = session_bounds.reset_index()
full_sessions["duration"] = full_sessions["max"] - full_sessions["min"]

# Whole minutes between first and last prompt; leftover seconds are dropped.
duration_parts = full_sessions["duration"].dt.components
full_sessions["duration_minutes"] = (
    duration_parts.days * 24 * 60
    + duration_parts.hours * 60
    + duration_parts.minutes
)

full_sessions.head()

# Basic Log Analysis

### Calculate a few things

In [None]:
import numpy as np

def mean(x):
    """Return the arithmetic mean of ``x`` rounded to two decimals."""
    average = np.mean(x)
    return round(average, 2)

def std(x):
    """Return the standard deviation of ``x`` (as computed by ``np.std``), rounded to two decimals."""
    spread = np.std(x)
    return round(spread, 2)

def median(x):
    """Return the median of ``x`` rounded to two decimals, ignoring NaN values.

    ``np.median`` propagates NaN, so a single missing value (e.g. a missing
    response length) made the whole result NaN. ``np.nanmedian`` skips missing
    values instead.

    Args:
        x: Array-like of numbers; may contain NaN.

    Returns:
        The median of the non-NaN values, rounded to two decimals (NaN when
        there are no non-NaN values).
    """
    return round(np.nanmedian(x), 2)

# Headline counts.
print(f"Total prompts: {logs.Prompt.count()}")
print(f"Unique users: {logs.uhash.nunique()}")
print(f"Unique session ids: {logs.cid.nunique()}")

# Mean ± standard deviation and median for each distribution of interest.
summary_series = [
    ("Prompts per user", logs.uhash.value_counts()),
    ("Prompts per session", logs.cid.value_counts()),
    ("Session duration", full_sessions.duration_minutes),
    ("Prompt Length", logs.promptLength),
    ("Response Length", logs.responseLength),
]
for label, series in summary_series:
    print(f"{label}: {mean(series)}±{std(series)}; Median: {median(series)}")

### Prompts per user

In [None]:
# Repeat the prompts-per-user statistics next to the plot below.
print(f"Reminder: Prompts per user: {mean(logs.uhash.value_counts())}±{std(logs.uhash.value_counts())}; Median: {median(logs.uhash.value_counts())}")

In [None]:
fontsize = 25

# Bar chart of prompt counts per user, most active user first; an x tick is
# drawn for every fifth user.
prompts_per_user = logs.uhash.value_counts().sort_values(ascending=False)
tick_positions = range(0, logs.uhash.nunique(), 5)
ax = prompts_per_user.plot.bar(xticks=tick_positions, figsize=(12, 9), color=["#00AEEF"])
ax.set_xlabel("user ID*", fontsize=fontsize)
ax.set_ylabel("number of prompts", fontsize=fontsize)
ax.tick_params(axis="y", labelsize=fontsize/2)
ax

In [None]:
# Save the chart next to the input file; bbox_inches='tight' trims surrounding whitespace.
ax.figure.savefig(f"{input_file}-prompts_per_user.png", bbox_inches='tight')

##### Usage by most active users

In [None]:
# Five users with the most prompts (value_counts sorts descending by default).
logs.uhash.value_counts().head(5)

### Sessions per user

In [None]:
# Number of distinct session ids per user, largest first; use_index=False
# keeps the user hashes off the x axis.
logs.groupby(["uhash"]).cid.nunique().sort_values(ascending=False).plot(kind="bar", use_index=False, figsize=(12, 9))

### Prompts per session

In [None]:
# Repeat the prompts-per-session statistics next to the plot below.
print(f"Prompts per session: {mean(logs.cid.value_counts())}±{std(logs.cid.value_counts())}; Median: {median(logs.cid.value_counts())}")

In [None]:
# Prompt count per session id, largest first, with an x tick every 20 sessions.
logs.cid.value_counts().sort_values(ascending=False).plot(kind="bar", xticks=range(0, logs.cid.nunique(), 20), figsize=(12, 9))

##### Most active sessions (conversations with the most prompts)

In [None]:
# Five session ids with the most prompts.
logs.cid.value_counts().head(5)

### Prompt Length

In [None]:
# Repeat the prompt-length statistics (in tokens) next to the plot below.
print(f"Reminder: Prompt Length: {mean(logs.promptLength)}±{std(logs.promptLength)}; Median: {median(logs.promptLength)}")

In [None]:
# One bar per prompt, longest first, with an x tick every 20 prompts.
logs.promptLength.sort_values(ascending=False).plot(kind="bar", xticks=range(0, logs.promptLength.count(), 20), figsize=(12, 9))

##### Inspect extremely short or long prompts
###### Reminder: adjust lengths of interest

In [None]:
# Prompts with fewer tokens than this threshold count as "short"; adjust as needed.
short_length = 50
is_short = logs.promptLength.lt(short_length)
extremely_short_prompts = logs.loc[is_short]
extremely_short_prompts.tail(5)

In [None]:
# Prompts with more tokens than this threshold count as "long"; adjust as needed.
long_length = 100
is_long = logs.promptLength.gt(long_length)
extremely_long_prompts = logs.loc[is_long]
extremely_long_prompts

### Response Length

In [None]:
# Repeat the response-length statistics (in tokens) next to the plot below.
print(f"Reminder: Response Length: {mean(logs.responseLength)}±{std(logs.responseLength)}; Median: {median(logs.responseLength)}")

In [None]:
# Plot only the responses that have a length, longest first, with an x tick every 20 bars.
notna_responses = logs.responseLength.dropna()
response_ticks = range(0, len(notna_responses), 20)
notna_responses.sort_values(ascending=False).plot(kind="bar", xticks=response_ticks, figsize=(12, 9))

### Times and intervals

In [None]:
# Repeat the session-duration statistics (in minutes).
print(f"Reminder: Session duration: {mean(full_sessions.duration_minutes)}±{std(full_sessions.duration_minutes)}; Median: {median(full_sessions.duration_minutes)}")

#### Duration of full sessions

In [None]:
# full_sessions["duration_minutes"].plot(kind="line")

##### Sessions that last more than 1 working day (9h)

In [None]:
# Sessions whose first and last prompt are more than 9 hours (one working day) apart.
working_day_minutes = 9 * 60
lasts_over_a_day = full_sessions.duration_minutes.gt(working_day_minutes)
extremely_long_sessions = full_sessions[lasts_over_a_day]
len(extremely_long_sessions)

#### Unique users that like long sessions

In [None]:
# Number of distinct users who have at least one such long session.
extremely_long_sessions.uhash.nunique()

In [None]:
# Compare the average number of prompts in very long sessions with all other sessions.
long_session_ids = extremely_long_sessions.cid
regular_sessions = full_sessions[~full_sessions.cid.isin(long_session_ids)]
print(f'Mean very long sessions: {mean(extremely_long_sessions["count"])}')
print(f'Mean not-so-long sessions: {mean(regular_sessions["count"])}')

# Analyse prompt content

### Define categories and prompt

The task and domains were iteratively refined based on
- a few existing lists and ontologies
- the needs and interests of the municipality 
- the preliminary results of the prompt analysis (e.g. new categories were introduced by the model in a meaningful, constructive way)




##### Some useful prompt categories, tasks & domains that served as inspiration
* https://txt.cohere.com/generative-ai-part-2/
* https://github.com/tatsu-lab/stanford_alpaca#data-release
* themes from [amsterdam.nl](https://www.amsterdam.nl)
* tasks from start survey

##### Dutch categories & prompt

In [None]:
# Survey-level task categories (keys) mapped to the fine-grained task labels
# (values) that the model may choose from.
task_types = {
    "Entertainment en lol": ["chatten"],
    "Samenvatten van documenten of emails": ["tekst samenvatten"],
    "Verbeteren/herschrijven van tekst": ["tekst vertalen", "tekst herschrijven", "tekst vereenvoudigen"],
    "Analyseren van data zoals excel": ["tekst analyseren"],
    "Hulp bij opstellen/ verbeteren e-mails en documenten": ["schrijf e-mail", "schrijf tekst"],
    "Hulp met coderen": ["schrijf code"],
    "Informatie vinden": [ # // Including "Persoonlijke ontwikkeling en studie"
        "geef antwoord", "geef definitie", "geef uitleg",
        "geef instructies",
        "geef lijst",
    ],
    "Creatieve ideeën opdoen": ["geef voorbeelden", "geef ideeën", "geef mening", "geef advies"],
    "Hulp bij technische problemen met computer of software": ["hulp met software"],
    "Spirituele of mentale steun bij werkstress": ["mentale steun"],
    "Other": ["test", "verzoek verduidelijken", "geen van toepassing"],
}

# Flat list of every fine-grained task label, in category order.
tasks = [task for subtasks in task_types.values() for task in subtasks]

In [None]:
# Closed list of domain labels offered to the model; the commented-out entries
# are currently disabled. "geen van toepassing" is the "not applicable" label.
domains = [
    "algemene kennis", "stadskennis",
    "technologie", "AI", "programmeren", "wiskunde",
    "productiviteit",
    "belastingen", "financiën", "juridisch", "politiek", "overheid",
    "openbare ruimte", "stedelijke ontwikkeling", "wonen",
    "mobiliteit", "veiligheid",
    "duurzaamheid", "milieu", "voedsel",
    "gezondheid", "sport",
    "onderwijs",
    # "innovatie",
    "HR", "sociaal", 
    # "inclusie",
    "communicatie",
    # "taal",
    "geen van toepassing",
]

In [None]:
# Question words the model may pick from when a prompt contains a question.
questions = ["ja/nee", "wat", "wie", "welke", "waar", "wanneer", "waarom", "hoe", "hoeveel", "kan je"]

# NOTE(review): `sensitive` and `risk_words` are not referenced in the visible
# cells — presumably placeholder tags and keyword hints for manual checks; confirm.
sensitive = ["[NAME]", "[BSN]", "[ADDRESS]", "[PHONE]", "[EMAIL]", "[ORGANIZATION]"]
risk_words = ["sharepoint", "intranet", "burger", "@amsterdam.nl"]

In [None]:
# JSON fields requested from the model, grouped per analysis prompt: one
# request is built for each top-level key. Each value is the Dutch instruction
# shown to the model for that field. The doubled braces keep a literal
# "{granularity}" placeholder that is filled in later with str.format.
fields = {
    "topic": {
        "topic": f"bevat het belangrijkste onderwerp van het {{granularity}}",
        "task": f"bevat een enkele taak uit de volgende lijst die de gebruikersbehoeften het beste beschrijft: {tasks}. Kom niet met nieuwe taken.",
        "domain": f"bevat één woord uit de volgende lijst dat het domein het beste beschrijft: {domains}. Bedenk geen nieuwe domeinen.",
    },
    "question": {
        # Fixed typo "bolean" -> "boolean" in the instruction sent to the model.
        "factual": f"een boolean waar/onwaar - of deze {{granularity}} om feitelijke informatie vraagt (bijvoorbeeld wie momenteel een rol vervult, of waar iets te vinden is).",
        "question": f"als de {{granularity}} een vraag bevat, welk vraagwoord uit de volgende lijst beschrijft deze het beste: {questions}. Als er geen van toepassing zijn, retourneert u een lege tekenreeks.",
    },
    "risk_general": {
        "risk": f"een boolean waar/onwaar - of dit een mogelijk riskante {{granularity}} is die persoonlijke gegevens of gevoelige bedrijfsinformatie bevat.",
        "risk_word": f"een lijst met alle woorden die kunnen aantonen dat dit een mogelijk riskante, onethische of schadelijke {{granularity}} is.",
        "harmful": f"een booleaanse waar/onwaar - of dit nu een mogelijk onethische of schadelijke aanwijzing is, die in tegenspraak is met onze waarden van inclusiviteit, openheid, rechtvaardigheid en integriteit.",
    },
    "risk_concrete": {
        "namen": f"alle persoonlijke namen die in de {{granularity}} voorkomen.",
        "organisaties": f"alle namen van organisaties die in de {{granularity}} voorkomen.",
        "adressen": f"alle adressen die in de {{granularity}} voorkomen (inclusief persoonlijke- of werkadressen).",
        "emails": f"alle e-mail adressen die in de {{granularity}} voorkomen.",
        "inloggegevens": f"alle inloggegevens die in de {{granularity}} voorkomen (inclusief gebruikersnamen en wachtwoorden).",
        "geboortedata": f"alle geboortedata die in de {{granularity}} voorkomen",
        "identificatienummers": f"alle nummers die in de {{granularity}} voorkomen (inclusief BSN's, paspoort- of rijbewijsnummers, IP-adressen).",
        "nummers": f"alle nummers die in de {{granularity}} voorkomen (inclusief telefoonnummers, bankrekeningnummer, salarisgegevens).",
        "urls": f"alle URLs die in de {{granularity}} voorkomen.",
        # "sociale identiteit": f"alle ras- of etniciteitswoorden woorden die in de {{granularity}} voorkomen.",
        "sociale identiteit": f"alle woorden met betrekking tot ras of etniciteit, religie of levensbeschouwing, politieke voorkeur, geslacht of seksuele geaardheid.",
        "gezondheid": f"alle gezondheidsinformatie",
    }
}

In [None]:
# Shared request skeleton. The doubled braces leave {input_description},
# {granularity} and {input} as str.format placeholders; <<FIELDS>> is replaced
# below with the field descriptions of one field group.
prompt_template_general = f"""
Ik zal u {{input_description}} geven van een chatsysteem.
Geef mij voor deze {{granularity}} een enkele geformatteerd json-object terug dat de volgende velden bevat:
<<FIELDS>>
-----
De {{granularity}} is:

"{{input}}"
-----
De geformatteerd json-object is:

"""

# One ready-to-format prompt per field group, keyed by the group name.
prompts = {}
for field_type, field_descriptions in fields.items():
    field_lines = [
        f'"{field}": {description}'
        for field, description in field_descriptions.items()
    ]
    prompts[field_type] = prompt_template_general.replace("<<FIELDS>>", "\n\n".join(field_lines))

In [None]:
# Preview one rendered prompt. The input is passed through as the literal
# text "{input}" so the placeholder stays visible, and double newlines are
# collapsed for a more compact printout.
input_description = "een gebruikersprompt"
granularity = "prompt"
# print(prompts["risk_concrete"].format(input_description=input_description, granularity=granularity, input="{input}" ).replace("\n\n", "\n"))
# print(prompts["risk_general"].format(input_description=input_description, granularity=granularity, input="{input}" ).replace("\n\n", "\n"))
print(prompts["topic"].format(input_description=input_description, granularity=granularity, input="{input}" ).replace("\n\n", "\n"))

In [None]:
def jsonify_response(response):
    """Extract and parse the first JSON object contained in a model reply.

    Handles text before and after the object (e.g. extra remarks), nested
    objects, closing braces inside string values, and trailing commas before a
    closing brace or bracket.

    Args:
        response: Raw reply text from the model.

    Returns:
        A dict with the parsed fields; keys longer than 20 characters are
        dropped because they are treated as nonsense fields.

    Raises:
        ValueError: If the reply contains no parseable JSON object
            (``json.JSONDecodeError`` is a subclass of ``ValueError``).
    """
    start = response.find("{")
    if start == -1:
        raise ValueError("No JSON object found in response")

    # Remove trailing commas such as '"a": 1,\n}' which strict JSON rejects.
    candidate = re.sub(r",\s*([}\]])", r"\1", response[start:])
    # raw_decode parses one complete value and ignores whatever follows it.
    jsonified, _ = json.JSONDecoder().raw_decode(candidate)

    # sometimes model returns some nonsense fields we don't care about
    long_keys = [key for key in jsonified if len(key) > 20]
    if long_keys:
        print(f"Ignoring {long_keys}")
    return {key: val for key, val in jsonified.items() if key not in long_keys}

### Run on prompt level

In [None]:
# Reuse cached prompt-level results when they exist; otherwise send every
# prompt to the model once per field group and collect the parsed answers.
try:
    df = pd.read_csv(output_file_csv.format(level="prompt"), index_col=0)

except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache triggers a fresh (slow, paid) analysis run.
    # Other errors, including a manual interrupt, are no longer swallowed.
    input_description = "een gebruikersprompt"
    granularity = "prompt"

    # Number of failed requests or parses per field group.
    individual_broken = defaultdict(int)
    # Rows are collected in a list and turned into a frame once, instead of
    # growing a DataFrame with pd.concat on every iteration.
    rows = []

    for _, entry in tqdm(logs.iterrows(), total=logs.shape[0]):
        test_prompt = entry["Prompt"]
        response_dict = entry[["Prompt", "cid", "uhash", "LoggedAt", "promptLength"]].to_dict()
        if entry["promptLength"] < 15:
            # Prompts under 15 tokens are not sent to the model.
            response_dict.update({"skipped-prompt": "prompt-too-short"})
        else:
            for topic, topic_prompt in prompts.items():
                # Reset so the error report below never shows a stale reply
                # (or fails with NameError) when the request itself failed.
                response = None
                try:
                    analysis_prompt = topic_prompt.format(input_description=input_description, granularity=granularity, input=test_prompt)
                    response = prompt_gpt(prompt = analysis_prompt)
                    response_dict.update(jsonify_response(response))
                except Exception as e:
                    # Best effort: record the failure on the row and continue.
                    individual_broken[topic] += 1
                    response_dict.update({f"skipped-{topic}": str(e)})
                    print("====================")
                    print(topic, e)
                    print(test_prompt)
                    print(response)
                    print("====================")
        rows.append(response_dict)

    print(f"Broken: {individual_broken}")

    df = pd.DataFrame(rows)
    if not df.empty:
        # Keep only the calendar date of each prompt.
        df["LoggedAt"] = df["LoggedAt"].dt.date
    df.reset_index(drop=True, inplace=True)

df.head(2)

### Same logic but on session level
### DISCLAIMER: The analysis did not work well on session level due to the misunderstanding of the conversation feature and the (ab)use of sessions for multiple conversations

In [None]:
# Reuse cached session-level results when they exist; otherwise join all
# prompts of a session (per user and day) and analyse the joined text.
try:
    session_df = pd.read_csv(output_file_csv.format(level="session"), index_col=0)

except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache triggers a fresh (slow, paid) analysis run.
    # Other errors, including a manual interrupt, are no longer swallowed.
    input_description = "de gebruikersprompten van een gesprek"
    granularity = "gesprek"

    # Number of failed requests or parses per field group.
    session_broken = defaultdict(int)
    # Rows are collected in a list and turned into a frame once, instead of
    # growing a DataFrame with pd.concat on every iteration.
    rows = []

    # One row per (session id, user, calendar day) with its prompts joined by newlines.
    session_prompts = logs.groupby(["cid", "uhash", logs["LoggedAt"].dt.date])["Prompt"].agg(lambda x: "\n".join(x)).reset_index()

    for _, entry in tqdm(session_prompts.iterrows(), total=session_prompts.shape[0]):
        response_dict = entry.to_dict()
        test_prompt = entry["Prompt"]
        # NOTE(review): this threshold counts characters, while the
        # prompt-level run uses 15 tokens — confirm this is intended.
        if len(test_prompt) < 15:
            response_dict.update({"skipped-session": "session-too-short"})
        else:
            for topic, topic_prompt in prompts.items():
                # Reset so the error report below never shows a stale reply
                # (or fails with NameError) when the request itself failed.
                response = None
                try:
                    analysis_prompt = topic_prompt.format(input_description=input_description, granularity=granularity, input=test_prompt)
                    response = prompt_gpt(prompt = analysis_prompt)
                    response_dict.update(jsonify_response(response))
                except Exception as e:
                    # Best effort: record the failure on the row and continue.
                    session_broken[topic] += 1
                    response_dict.update({f"skipped-{topic}": str(e)})
                    print("====================")
                    print(topic, e)
                    print(test_prompt)
                    print(response)
                    print("====================")
        rows.append(response_dict)

    print(f"Broken: {session_broken}")
    session_df = pd.DataFrame(rows)
    session_df.reset_index(drop=True, inplace=True)

session_df.tail(2)

### Fix some common inconsistencies

##### Adjust based on own data
##### In our case, there were often English terms in between the Dutch ones

In [None]:
# Lower-case the categorical answers on both levels so spelling variants compare equal.
for frame in (df, session_df):
    for field in ["task", "domain", "question"]:
        frame[field] = frame[field].str.lower()


# Labels the model invented (often in English), mapped to the closest allowed domain.
domain_map = {
    "general knowledge": "algemene kennis",
    "mode": "algemene kennis",
    "muziek": "algemene kennis",
    "cultuur": "algemene kennis",
    "wetenschap": "algemene kennis",
    "city knowledge": "stadskennis",

    "taal": "communicatie",

    "technology": "technologie",
    "gis": "programmeren",
    "programming": "programmeren",

    "natuur": "milieu",
    # "milieu": "duurzamheid",

    "statistiek": "wiskunde",
}

# Single pass per column; values without a mapping (including NaN) stay unchanged.
df.domain = df.domain.map(lambda x: domain_map.get(x, x))
session_df.domain = session_df.domain.map(lambda x: domain_map.get(x, x))

# Same idea for tasks.
task_map = {
    "geef samenvatting": "tekst samenvatten",
    "text summarization": "tekst samenvatten",
    # Rewriting belongs to "tekst herschrijven"; it used to be mapped to summarising.
    "text rewriting": "tekst herschrijven",
    "give answer": "geef antwoord",
    "answer question": "geef antwoord",
    "beantwoord vraag": "geef antwoord",
    # "geef voordelen voor gebruiker en gemeente": "",
    # "geef voorbeeld": "",
    "explain": "geef uitleg",
}

df.task = df.task.map(lambda x: task_map.get(x, x))
session_df.task = session_df.task.map(lambda x: task_map.get(x, x))

# Any task mentioning "voorbeeld" is treated as a request for examples.
df.task = df.task.map(lambda x: "geef voorbeelden" if (isinstance(x, str) and "voorbeeld" in x) else x)
session_df.task = session_df.task.map(lambda x: "geef voorbeelden" if (isinstance(x, str) and "voorbeeld" in x) else x)

# On prompt level only: everything outside the allowed lists, including
# missing values, becomes "anders". The lower-cased reference sets are built
# once instead of once per row.
known_task_labels = {task.lower() for task in tasks}
known_domain_labels = {domain.lower() for domain in domains}
df.task = df.task.map(lambda x: x if x in known_task_labels else "anders")
df.domain = df.domain.map(lambda x: x if x in known_domain_labels else "anders")


##### Inspect (on prompt and session level) newly introduced tasks, domains or question types (beyond the ones we've specified) 

In [None]:
# df[~df.task.isin(tasks)]

##### Newly introduced tasks & domains on prompt and session level (used in development to improve the lists)

In [None]:
# Prompt-level task labels that are not in the predefined (lower-cased) task list, with counts.
df[~df.task.isin(map(lambda x : x.lower(), tasks))].task.value_counts()

In [None]:
# Session-level task labels that are not in the predefined (lower-cased) task list, with counts.
session_df[~session_df.task.isin(map(lambda x : x.lower(), tasks))].task.value_counts()

In [None]:
# Prompt-level domain labels that are not in the predefined (lower-cased) domain list, with counts.
df[~df.domain.isin(map(lambda x: x.lower(), domains))].domain.value_counts()

In [None]:
# Session-level domain labels that are not in the predefined (lower-cased) domain list, with counts.
session_df[~session_df.domain.isin(map(lambda x: x.lower(), domains))].domain.value_counts()

In [None]:
# Prompt-level question words that are not in the predefined question list, with counts.
df[~df.question.isin(map(lambda x: x.lower(), questions))].question.value_counts()

In [None]:
# Session-level question words that are not in the predefined question list, with counts.
session_df[~session_df.question.isin(map(lambda x: x.lower(), questions))].question.value_counts()

## Inspect risky or harmful

In [None]:
# pd.set_option('display.max_colwidth', None)

# Prompts the model flagged as risky, shown with the words it reported.
df[df["risk"] == True][["Prompt", "risk", "risk_word", "harmful"]]

In [None]:
# Prompts the model flagged as harmful, shown with the same columns.
df[df["harmful"] == True][["Prompt", "risk", "risk_word", "harmful"]]

##### Also on session level

In [None]:
# pd.set_option('display.max_colwidth', None)

# session_df[session_df["risk"] == True][["Prompt", "risk", "risk_word", "harmful"]]

In [None]:
# pd.set_option('display.max_colwidth', None)

# session_df[session_df["harmful"] == True][["Prompt", "risk", "risk_word", "harmful"]]

### Output into (pretty) HTML, CSV & XLSX for manual analysis by domain experts

In [None]:
# df.to_html(output_file_html)

In [None]:
from pretty_html_table import build_table


# Save the prompt-level table as a styled HTML file. The encoding is explicit
# because the Dutch text contains non-ASCII characters and the platform
# default encoding may not be able to represent them.
pretty_html_table = build_table(df, "red_light")
with open(output_file_html.format(level="prompt"), "w", encoding="utf-8") as f:
    f.write(pretty_html_table)

# Save the session-level table the same way.
pretty_html_table = build_table(session_df, "red_light")
with open(output_file_html.format(level="session"), "w", encoding="utf-8") as f:
    f.write(pretty_html_table)

In [None]:
# Write the row index as the first column: every reader in this notebook loads
# these files with index_col=0. Writing without the index made the first data
# column become the index on reload, and it was then dropped on the next save.
df.to_csv(output_file_csv.format(level="prompt"))
session_df.to_csv(output_file_csv.format(level="session"))

In [None]:
# Write both levels into one workbook: sheet "prompts" for the prompt-level
# table and sheet "convos" for the session-level table.
with pd.ExcelWriter(output_file_xlsx, engine="xlsxwriter") as writer:
    df.to_excel(writer, sheet_name="prompts")
    session_df.to_excel(writer, sheet_name="convos")

# A few example plots

In [None]:
# Reload the analysed tables from disk so the plots below can run without
# repeating the analysis.
# NOTE(review): index_col=0 treats the first CSV column as the row index —
# confirm the files were written with their index as the first column.
df = pd.read_csv(output_file_csv.format(level="prompt"), index_col=0)
session_df = pd.read_csv(output_file_csv.format(level="session"), index_col=0)

### Tasks

https://community.plotly.com/t/nested-pie-charts/24011/3

In [None]:
import plotly
import plotly.graph_objects as go

# Prepare a two-ring sunburst: survey-level categories (inner ring) and the
# fine-grained tasks that belong to them (outer ring).
task_counts = df.task.value_counts()
known_tasks = df.task.unique()

# Prompts per category, with categories ordered from most to least frequent.
values_per_task = [
    sum(task_counts.get(subtask, 0) for subtask in subtasks)
    for subtasks in task_types.values()
]
ordered = sorted(zip(task_types.keys(), values_per_task), key=lambda pair: pair[1], reverse=True)
ordered_parents = [parent for parent, _ in ordered]
values_per_task = [count for _, count in ordered]

# Outer-ring labels, their parent category and their counts, in the same order.
labels = sum([task_types[task] for task in ordered_parents], [])
parents = sum([[key] * len(task_types[key]) for key in ordered_parents], [])
values = [task_counts.get(label, 0) for label in labels]

# Shares of the total. The conversion to an array is explicit (dividing a
# plain list only worked because the divisor happened to be a numpy scalar),
# and the division is skipped when no prompt matched any task.
total_prompts = sum(values_per_task)
plotting_values = np.array(values_per_task + values, dtype=float)
if total_prompts:
    plotting_values = plotting_values / total_prompts

colors_original = [
    "#D92720", "#EF9120",
    "#00AEEF",
    "#00AEEF", "#D92720",
    "#D92720", "#EF9120", "#00AB4E", "#00AEEF", "#D92720", "#00AB4E"
    ]
colors_final = ["#004699", "#949CCC", "#53B361", "#D6ECD6", "#BED200", "#F6F6D4", "#FF9100", "#FFC88E", "#EC0000", "#FFD4E2"]
# red prominent / move pink to the back
colors_final.reverse()
colors_final = colors_final[1:] + [colors_final[0]]
# colors_ordered = ["#EC0000", "#FF9100", "#FFC88E", "#F6F6D4", "#BED200", "#D6ECD6", "#53B361", "#004699", "#949CCC", "#FFD4E2"]

trace = go.Sunburst(
    labels=ordered_parents + labels,
    # Categories sit at the root (empty parent); tasks hang under their category.
    parents=[""] * len(task_types.keys())  + parents,
    values=plotting_values,
    branchvalues="total",
    insidetextfont = {"size": 200},
    outsidetextfont = {"size": 200, "color": "#130E1D"},
    marker = {"line": {"width": 2}, "colors": colors_final},
)

layout = go.Layout(
    margin = go.layout.Margin(t=0, l=0, r=0, b=0),
    autosize=True
)

figure = {
    'data': [trace],
    'layout': layout,
}


In [None]:
# plotly.offline.iplot(figure)
# Build the figure from the dict first and set the size afterwards: when the
# first argument is a figure dict, its own "layout" entry is used and a
# separate layout= argument does not reliably apply.
large_figure = go.Figure(figure)
large_figure.update_layout(width=5000, height=5000)
large_figure

In [None]:
# Export the task sunburst as an interactive HTML page and as a 5000x5000 PNG.
# NOTE(review): static image export presumably needs an image-export engine
# (e.g. kaleido) to be installed — confirm for your environment.
fig = go.Figure(figure)
fig.write_html(f"{input_file}-analyzed-tasks-interactive.html")
fig.write_image(f"{input_file}-analyzed-tasks-interactive.png", width=5000, height=5000)

### Domains

In [None]:
import plotly.express as px

# Pie chart with the share of prompts per domain.
domain_share = df.domain.value_counts().reset_index()
figure = px.pie(domain_share, values='count', names="domain")

# Put the label and percentage inside each slice.
figure.update_traces(textposition='inside', textinfo='percent+label', textfont_size=200)
figure.show()

In [None]:
# Export the domain pie chart as an interactive HTML page and as a 5000x5000 PNG.
fig = go.Figure(figure)
fig.write_html(f"{input_file}-analyzed-domain-interactive.html")
fig.write_image(f"{input_file}-analyzed-domain-interactive.png", width=5000, height=5000)

### Domains (split factual)

In [None]:
import seaborn as sns

# Horizontal count plot per domain, split by the "factual" flag; domains that
# occur only once are left out.
domain_counts = df.domain.value_counts()
common_domains = domain_counts.loc[domain_counts.gt(1)].index
frequent_domain_rows = df[df.domain.isin(common_domains)]

domains_fact_plot = sns.catplot(
    data=frequent_domain_rows,
    y="domain",
    # hue="harmful",
    # hue="risk",
    hue="factual",
    kind="count",
    palette="coolwarm",
    edgecolor=".6",
)
domains_fact_plot.figure.savefig(f"{input_file}-analyzed-domains-fact.png")
domains_fact_plot

In [None]:
# Same plot on session level; the threshold of 0 keeps every domain that occurs at all.
domain_counts = session_df.domain.value_counts()
common_domains = domain_counts.loc[domain_counts.gt(0)].index
session_domain_rows = session_df[session_df.domain.isin(common_domains)]

domains_fact_plot = sns.catplot(
    data=session_domain_rows,
    y="domain",
    # hue="harmful",
    # hue="risk",
    hue="factual",
    kind="count",
    palette="coolwarm",
    edgecolor=".6",
)
domains_fact_plot.figure.savefig(f"{input_file}-analyzed-domains-fact-session.png")
domains_fact_plot

### Types of questions

In [None]:
import plotly.express as px

# figure = px.pie(df.question.value_counts().reset_index(), values='count', names="question")
# Missing or empty question words are shown as "geen van toepassing". Only
# the question column is transformed; the lambda was previously applied to
# every cell of the whole DataFrame.
question_labels = df["question"].map(lambda x: "geen van toepassing" if pd.isnull(x) or not x else x)
figure = px.pie(question_labels.value_counts().reset_index(), values='count', names="question")
figure.update_traces(textposition='inside', textinfo='percent+label')
figure.show()

In [None]:
# Export the question-type pie chart as an interactive HTML page.
go.Figure(figure).write_html(f"{input_file}-analyzed-questions-interactive.html")

# Wordclouds based on GPT's free-form topic

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

# Combine the wordcloud defaults, NLTK's Dutch stop words and a few
# project-specific words. Note that this rebinds the name `stopwords` from
# the imported NLTK corpus object to a plain set.
extra_stopwords = ["en", "Amsterdam", "Amsterdamse", "gemeente"]
stopwords = set(STOPWORDS).union(stopwords.words("dutch"), extra_stopwords)

## Prompt topics

In [None]:
# Join all prompt-level topics into one lower-cased text. Rows without a
# topic (e.g. skipped prompts) are dropped first; converting them with str()
# used to put the word "nan" into the cloud.
text = " ".join(df.topic.dropna().astype(str).str.lower())

# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=stopwords, background_color="white", colormap="PuRd", width=1200, height=900).generate(text)

# Display the generated image:
plt.figure(figsize=(12,9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Save the prompt-level word cloud next to the input file.
wordcloud.to_file(f"{input_file}-wordcloud-prompt.png")

## Session topics

In [None]:
# Join all session-level topics into one lower-cased text. Rows without a
# topic (e.g. skipped sessions) are dropped first; converting them with str()
# used to put the word "nan" into the cloud.
text = " ".join(session_df.topic.dropna().astype(str).str.lower())

# Create and generate a word cloud image:
wordcloud = WordCloud(stopwords=stopwords, background_color="white", colormap="PuRd", width=1200, height=900).generate(text)

# Display the generated image:
plt.figure(figsize=(12,9))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Save the session-level word cloud next to the input file.
wordcloud.to_file(f"{input_file}-wordcloud-session.png")