## 1. Import van libs

In [1]:
%load_ext autoreload
%autoreload 2

import os
from dotenv import load_dotenv

# Import GorgiasClient module from clients library

from clients import GorgiasClient

## 2. Configuratie voor Gorgias API

In [2]:
# Load variables from a local .env file (if any) into the process environment
load_dotenv()

# Connection settings for the Gorgias API, read from the environment
config = {
    "domain": os.environ.get("GORGIAS_DOMAIN"),
    "username": os.environ.get("GORGIAS_USERNAME"),
    "api_key": os.environ.get("GORGIAS_API_KEY"),
}

# Validate every setting. A variable that is unset OR set to an empty/blank
# string is treated as missing, and the error names the actual environment
# variable (GORGIAS_<KEY>) so the user knows what to put in the .env file.
missing = [
    f"GORGIAS_{key.upper()}"
    for key, value in config.items()
    if value is None or not value.strip()
]
if missing:
    raise ValueError(f"Missing environment variables: {', '.join(missing)}")

print(f"Configuration loaded. Domain: {config['domain']}, Username: {config['username']}")

Configuration loaded. Domain: northenoak.gorgias.com, Username: admin@northenoak.com


## 3. Maak Gorgias client instance en haal ticket details op

In [21]:
# Create the client instance; this object is used for every request to the Gorgias API below
client = GorgiasClient(
    domain=config["domain"],
    username=config["username"],
    api_key=config["api_key"]
)
print(f"✓ Gorgias client initialized for {config['domain']}")

try:

    ###### Step 1: fetch the ticket list ######
    # Fetch tickets with pagination
    print("\nTickets ophalen...")
    tickets = client.paginate(
        endpoint="tickets",
        params={"order_by": "created_datetime:desc"},
        limit=50,
        max_pages=1  # Remove this line to fetch ALL tickets
    )

    print(f"\n✓ {len(tickets)} tickets opgehaald")
    print(f"  Aantal API-aanroepen: {client.request_count}")

    ###### Step 2: fetch the details per ticket ######

    # To fetch details for every ticket in the list above, use instead:
    # ticket_ids = [t["id"] for t in tickets if t.get("id")]

    # Currently pinned to one specific ticket id
    ticket_ids = ["34163798"]

    # Maximum number of tickets to fetch details for (None = all tickets)
    max_tickets = 5

    # Collects the detailed ticket payloads
    detailed_tickets = []

    # Slicing with None returns the whole list, so None means "no limit" while
    # an explicit 0 now really means "fetch nothing" (a truthiness check would
    # have treated 0 the same as None).
    ticket_ids_to_fetch = ticket_ids[:max_tickets]
    print(f"\nDetails ophalen voor {len(ticket_ids_to_fetch)} tickets...")

    # One GET request to the tickets endpoint per ticket id
    for ticket_idx, ticket_id in enumerate(ticket_ids_to_fetch):
        try:
            ticket = client.request("GET", f"tickets/{ticket_id}")
            # Keep the fetched ticket details
            detailed_tickets.append(ticket)
            # Report progress after every 10th ticket (ticket_idx is 0-based, hence + 1)
            if (ticket_idx + 1) % 10 == 0:
                print(f"  Voortgang: {ticket_idx + 1}/{len(ticket_ids_to_fetch)} tickets")

        except Exception as e:
            # Best effort: report the failing ticket and continue with the next one
            print(f"  Fout bij ophalen ticket {ticket_id}: {e}")

finally:
    # Always close the client, even when fetching failed
    client.close()
    print(f"\n✓ Client closed. Total API calls: {client.request_count}")

✓ Gorgias client initialized for northenoak.gorgias.com

Tickets ophalen...
  Fetched page 1: 50 items (total: 50)
  Stopped at max_pages=1

✓ 50 tickets opgehaald
  Aantal API-aanroepen: 1

Details ophalen voor 1 tickets...

✓ Client closed. Total API calls: 2


## 4. Controleer de output voor een ticket met ticketdetails

In [11]:
# Confirm the container and element types, then print the first detailed ticket as a sample
sample_tickets = detailed_tickets[:1]
print(type(detailed_tickets))
print(type(detailed_tickets[0]))
print(sample_tickets)

<class 'list'>
<class 'dict'>
[{'id': 37563346, 'uri': '/api/tickets/37563346/', 'external_id': None, 'events': [], 'status': 'open', 'priority': 'normal', 'channel': 'email', 'via': 'email', 'from_agent': False, 'spam': False, 'customer': {'id': 263971325, 'email': 'joke.de.cocker@live.be', 'name': 'Joke De cocker', 'firstname': 'Joke', 'lastname': 'De cocker', 'meta': {'name_set_via': 'shopify'}, 'channels': [{'id': 332023064, 'type': 'email', 'address': 'joke.de.cocker@live.be', 'preferred': True, 'created_datetime': '2025-12-13T15:32:57.586806+00:00', 'updated_datetime': '2025-12-13T15:32:57.586810+00:00', 'deleted_datetime': None, 'user': {'id': 263971325, 'name': 'Joke De cocker'}, 'customer': {'id': 263971325, 'name': 'Joke De cocker'}}], 'data': None, 'customer': None, 'integrations': {'13646': {'orders': [{'id': 7128872878423, 'admin_graphql_api_id': 'gid://shopify/Order/7128872878423', 'app_id': 580111, 'browser_ip': '81.244.115.190', 'buyer_accepts_marketing': False, 'cancel

## 5. EDA Schema Discovery

In [12]:
# Print the top-level field names of the first detailed ticket, one per line
for field_name in detailed_tickets[0]:
    print(field_name)

id
uri
external_id
events
status
priority
channel
via
from_agent
spam
customer
assignee_user
assignee_user_id
assignee_team
assignee_team_id
language
subject
summary
meta
tags
custom_fields
messages
created_datetime
opened_datetime
last_received_message_datetime
last_message_datetime
updated_datetime
closed_datetime
trashed_datetime
snooze_datetime
satisfaction_survey
reply_options
requester
is_unread


In [23]:
import json
# Pretty-print the first detailed ticket as indented JSON;
# default=str is the fallback for values json cannot serialize natively
first_ticket_json = json.dumps(detailed_tickets[0], indent=2, default=str)
print(first_ticket_json)

{
  "id": 34163798,
  "uri": "/api/tickets/34163798/",
  "external_id": null,
  "events": [],
  "status": "closed",
  "priority": "normal",
  "channel": "email",
  "via": "email",
  "from_agent": false,
  "spam": false,
  "customer": {
    "id": 273583527,
    "email": "els@qbix.be",
    "name": "Els De Cubber",
    "firstname": "Els",
    "lastname": "De Cubber",
    "meta": {
      "name_set_via": "email"
    },
    "channels": [
      {
        "id": 343914803,
        "type": "email",
        "address": "els@qbix.be",
        "preferred": true,
        "created_datetime": "2025-12-27T14:17:58.516379+00:00",
        "updated_datetime": "2025-12-27T14:17:58.516384+00:00",
        "deleted_datetime": null,
        "user": {
          "id": 273583527,
          "name": "Els De Cubber"
        },
        "customer": {
          "id": 273583527,
          "name": "Els De Cubber"
        }
      }
    ],
    "data": null,
    "customer": null,
    "integrations": {
      "15054": {
      

In [24]:
import pandas as pd
import json

# Flatten every detailed ticket into one DataFrame row.
# `x.get(key) or default` is used instead of `x.get(key, default)` because the
# API payload can contain a key with an explicit null value, in which case the
# .get default is NOT applied and the chained call would fail on None.
df = pd.DataFrame([{
    "ticket_id": ticket["id"],
    "message_count": len(ticket.get("messages") or []),
    "customer_name": (ticket.get("customer") or {}).get("name"),
    # Messages are stored as a JSON string so the column stays a plain scalar
    "messages_json": json.dumps([
        {
            "id": msg["id"],
            "body_text": msg.get("body_text"),
            "from_agent": msg.get("from_agent", False),
            # Only an explicit public == False marks an internal note
            "is_internal_note": msg.get("public") is False
        }
        for msg in (ticket.get("messages") or [])
    ], ensure_ascii=False),
    "tags": [tag["name"] for tag in (ticket.get("tags") or [])],
    # NOTE(review): takes the value of the FIRST custom field and assumes it is
    # the contact reason - confirm against the Gorgias custom field setup.
    # Yields None when there are no custom fields or the field has no "value".
    "contact_reason": next(
        iter((ticket.get("custom_fields") or {}).values()), {}
    ).get("value")
} for ticket in detailed_tickets])

df.head()

Unnamed: 0,ticket_id,message_count,customer_name,messages_json,tags,contact_reason
0,34163798,15,Els De Cubber,"[{""id"": 88740509, ""body_text"": ""Beste Kimberly...","[ORDER-STATUS, feedback, negative, store-mivero]",Order Support::Damaged Item


# Door AI-agent gegenereerde code voor de distributie van de contactreden

## Stap 1: Contact Reason Distributie

Eerst kijken we welke classificaties er zijn en hoe vaak ze voorkomen.

In [25]:
# Show how often each contact_reason occurs; missing values get their own row
contact_reasons = df["contact_reason"]
ticket_total = len(df)
distinct_reasons = contact_reasons.nunique()
missing_reasons = contact_reasons.isna().sum()

print("=== Contact Reason Distributie ===\n")
print(contact_reasons.value_counts(dropna=False))
print(f"\n--- Totaal: {ticket_total} tickets ---")
print(f"--- Unieke categorieën: {distinct_reasons} ---")
print(f"--- Ontbrekend (None/NaN): {missing_reasons} ---")

=== Contact Reason Distributie ===

contact_reason
Order Support::Damaged Item    1
Name: count, dtype: int64

--- Totaal: 1 tickets ---
--- Unieke categorieën: 1 ---
--- Ontbrekend (None/NaN): 0 ---


⚠️ **We hebben maar 1 ticket!** 

Om de classificaties goed te valideren hebben we meer data nodig. Laten we meer tickets ophalen.

In [26]:
# Fetch a larger sample of tickets for validating the classifications.
# A fresh client is created for this run.

client = GorgiasClient(
    domain=config["domain"],
    username=config["username"],
    api_key=config["api_key"]
)

try:
    # Ticket list: 4 pages of 50 = ~200 tickets
    print("Meer tickets ophalen voor EDA...")
    tickets = client.paginate(
        endpoint="tickets",
        params={"order_by": "created_datetime:desc"},
        limit=50,
        max_pages=4
    )
    print(f"✓ {len(tickets)} tickets opgehaald")

    # Use every listed ticket that carries an id (not just a single one)
    ticket_ids = [entry["id"] for entry in tickets if entry.get("id")]

    # Details are fetched for at most 100 tickets (enough for validation)
    max_tickets = 100
    ticket_ids_to_fetch = ticket_ids[:max_tickets]
    total_to_fetch = len(ticket_ids_to_fetch)
    detailed_tickets = []

    print(f"\nDetails ophalen voor {total_to_fetch} tickets...")
    for position, current_id in enumerate(ticket_ids_to_fetch, start=1):
        try:
            ticket = client.request("GET", f"tickets/{current_id}")
            detailed_tickets.append(ticket)
            # Progress line after every 20th ticket
            if position % 20 == 0:
                print(f"  Voortgang: {position}/{total_to_fetch}")
        except Exception as e:
            # Report the failing ticket and carry on with the rest
            print(f"  Fout bij ticket {current_id}: {e}")

    print(f"\n✓ {len(detailed_tickets)} tickets met details opgehaald")

finally:
    # Close the client regardless of how the fetch went
    client.close()
    print(f"✓ Client closed. API calls: {client.request_count}")

Meer tickets ophalen voor EDA...
  Fetched page 1: 50 items (total: 50)
  Fetched page 2: 50 items (total: 100)
  Fetched page 3: 50 items (total: 150)
  Fetched page 4: 50 items (total: 200)
  Stopped at max_pages=4
✓ 200 tickets opgehaald

Details ophalen voor 100 tickets...
  Voortgang: 20/100
  Voortgang: 40/100
  Voortgang: 60/100
  Voortgang: 80/100
  Voortgang: 100/100

✓ 100 tickets met details opgehaald
✓ Client closed. API calls: 104


In [27]:
# Rebuild the DataFrame from the newly fetched tickets, one row per ticket.
# `x.get(key) or default` is used instead of `x.get(key, default)` because the
# API payload can contain a key with an explicit null value, in which case the
# .get default is NOT applied and the chained call would fail on None.
df = pd.DataFrame([{
    "ticket_id": ticket["id"],
    "message_count": len(ticket.get("messages") or []),
    "customer_name": (ticket.get("customer") or {}).get("name"),
    # Messages are stored as a JSON string so the column stays a plain scalar
    "messages_json": json.dumps([
        {
            "id": msg["id"],
            "body_text": msg.get("body_text"),
            "from_agent": msg.get("from_agent", False),
            # Only an explicit public == False marks an internal note
            "is_internal_note": msg.get("public") is False
        }
        for msg in (ticket.get("messages") or [])
    ], ensure_ascii=False),
    "tags": [tag["name"] for tag in (ticket.get("tags") or [])],
    # NOTE(review): takes the value of the FIRST custom field and assumes it is
    # the contact reason - confirm against the Gorgias custom field setup.
    # Yields None when there are no custom fields or the field has no "value".
    "contact_reason": next(
        iter((ticket.get("custom_fields") or {}).values()), {}
    ).get("value")
} for ticket in detailed_tickets])

print(f"✓ DataFrame met {len(df)} tickets")

✓ DataFrame met 100 tickets


## Stap 2: Contact Reason Analyse

Nu we meer data hebben, analyseren we de classificaties om te valideren.

In [28]:
# Contact reason distribution; missing values are counted as their own row
print("=== Contact Reason Distributie ===\n")

contact_reason_column = df["contact_reason"]
reason_counts = contact_reason_column.value_counts(dropna=False)
print(reason_counts)

# Summary figures, printed as one block
summary_lines = [
    "\n--- Statistieken ---",
    f"Totaal tickets: {len(df)}",
    f"Unieke categorieën: {contact_reason_column.nunique()}",
    f"Ontbrekend (None): {contact_reason_column.isna().sum()}",
]
print("\n".join(summary_lines))

=== Contact Reason Distributie ===

contact_reason
Other::No Reply::Other                    43
None                                      20
Return::Request::Other                     9
Order::Status::Other                       8
Order Support::Return Order                6
Order Support::Order Delay                 4
Product::Details::Other                    2
Other                                      2
Order::Cancel::Other                       1
Customer Service::Information provided     1
Wholesale::Information::Other              1
Order::Refund::Other                       1
Order::Wrong Item::Other                   1
Exchange::Request::Other                   1
Name: count, dtype: int64

--- Statistieken ---
Totaal tickets: 100
Unieke categorieën: 13
Ontbrekend (None): 20


### Observaties:
1. **43% "No Reply"** - Grote groep, mogelijk automatisch gelabeld
2. **20% zonder classificatie** - Moeten we handmatig bekijken
3. **Hiërarchische structuur** - `Category::Subcategory::Detail` format

Laten we de tickets zonder classificatie en de "Other" tickets bekijken om de kwaliteit te beoordelen.

In [29]:
# Inspect tickets WITHOUT a contact_reason: what do they contain?
print("=== Tickets zonder contact_reason ===\n")

no_reason = df[df["contact_reason"].isna()]
for _, row in no_reason.head(5).iterrows():
    # messages_json holds the ticket's messages serialized as a JSON list
    messages = json.loads(row["messages_json"])
    # First message that is neither from an agent nor an internal note.
    # body_text can be null in the JSON, so fall back to "" before slicing
    # instead of raising TypeError on None[:200].
    first_customer_msg = next(
        ((m["body_text"] or "")[:200] for m in messages if not m["from_agent"] and not m["is_internal_note"]),
        "Geen klant bericht gevonden"
    )
    print(f"Ticket {row['ticket_id']} | Tags: {row['tags']}")
    print(f"  Bericht: {first_customer_msg}...")
    print("-" * 80)

=== Tickets zonder contact_reason ===

Ticket 37541150 | Tags: []
  Bericht: Find attached the DMARC Aggregate Report....
--------------------------------------------------------------------------------
Ticket 37541127 | Tags: []
  Bericht: Find attached the DMARC Aggregate Report....
--------------------------------------------------------------------------------
Ticket 37541116 | Tags: []
  Bericht: Find attached the DMARC Aggregate Report....
--------------------------------------------------------------------------------
Ticket 37536206 | Tags: ['store-mivero']
  Bericht: ...
--------------------------------------------------------------------------------
Ticket 37536200 | Tags: ['store-mivero']
  Bericht: ...
--------------------------------------------------------------------------------


In [30]:
# Validation: do the classifications agree with the message content?
# Print a small sample of first customer messages per category.

print("=== Classificatie Validatie Steekproef ===\n")

# Categories picked for the spot check
categories_to_check = [
    "Order Support::Order Delay",
    "Order Support::Return Order",
    "Order::Wrong Item::Other"
]

for cat in categories_to_check:
    subset = df[df["contact_reason"] == cat]
    # Skip categories that do not occur in the current sample
    if len(subset) == 0:
        continue

    print(f"\n📌 {cat} ({len(subset)} tickets)")
    print("=" * 60)

    for _, row in subset.head(2).iterrows():
        messages = json.loads(row["messages_json"])
        # First message that is neither from an agent nor an internal note.
        # body_text can be null in the JSON, so fall back to "" before slicing
        # instead of raising TypeError on None[:300].
        first_msg = next(
            ((m["body_text"] or "")[:300] for m in messages if not m["from_agent"] and not m["is_internal_note"]),
            "Geen bericht"
        )
        print(f"\nTicket {row['ticket_id']}:")
        print(f"  {first_msg}...")
        print("-" * 60)

=== Classificatie Validatie Steekproef ===


📌 Order Support::Order Delay (4 tickets)

Ticket 37518935:
  Hii!! I didn’t receive my order yet ?? 

On Mon, Jan 12, 2026 at 07:23 Mivero < info@mivero.nl> wrote: (mailto:info@mivero.nl) 
 
  
   
    
   
  
 Mivero (https://mivero.nl/_t/c/v3/AABmxT23mt6grBIc6WTTXQKA9Z6TXzWKXqhz8_Jq9uGzJ4RRosPuV6pvQTYtOa-KznzwBZ1BC9v1cy4YCaS469tKuQ0-X-eZ1tJICVWWrDH2Is7ZkVDg...
------------------------------------------------------------

Ticket 37387356:
  Hoi ik kan mij bestelling niet openen op de mail. Het duurt nu best lang. En vraag me af of het nog wel komt....
------------------------------------------------------------

📌 Order Support::Return Order (6 tickets)

Ticket 37526345:
  Hallo, 

Ik heb deze jasjes besteld volgens de maat tabel en toch zijn ze veel te klein. Hoe kan ik deze retour zenden en mijn geld terug krijgen? 

Ik hoor het graag. 

Met vriendelijke groet, 
Miranda Baas 

Verzonden vanaf mijn Galaxy 

 
-------- Oorspronkelijk berich

### 🔍 Validatie Bevindingen

| Categorie | Sample | Correct? | Opmerking |
|-----------|--------|----------|-----------|
| Order Support::Order Delay | "didn't receive my order yet" | ✅ | Klopt - vertraging |
| Order Support::Return Order | "te klein, retour zenden" | ✅ | Klopt - retour verzoek |
| **Order::Wrong Item::Other** | "bestelling nog niet binnen" | ❌ | **Fout!** Dit is Order Delay, niet Wrong Item |

**Probleem gevonden:** Ticket 37425200 is verkeerd geclassificeerd als "Wrong Item" terwijl het "Order Delay" is.

Laten we een systematische check doen om meer misclassificaties te vinden.

In [31]:
# Build the dataset used to review/correct classifications:
# only tickets that already have a contact_reason, copied so df stays untouched.
# It is exported to CSV further down for manual validation.

has_contact_reason = df["contact_reason"].notna()
validation_df = df.loc[has_contact_reason].copy()

# Helper that extracts the first customer message of a ticket for review
def get_first_customer_message(messages_json):
    """Return the first customer-written message from a JSON message list.

    Args:
        messages_json: JSON string holding a list of message dicts with the
            keys "from_agent", "is_internal_note" and "body_text".

    Returns:
        The body text of the first message that is neither from an agent nor
        an internal note, with newlines replaced by spaces, stripped and cut
        to 300 characters. Returns "" when the input cannot be parsed, is not
        a list, contains no such message, or that message has no text body.
    """
    try:
        messages = json.loads(messages_json)
    except (TypeError, ValueError):
        # TypeError: input is not str/bytes (e.g. None/NaN);
        # ValueError: invalid JSON (json.JSONDecodeError subclasses it).
        return ""

    if not isinstance(messages, list):
        return ""

    for msg in messages:
        if not isinstance(msg, dict):
            continue
        if msg.get("from_agent") or msg.get("is_internal_note"):
            continue
        body = msg.get("body_text")
        text = body if isinstance(body, str) else ""
        # Flatten to one line and shorten for display
        return text.replace("\n", " ").strip()[:300]
    return ""

# Derive the first customer message per ticket for the reviewer
validation_df["first_customer_message"] = [
    get_first_customer_message(raw_messages)
    for raw_messages in validation_df["messages_json"]
]

# Empty review columns, filled in manually:
# corrected_reason = the right category, is_correct = ja/nee
validation_df["corrected_reason"] = ""
validation_df["is_correct"] = ""

# Columns shown to (and exported for) the reviewer
review_cols = [
    "ticket_id",
    "contact_reason",
    "first_customer_message",
    "tags",
    "is_correct",
    "corrected_reason",
]
validation_df.loc[:, review_cols].head(10)

Unnamed: 0,ticket_id,contact_reason,first_customer_message,tags,is_correct,corrected_reason
0,37570741,Order::Status::Other,"Goedendag, Ik probeer de link om de bestelli...","[ORDER-STATUS, urgent, store-mivero]",,
1,37563346,Return::Request::Other,Beste Het jasje dat ik bij jullie bestelde ...,[store-mivero],,
2,37546250,Return::Request::Other,Nieuw klantbericht op 18 januari 2026 om 02:04...,"[feedback, negative, urgent, store-mivero]",,
3,37541170,Other::No Reply::Other,Find attached the DMARC Aggregate Report.,"[auto-close, non-support-related, store-mivero]",,
7,37540823,Other::No Reply::Other,Stop paying $2000+ a year. Swap your subscript...,"[auto-close, non-support-related, store-mivero]",,
10,37535646,Other::No Reply::Other,"Hallo mivero, Alles is naar wens. Mand is bin...","[auto-close, non-support-related, store-mivero]",,
11,37531785,Other::No Reply::Other,Rating: 5 | Review Text: Makkelijk in gebruik....,"[auto-close, non-support-related, store-mivero]",,
13,37526345,Order Support::Return Order,"Hallo, Ik heb deze jasjes besteld volgens de...","[RETURN/EXCHANGE, store-mivero]",,
14,37518935,Order Support::Order Delay,Hii!! I didn’t receive my order yet ?? On Mo...,"[ORDER-STATUS, feedback, negative, store-mivero]",,
15,37518200,Product::Details::Other,Productvraag ------------------------------- S...,[store-mivero],,


In [32]:
# Write the review columns to CSV for manual validation
output_path = "classification_validation.csv"
review_export = validation_df[review_cols]
review_export.to_csv(output_path, index=False, encoding="utf-8-sig")

print(f"✓ Validatie dataset geëxporteerd naar: {output_path}")
print(f"  {len(validation_df)} tickets om te reviewen")

# Reviewer instructions, printed as one block
review_instructions = (
    "\nInstructies:",
    "  1. Open classification_validation.csv in Excel",
    "  2. Bekijk 'first_customer_message' en 'contact_reason'",
    "  3. Vul 'is_correct' in: ja of nee",
    "  4. Bij 'nee', vul de juiste categorie in bij 'corrected_reason'",
)
print("\n".join(review_instructions))

✓ Validatie dataset geëxporteerd naar: classification_validation.csv
  80 tickets om te reviewen

Instructies:
  1. Open classification_validation.csv in Excel
  2. Bekijk 'first_customer_message' en 'contact_reason'
  3. Vul 'is_correct' in: ja of nee
  4. Bij 'nee', vul de juiste categorie in bij 'corrected_reason'


## Stap 3: Keyword Extractie voor Classificatie

We extraheren keywords uit de berichten die kenmerkend zijn per categorie. Deze kunnen we later gebruiken voor automatische classificatie door de AI agent.

In [33]:
# Keywords per category, identified by hand from the ticket data.
# Intended for later use in automatic classification.
#
# Structure: {category: {group: [keywords]}} where group is a language code
# ("nl", "en") or "system" for markers of automated mails.
# The order of the categories matters: suggest_classification sorts with a
# stable sort, so on equal scores the category listed first here wins.

CATEGORY_KEYWORDS = {
    "Order Support::Order Delay": {
        "nl": ["niet ontvangen", "nog niet binnen", "wacht", "lang", "wanneer", "track", "trace", "tracking", "werkt niet", "duurt", "blijft"],
        "en": ["didn't receive", "not received", "waiting", "where is"]
    },
    "Order Support::Return Order": {
        "nl": ["retour", "terugsturen", "te klein", "te groot", "ruilen", "retouradres", "retourlabel", "geld terug"],
        "en": ["return", "too small", "too big", "exchange"]
    },
    "Return::Request::Other": {
        "nl": ["retourneren", "terugsturen", "te klein", "te groot", "retour", "terug", "past niet"],
        "en": ["return", "send back"]
    },
    "Exchange::Request::Other": {
        "nl": ["ruilen", "omruilen", "andere maat", "verkeerde maat"],
        "en": ["exchange", "swap", "different size"]
    },
    "Order::Status::Other": {
        "nl": ["status", "waar blijft", "tracking", "volgen", "wanneer", "geleverd", "bezorging"],
        "en": ["status", "where is", "tracking", "when"]
    },
    "Order::Cancel::Other": {
        # NOTE(review): "anuleren" looks like a deliberately included misspelling - confirm
        "nl": ["annuleren", "cancel", "anuleren", "niet meer nodig"],
        "en": ["cancel"]
    },
    "Order::Wrong Item::Other": {
        "nl": ["verkeerd", "fout", "andere", "niet besteld", "niet wat ik"],
        "en": ["wrong", "incorrect", "not what I ordered"]
    },
    "Order::Refund::Other": {
        "nl": ["terugbetaling", "geld terug", "refund"],
        "en": ["refund", "money back"]
    },
    "Product::Details::Other": {
        "nl": ["vraag", "maat", "formaat", "past", "welke", "geschikt"],
        "en": ["question", "size", "which", "suitable"]
    },
    "Other::No Reply::Other": {
        "nl": ["dank", "bedankt", "ontvangen", "alles goed"],
        "en": ["thank", "received", "all good"],
        "system": ["DMARC", "Rating:", "PostNL", "Trustpilot", "review"]
    }
}

print("✓ Keywords dictionary aangemaakt")
print(f"  {len(CATEGORY_KEYWORDS)} categorieën gedefinieerd")

✓ Keywords dictionary aangemaakt
  10 categorieën gedefinieerd


In [34]:
# Suggest a classification for a message based on keyword matches
def suggest_classification(message_text, category_keywords=None):
    """Rank categories by how many of their keywords occur in a message.

    Matching is a case-insensitive substring test. A keyword that is listed
    under more than one group of the same category (e.g. "cancel" under both
    "nl" and "en") is counted once, so duplicates do not inflate the score.

    Args:
        message_text: The message to analyse. Empty or non-string values
            (None, NaN, ...) yield an empty result.
        category_keywords: Optional mapping {category: {group: [keywords]}}.
            Defaults to the module-level CATEGORY_KEYWORDS.

    Returns:
        A list of (category, score, matched_keywords) tuples, highest score
        first. Categories with equal scores keep the order of the mapping.
        Categories without any match are omitted.
    """
    if not message_text or not isinstance(message_text, str):
        return []

    if category_keywords is None:
        category_keywords = CATEGORY_KEYWORDS

    text_lower = message_text.lower()
    suggestions = []

    for category, keyword_groups in category_keywords.items():
        matched_keywords = []
        seen = set()

        # Check every language/group of this category
        for keywords in keyword_groups.values():
            for keyword in keywords:
                needle = keyword.lower()
                if needle in seen:
                    continue  # same keyword already handled in another group
                seen.add(needle)
                if needle in text_lower:
                    matched_keywords.append(keyword)

        if matched_keywords:
            # Score = number of distinct keywords found
            suggestions.append((category, len(matched_keywords), matched_keywords))

    # Highest score first; list.sort is stable, so ties keep mapping order
    suggestions.sort(key=lambda item: item[1], reverse=True)
    return suggestions

# Try the function on a few hand-written example messages
test_messages = [
    "Ik heb mijn bestelling nog niet ontvangen, waar blijft het?",
    "Dit jasje is te klein, kan ik het terugsturen?",
    "Ik wil graag mijn bestelling annuleren"
]

print("=== Keyword Matching Test ===\n")
for example in test_messages:
    ranked = suggest_classification(example)
    print(f"Bericht: {example[:60]}...")
    if ranked:
        # Only the best-scoring suggestion is shown
        best_category = ranked[0][0]
        best_keywords = ranked[0][2]
        print(f"  → Suggestie: {best_category}")
        print(f"    Keywords: {best_keywords}")
    print()

=== Keyword Matching Test ===

Bericht: Ik heb mijn bestelling nog niet ontvangen, waar blijft het?...
  → Suggestie: Order Support::Order Delay
    Keywords: ['niet ontvangen', 'blijft']

Bericht: Dit jasje is te klein, kan ik het terugsturen?...
  → Suggestie: Return::Request::Other
    Keywords: ['terugsturen', 'te klein', 'terug']

Bericht: Ik wil graag mijn bestelling annuleren...
  → Suggestie: Order::Cancel::Other
    Keywords: ['annuleren']



In [35]:
# Helpers used to add keyword-based suggestions to validation_df
def get_suggested_category(message):
    """Return the best-scoring category for `message`, or "" when nothing matched."""
    ranked = suggest_classification(message)
    # First entry is the best match; its first element is the category name
    return ranked[0][0] if ranked else ""

def get_matched_keywords(message):
    """Return the best match's keywords joined with ", ", or "" when nothing matched."""
    ranked = suggest_classification(message)
    if not ranked:
        return ""
    best_keywords = ranked[0][2]
    return ", ".join(best_keywords)

# Add the keyword-based suggestion and its matched keywords per ticket
first_messages = validation_df["first_customer_message"]
validation_df["suggested_reason"] = first_messages.apply(get_suggested_category)
validation_df["matched_keywords"] = first_messages.apply(get_matched_keywords)

# Flag rows where the suggestion equals the current classification
validation_df["match"] = validation_df["contact_reason"].eq(validation_df["suggested_reason"])

has_suggestion = validation_df["suggested_reason"].ne("")
agrees = validation_df["match"]

print("=== Keyword Suggestie Analyse ===\n")
print(f"Tickets met suggestie: {has_suggestion.sum()}/{len(validation_df)}")
print(f"Suggestie = huidige classificatie: {agrees.sum()}")
# Note: this count also includes tickets for which no suggestion was found
print(f"Suggestie ≠ huidige classificatie: {(~agrees).sum()}")
print("\n--- Potentiële misclassificaties ---")

=== Keyword Suggestie Analyse ===

Tickets met suggestie: 67/80
Suggestie = huidige classificatie: 43
Suggestie ≠ huidige classificatie: 37

--- Potentiële misclassificaties ---


In [36]:
# Potential misclassifications: a suggestion exists AND it differs from the current label
mismatch_columns = [
    "ticket_id",
    "contact_reason",
    "suggested_reason",
    "matched_keywords",
    "first_customer_message",
]
is_mismatch = (validation_df["suggested_reason"] != "") & ~validation_df["match"]
mismatches = validation_df.loc[is_mismatch, mismatch_columns]

print(f"=== {len(mismatches)} Potentiële Misclassificaties ===\n")
separator = "-" * 70
for _, candidate in mismatches.head(10).iterrows():
    print(f"Ticket {candidate['ticket_id']}")
    print(f"  Huidige:   {candidate['contact_reason']}")
    print(f"  Suggestie: {candidate['suggested_reason']}")
    print(f"  Keywords:  {candidate['matched_keywords']}")
    print(f"  Bericht:   {candidate['first_customer_message'][:100]}...")
    print(separator)

=== 24 Potentiële Misclassificaties ===

Ticket 37546250
  Huidige:   Return::Request::Other
  Suggestie: Product::Details::Other
  Keywords:  maat
  Bericht:   Nieuw klantbericht op 18 januari 2026 om 02:04       Je hebt een nieuw bericht ontvangen via het con...
----------------------------------------------------------------------
Ticket 37540823
  Huidige:   Other::No Reply::Other
  Suggestie: Exchange::Request::Other
  Keywords:  swap
  Bericht:   Stop paying $2000+ a year. Swap your subscription for one lifetime payment. ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ͏ ...
----------------------------------------------------------------------
Ticket 37514518
  Huidige:   Other::No Reply::Other
  Suggestie: Order::Status::Other
  Keywords:  volgen
  Bericht:   Dank je wel !   Met vriendelijke groeten,   Tamara Hesseling   Op di 13 jan 2026, 08:31 schreef Info...
----------------------------------------------------------------------
Ticket 37509221
  Huidige:   Return::Request::Other
  Suggestie: Order Sup

In [37]:
# Export the review CSV again, now including the keyword-based suggestions.
# Same file name as the earlier export, so that file is overwritten.
export_cols = [
    "ticket_id",
    "contact_reason",          # current classification
    "suggested_reason",        # keyword-based suggestion
    "matched_keywords",        # keywords that were found
    "first_customer_message",
    "tags",
    "is_correct",              # empty - filled in manually
    "corrected_reason",        # empty - filled in manually
]

output_file = "classification_validation.csv"
export_frame = validation_df[export_cols]
export_frame.to_csv(output_file, index=False, encoding="utf-8-sig")

report_lines = [
    f"✓ Verbeterde validatie CSV geëxporteerd: {output_file}",
    f"  {len(validation_df)} tickets",
    "\nNieuwe kolommen toegevoegd:",
    "  • suggested_reason - AI suggestie op basis van keywords",
    "  • matched_keywords - Welke keywords gevonden zijn",
    "\n📋 Open het bestand om te valideren!",
]
for report_line in report_lines:
    print(report_line)

✓ Verbeterde validatie CSV geëxporteerd: classification_validation.csv
  80 tickets

Nieuwe kolommen toegevoegd:
  • suggested_reason - AI suggestie op basis van keywords
  • matched_keywords - Welke keywords gevonden zijn

📋 Open het bestand om te valideren!


## Stap 4: Referentie Categorieën (uit SOPs)

Gebruik deze categorieën bij het corrigeren van classificaties:

| SOP Bestand | Categorie | Keywords |
|-------------|-----------|----------|
| order-delay.md | Order Delay | vertraagd, nog niet ontvangen, waar is mijn bestelling |
| lost-in-transit.md | Lost in Transit | kwijt, verloren, niet aangekomen |
| manage-quality-issue*.md | Quality Issue / Damaged | beschadigd, kapot, kwaliteit |
| manage-wrong-order*.md | Wrong Order | verkeerd product, niet besteld |
| manage-wrong-size*.md | Wrong Size | te klein, te groot, maat klopt niet |
| custom-charge.md | Custom Charge | douane, invoerrechten |
| process-damaged-item*.md | Damaged Item | beschadigd bij ontvangst |
| out-of-stock*.md | Out of Stock | niet op voorraad |