# Dataset Simulation (Sanjida)

In [None]:
import os  # stdlib; imported here because this cell runs before the main import cell

# --- Run switches -----------------------------------------------------------
RUN_LLM = False                # True to load Mistral / generate seeds. Set False after seeds are produced.
RUN_LLM_STRUCT = False        # Intended switch for LLM-generated structured rows (the seed-building cell does not implement it yet)
# Hugging Face token: taken from the HF_TOKEN environment variable so the secret
# never has to be pasted into the notebook. The placeholder is only a fallback
# and is not a working credential.
HF_TOKEN = os.environ.get("HF_TOKEN") or "YOUR_TOKEN"

# --- Dataset parameters -----------------------------------------------------
N_FINAL = 1000                # final dataset size
SEED_STRUCT_N = 200           # structured sample rows for CTGAN training
URGENT_PROB = 0.35            # probability a ticket is urgent in generated samples
RNG_SEED = 42                 # reproducibility
OUT_CSV = "synthetic_support_tickets.csv"

In [None]:
# Install into the running kernel's environment: %pip targets the kernel's
# interpreter, whereas !pip runs whichever pip is first on the shell PATH.
%pip install sdv transformers accelerate bitsandbytes huggingface_hub --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.5/198.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m54.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.4/74.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.3/198.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import random, re, math
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# SDV / CTGAN imports
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# RNG: seed every random source the notebook draws from
rng = np.random.default_rng(RNG_SEED)
# The stdlib `random` module is also used for categorical picks in later cells;
# without its own seed those picks would differ on every run.
random.seed(RNG_SEED)

# LLM setup (guarded): these keep their defaults unless RUN_LLM is enabled
tokenizer = None
model = None
device = "cpu"

if RUN_LLM:
    # Heavy imports stay inside the guard so the notebook runs without them when the LLM is off
    import os
    import torch
    from huggingface_hub import login
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, logging as hf_logging

    # Seed torch as well so sampled generations are as repeatable as the backend allows
    torch.manual_seed(RNG_SEED)

    # login (programmatic)
    os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN
    try:
        login(token=HF_TOKEN)
    except Exception as exc:
        # Still best-effort (the notebook continues), but the failure is now visible
        # instead of being swallowed silently.
        print(f"Hugging Face login failed ({exc!r}); continuing without an authenticated session.")

    # Silence transformer info logs (prevents repeated pad messages)
    hf_logging.set_verbosity_error()

    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    bnb = BitsAndBytesConfig(load_in_4bit=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, token=os.environ.get("HUGGINGFACE_TOKEN"))
    # Ensure pad_token exists
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    # device_map="auto" has already placed the quantized weights, so the model is
    # not moved again: an explicit .to() on a bitsandbytes-quantized model is
    # redundant and is rejected by some transformers/bitsandbytes versions.
    # `device` is still set because the generation helper moves its inputs to it.
    device = "cuda" if torch.cuda.is_available() else "cpu"

# Safe generate that returns only the generated text (no prompt)
def generate_reply_only(prompt, max_new_tokens=100, retries=2):
    """Sample a completion for `prompt` and return just the newly generated text.

    The first attempt samples at temperature 0.8. While the decoded text is
    empty, up to `retries` further attempts are made at temperature 0.9.
    The result has all runs of whitespace collapsed to single spaces and may
    still be an empty string if every attempt came back empty.

    Raises AssertionError when the LLM is disabled or was not loaded.
    """
    assert RUN_LLM and model is not None and tokenizer is not None, "LLM not enabled"

    encoded = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    prompt_len = encoded["input_ids"].shape[1]

    def _sample(temperature):
        # One sampling pass; the prompt tokens are sliced off before decoding.
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            top_p=0.95,
        )
        completion_ids = generated[0, prompt_len:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    reply = _sample(0.8)

    attempts_left = retries
    while attempts_left > 0 and not reply:
        # Empty output: try again slightly hotter
        reply = _sample(0.9)
        attempts_left -= 1

    # Flatten newlines and repeated spaces into a single-line string
    return " ".join(reply.split())

tokenizer_config.json: 0.00B [00:00, ?B/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [None]:
# Vocabularies the synthetic rows are drawn from
first_names = [
    "Aisha", "Adam", "Nur", "Sanjida", "Hannah", "Ibrahim", "Amina",
    "Daniel", "Siti", "Ravi", "Wei", "Sara", "Amir", "Lina",
]
product_categories = [
    "Electronics",
    "Clothing",
    "Home",
    "Beauty",
    "Groceries",
]
issue_types = [
    "Payment",
    "Delivery",
    "Product",
    "Refund",
    "Other",
]

def make_structured_row(i, urgent_prob=URGENT_PROB):
    """Build one rule-based structured ticket row.

    Args:
        i: Row number, rendered as the zero-padded ticket id ``T00000``-style.
        urgent_prob: Probability that the row is flagged urgent.

    Returns:
        dict with the ten structured columns. Urgent rows always get priority
        "High" and a Poisson(2) previous-ticket count; other rows get "Low" or
        "Medium" and a Poisson(1) count.

    All random draws come from the seeded module-level ``rng`` so a fresh run
    with the same seed draws the same values (the previous ``random.choice``
    calls used the unseeded stdlib generator). The timestamp is still relative
    to the current time, so absolute dates shift from run to run.
    """
    urgent = int(rng.random() < urgent_prob)

    # Ticket time: up to 89 days and up to one day of seconds before "now"
    age = timedelta(days=int(rng.integers(0, 90)), seconds=int(rng.integers(0, 86400)))
    ticket_dt_str = (datetime.now() - age).strftime("%Y-%m-%d %H:%M:%S")  # no microseconds

    return {
        "ticket_id": f"T{i:05d}",
        # endpoint=True makes 9999 reachable; the default upper bound is exclusive
        "customer_id": f"C{int(rng.integers(1000, 9999, endpoint=True))}",
        "ticket_date": ticket_dt_str,
        "account_age_days": int(rng.integers(30, 2000)),
        "num_prev_tickets": int(rng.poisson(2 if urgent else 1)),
        "avg_response_time_prev": round(float(rng.uniform(2, 48)), 2),
        # rng.choice returns numpy string scalars; str() keeps plain Python strings in the row
        "product_category": str(rng.choice(product_categories)),
        "issue_type": str(rng.choice(issue_types)),
        "priority": "High" if urgent else str(rng.choice(["Low", "Medium"])),
        "urgent_flag": urgent,
    }

# Build the structured seed sample
if RUN_LLM_STRUCT and RUN_LLM:
    # LLM-driven structured rows were never implemented; report that instead of
    # silently ignoring the switch.
    print("RUN_LLM_STRUCT is set, but LLM-based structured rows are not implemented; "
          "using the rule-based generator instead.")

structured_sample = [make_structured_row(i) for i in range(SEED_STRUCT_N)]

df_seed_struct = pd.DataFrame(structured_sample)
print("Structured seed shape:", df_seed_struct.shape)
df_seed_struct.head()

Structured seed shape: (200, 10)


Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,urgent_flag
0,T00000,C4896,2025-11-06 07:23:26,1721,1,46.88,Beauty,Delivery,Medium,0
1,T00001,C5618,2025-10-30 23:03:30,282,1,44.63,Electronics,Refund,Low,0
2,T00002,C5908,2025-11-27 22:10:39,903,0,27.51,Electronics,Other,Medium,0
3,T00003,C3490,2025-10-17 22:03:38,1274,5,23.47,Clothing,Delivery,High,1
4,T00004,C7689,2025-11-15 14:13:15,1375,3,23.6,Clothing,Other,High,1


In [None]:
# Columns handed to CTGAN: every structured field except ticket_id
ctgan_columns = [
    "customer_id",
    "ticket_date",
    "account_age_days",
    "num_prev_tickets",
    "avg_response_time_prev",
    "product_category",
    "issue_type",
    "priority",
    "urgent_flag",
]

# Independent copy so later edits never touch the seed frame
df_ctgan_train = df_seed_struct.loc[:, ctgan_columns].copy()

# For CTGAN, ticket_date should be convertible to a numeric or categorical representation.
# Option: convert ticket_date to "days_ago" integer for CTGAN, then convert back later.
def date_to_days_ago(dt_str):
    """Return how many whole days lie between `dt_str` and the current time.

    `dt_str` must be formatted as ``YYYY-MM-DD HH:MM:SS``; the partial day is
    discarded.
    """
    elapsed = datetime.now() - datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
    return elapsed.days

# Replace the timestamp string with an integer age in days (converted back after
# sampling) and pin the numeric dtypes in one step.
df_ctgan_train = (
    df_ctgan_train
    .assign(days_ago=df_ctgan_train["ticket_date"].apply(date_to_days_ago))
    .drop(columns=["ticket_date"])
    .astype({
        "account_age_days": int,
        "num_prev_tickets": int,
        "avg_response_time_prev": float,
        "urgent_flag": int,
    })
)

# Build metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_ctgan_train)

# Train CTGAN
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(df_ctgan_train)

# Top the seed rows up to N_FINAL. If the seed sample already covers N_FINAL
# (SEED_STRUCT_N >= N_FINAL) there is nothing to sample, so the sampler is not
# asked for a zero or negative number of rows.
to_generate = N_FINAL - len(df_ctgan_train)
if to_generate > 0:
    print(f"Generating {to_generate} synthetic structured rows with CTGAN...")
    synthetic_struct = ctgan.sample(to_generate)
    # Combine seed + synthetic
    df_struct_all = pd.concat([df_ctgan_train, synthetic_struct], ignore_index=True)
else:
    print(f"Seed sample already has {len(df_ctgan_train)} rows (N_FINAL={N_FINAL}); skipping CTGAN sampling.")
    synthetic_struct = df_ctgan_train.iloc[0:0].copy()
    df_struct_all = df_ctgan_train.head(N_FINAL).reset_index(drop=True)
# Reconstruct ticket_date from days_ago (randomize time-of-day)
def days_ago_to_date(days):
    """Turn an age in days into a ``YYYY-MM-DD HH:MM:SS`` timestamp string.

    The result lies `days` days before the current time, pushed back by a
    further random number of seconds (less than one day) drawn from the
    module-level ``rng`` so the time of day varies.
    """
    offset = timedelta(days=int(days), seconds=int(rng.integers(0, 86400)))
    return (datetime.now() - offset).strftime("%Y-%m-%d %H:%M:%S")

# Swap the numeric age back for a formatted timestamp string
df_struct_all["ticket_date"] = df_struct_all["days_ago"].apply(days_ago_to_date)

# Drop the helper column and restore the canonical column order. ctgan_columns
# (defined at the top of this cell) is reused rather than repeating the list, so
# the two can no longer drift apart.
df_struct_all = df_struct_all.drop(columns=["days_ago"])[ctgan_columns]

# Assign ticket_id sequentially (T00000, T00001, ...) as the first column
ticket_ids = [f"T{n:05d}" for n in range(len(df_struct_all))]
df_struct_all.insert(0, "ticket_id", ticket_ids)

print("Combined structured dataset shape:", df_struct_all.shape)
df_struct_all.head()



Generating 800 synthetic structured rows with CTGAN...
Combined structured dataset shape: (1000, 10)


Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,urgent_flag
0,T00000,C4896,2025-11-06 02:10:45,1721,1,46.88,Beauty,Delivery,Medium,0
1,T00001,C5618,2025-10-30 19:14:40,282,1,44.63,Electronics,Refund,Low,0
2,T00002,C5908,2025-11-27 21:36:16,903,0,27.51,Electronics,Other,Medium,0
3,T00003,C3490,2025-10-18 08:31:02,1274,5,23.47,Clothing,Delivery,High,1
4,T00004,C7689,2025-11-15 15:26:34,1375,3,23.6,Clothing,Other,High,1


In [None]:
rows = []

# File holding the text-augmented dataset (same name as before, so existing readers still find it)
TICKETS_CSV = "synthetic_support_tickets_unique.csv"

PROMPT_TEMPLATE = """You are a customer writing a support ticket for an e-commerce platform.
Tone: {tone}
Context: {issue_type} issue for a {product_category} item.
Write a 1–2 sentence realistic ticket (avoid placeholders like [Your Name] or [Order number])."""

# Placeholders the model may still emit despite the prompt; matched
# case-insensitively so variants such as "[your name]" are caught too.
NAME_PLACEHOLDER = re.compile(r"\[Your Name\]|\{Your Name\}", re.IGNORECASE)
ORDER_PLACEHOLDER = re.compile(r"\[Order number\]|\{Order number\}", re.IGNORECASE)

if RUN_LLM:
    # One LLM call per structured row; the tone follows the urgent flag.
    for r in df_struct_all.reset_index(drop=True).to_dict("records"):
        urgent = int(r["urgent_flag"])
        tone = "urgent and frustrated" if urgent == 1 else "calm and non-urgent"
        prompt = PROMPT_TEMPLATE.format(tone=tone, issue_type=r["issue_type"], product_category=r["product_category"])

        # Generate ticket_text per row
        ticket_text = generate_reply_only(prompt, max_new_tokens=100)

        # Fill any leftover placeholders. first_names comes from the helper-lists
        # cell; the seeded rng is used so the substitutions are repeatable.
        name = str(rng.choice(first_names))
        order_no = f"ORD{int(rng.integers(100000, 999999))}"
        ticket_text = NAME_PLACEHOLDER.sub(name, ticket_text)
        ticket_text = ORDER_PLACEHOLDER.sub(order_no, ticket_text)
        ticket_text = " ".join(ticket_text.split())

        rows.append({
            "ticket_id": r["ticket_id"],
            "customer_id": r["customer_id"],
            "ticket_date": r["ticket_date"],
            "account_age_days": int(r["account_age_days"]),
            "num_prev_tickets": int(r["num_prev_tickets"]),
            "avg_response_time_prev": float(r["avg_response_time_prev"]),
            "product_category": r["product_category"],
            "issue_type": r["issue_type"],
            "priority": r["priority"],
            "ticket_text": ticket_text,
            "urgent_flag": urgent,
        })

    df_final = pd.DataFrame(rows)
    df_final.to_csv(TICKETS_CSV, index=False)
    # Report what was actually written instead of a hard-coded count / uniqueness claim
    n_unique = df_final["ticket_text"].nunique() if len(df_final) else 0
    print(f"Saved {len(df_final)} tickets ({n_unique} unique texts) in {TICKETS_CSV}")
else:
    # LLM disabled: generate_reply_only would fail its "LLM not enabled" check,
    # so reuse the tickets written by an earlier run with RUN_LLM = True.
    # pd.read_csv raises FileNotFoundError if that file has not been produced yet.
    df_final = pd.read_csv(TICKETS_CSV)
    print(f"RUN_LLM is False: loaded {len(df_final)} previously generated tickets from {TICKETS_CSV}")

Saved 1000 fully unique tickets in synthetic_support_tickets_unique.csv


In [None]:
# Preview the first rows of the final ticket dataset (shown as the cell's output)
df_final.head()

Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,ticket_text,urgent_flag
0,T00000,C4896,2025-11-06 02:10:45,1721,1,46.88,Beauty,Delivery,Medium,Subject: Beauty item delivery issue Dear Suppo...,0
1,T00001,C5618,2025-10-30 19:14:40,282,1,44.63,Electronics,Refund,Low,I purchased an Electronics item on your websit...,0
2,T00002,C5908,2025-11-27 21:36:16,903,0,27.51,Electronics,Other,Medium,"""I recently purchased an Electronics item from...",0
3,T00003,C3490,2025-10-18 08:31:02,1274,5,23.47,Clothing,Delivery,High,Subject: Urgent Delivery Issue for Clothing It...,1
4,T00004,C7689,2025-11-15 15:26:34,1375,3,23.6,Clothing,Other,High,Subject: Urgent: Problem with Clothing item - ...,1


# Simple EDA (Sanjida)

# Feature Engineering 1 (Syarifah)

# Feature Engineering 2 (Bushra)

# Decision Tree (Sanjida)

# Logistic Regression (Syarifah)

# Random Forest (Bushra)

# XGBoost (Adhia)

# k-NN (Hani)

# Model Comparison (Adhia)

# LLM Interpretation & Insights (Hani)