# Dataset Simulation (Sanjida)

In [None]:
import os

# --- Run switches -----------------------------------------------------------
RUN_LLM = False                # True loads Mistral and lets generate_reply_only() run; False skips the model load.
RUN_LLM_STRUCT = False        # Reserved switch for LLM-produced structured rows (the seed cell only has a placeholder for it).

# Read the Hugging Face token from the environment so a real token never has to
# be pasted into the notebook; the placeholder is kept as the fallback value.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_TOKEN")

# --- Dataset parameters -----------------------------------------------------
N_FINAL = 1000                # final dataset size (seed rows + CTGAN rows)
SEED_STRUCT_N = 200           # structured sample rows for CTGAN training
URGENT_PROB = 0.35            # probability a ticket is urgent in generated samples
RNG_SEED = 42                 # seed for the NumPy random generator
# NOTE(review): the generation cell writes a hard-coded file name and does not read OUT_CSV.
OUT_CSV = "synthetic_support_tickets.csv"

In [None]:
!pip install sdv transformers accelerate bitsandbytes huggingface_hub --quiet

In [None]:
import random, re, math
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# SDV / CTGAN imports
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata

# RNG: seed both NumPy's Generator and the stdlib `random` module, because the
# data-generation cells draw from both (rng.* and random.choice).
rng = np.random.default_rng(RNG_SEED)
random.seed(RNG_SEED)

# LLM setup (guarded): tokenizer/model stay None unless RUN_LLM is True, in which
# case generate_reply_only() below is allowed to run.
tokenizer = None
model = None
device = "cpu"

if RUN_LLM:
    import os
    import torch
    from huggingface_hub import login
    from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, logging as hf_logging

    # login (programmatic)
    os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN
    try:
        login(token=HF_TOKEN)
    except Exception as exc:
        # Login stays best-effort (the token is also passed explicitly below),
        # but the failure is reported instead of being silently discarded.
        print(f"Warning: Hugging Face login failed ({exc!r}); continuing with the explicit token.")

    # Silence transformer info logs (prevents repeated pad messages)
    hf_logging.set_verbosity_error()

    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    bnb = BitsAndBytesConfig(load_in_4bit=True)

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, token=HF_TOKEN)
    # Ensure pad_token exists
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb,
        device_map="auto",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        token=HF_TOKEN,
    )
    # device_map="auto" has already placed the quantized weights, so the model is
    # not moved again with .to(); inputs are sent to wherever the model ended up.
    device = str(model.device)

# Safe generate that returns only the generated text (no prompt)
def generate_reply_only(prompt, max_new_tokens=100, retries=2):
    """Sample a completion for `prompt` and return only the newly generated text.

    The first attempt samples at temperature 0.8. If the decoded text is empty,
    up to `retries` further attempts are made at temperature 0.9. The result has
    all runs of whitespace (including newlines) collapsed to single spaces.

    Raises:
        AssertionError: if RUN_LLM is False or the model/tokenizer are not loaded.
    """
    assert RUN_LLM and model is not None and tokenizer is not None, "LLM not enabled"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
    # Number of prompt tokens; everything after this offset is model output.
    prompt_len = inputs["input_ids"].shape[1]

    text = ""
    # One initial attempt plus at most `retries` re-samples.
    for attempt in range(max(retries, 0) + 1):
        sampling_temperature = 0.8 if attempt == 0 else 0.9
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=sampling_temperature,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        new_token_ids = outputs[0, prompt_len:]
        text = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
        if text:
            break

    # Collapse whitespace/newlines into a single line
    return " ".join(text.split())

In [None]:
# Value pools the row generator samples from
first_names = [
    "Aisha", "Adam", "Nur", "Sanjida", "Hannah", "Ibrahim", "Amina",
    "Daniel", "Siti", "Ravi", "Wei", "Sara", "Amir", "Lina",
]
product_categories = [
    "Electronics",
    "Clothing",
    "Home",
    "Beauty",
    "Groceries",
]
issue_types = [
    "Payment",
    "Delivery",
    "Product",
    "Refund",
    "Other",
]

def make_structured_row(i, urgent_prob=URGENT_PROB):
    """Build one synthetic structured ticket row as a dict.

    Every random draw goes through the seeded NumPy generator `rng`, so the
    sequence of rows is repeatable for a fixed RNG_SEED (apart from
    `ticket_date`, which is offset from the current wall-clock time).

    Args:
        i: Row number, used to build the zero-padded `ticket_id`.
        urgent_prob: Probability that the row is flagged urgent.

    Returns:
        dict with ticket_id, customer_id, ticket_date ("%Y-%m-%d %H:%M:%S"),
        account_age_days, num_prev_tickets, avg_response_time_prev,
        product_category, issue_type, priority and urgent_flag. Urgent rows
        always get priority "High"; the others get "Low" or "Medium".
    """
    urgent = int(rng.random() < urgent_prob)
    ticket_dt = (datetime.now() - timedelta(days=int(rng.integers(0, 90)),
                                           seconds=int(rng.integers(0, 86400))))
    ticket_dt_str = ticket_dt.strftime("%Y-%m-%d %H:%M:%S")  # no microseconds
    return {
        "ticket_id": f"T{i:05d}",
        "customer_id": f"C{rng.integers(1000, 9999)}",
        "ticket_date": ticket_dt_str,
        "account_age_days": int(rng.integers(30, 2000)),
        # Urgent tickets draw their history from a Poisson with a higher mean.
        "num_prev_tickets": int(rng.poisson(2 if urgent else 1)),
        "avg_response_time_prev": round(float(rng.uniform(2, 48)), 2),
        # rng.choice returns a NumPy string scalar; str() keeps plain Python strings in the row.
        "product_category": str(rng.choice(product_categories)),
        "issue_type": str(rng.choice(issue_types)),
        "priority": "High" if urgent else str(rng.choice(["Low", "Medium"])),
        "urgent_flag": int(urgent)
    }

# Build the structured seed sample
if RUN_LLM_STRUCT and RUN_LLM:
    # Placeholder branch: no LLM-based structured generation is implemented here.
    pass

# One rule-based row per index, collected in a single pass
structured_sample = [make_structured_row(row_idx) for row_idx in range(SEED_STRUCT_N)]

df_seed_struct = pd.DataFrame(structured_sample)
print("Structured seed shape:", df_seed_struct.shape)
df_seed_struct.head()

In [None]:
# Columns CTGAN is trained on (ticket_id is excluded here)
ctgan_columns = [
    "customer_id", "ticket_date", "account_age_days",
    "num_prev_tickets", "avg_response_time_prev",
    "product_category", "issue_type", "priority", "urgent_flag",
]

# Independent copy so the seed frame itself is left untouched
df_ctgan_train = df_seed_struct.loc[:, ctgan_columns].copy()

# CTGAN gets ticket_date as an integer "days_ago" offset; it is converted back
# into a timestamp string after sampling.
def date_to_days_ago(dt_str):
    """Return the whole number of days between `dt_str` and the current time.

    `dt_str` must be formatted as "%Y-%m-%d %H:%M:%S". The value is the `.days`
    component of (now - parsed timestamp).
    """
    elapsed = datetime.now() - datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
    return elapsed.days

# Swap the timestamp string for an integer "days_ago" column and pin numeric dtypes
df_ctgan_train = (
    df_ctgan_train
    .assign(days_ago=df_ctgan_train["ticket_date"].apply(date_to_days_ago))
    .drop(columns=["ticket_date"])
    .astype({
        "account_age_days": int,
        "num_prev_tickets": int,
        "avg_response_time_prev": float,
        "urgent_flag": int,
    })
)

# Let SDV infer the column metadata from the prepared frame
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df_ctgan_train)

# Fit CTGAN on the seed rows
ctgan = CTGANSynthesizer(metadata)
ctgan.fit(df_ctgan_train)

# Sample however many rows are still needed to reach N_FINAL
to_generate = N_FINAL - len(df_ctgan_train)
print(f"Generating {to_generate} synthetic structured rows with CTGAN...")
synthetic_struct = ctgan.sample(to_generate)

# Seed rows first, sampled rows after, with a fresh 0..n-1 index
df_struct_all = pd.concat([df_ctgan_train, synthetic_struct], ignore_index=True)
# Turn a days_ago offset back into a ticket_date string
def days_ago_to_date(days):
    """Return "now minus `days` days" as a "%Y-%m-%d %H:%M:%S" string.

    A random number of seconds (0..86399, drawn from `rng`) is subtracted as
    well, so the time-of-day varies between rows.
    """
    secs = int(rng.integers(0, 86400))
    offset = timedelta(days=int(days), seconds=secs)
    return (datetime.now() - offset).strftime("%Y-%m-%d %H:%M:%S")

# Final column order of the structured table (ticket_id is prepended afterwards)
struct_column_order = [
    "customer_id", "ticket_date", "account_age_days",
    "num_prev_tickets", "avg_response_time_prev",
    "product_category", "issue_type", "priority", "urgent_flag",
]

# Rebuild ticket_date from days_ago, drop the helper column, then reorder
df_struct_all = (
    df_struct_all
    .assign(ticket_date=df_struct_all["days_ago"].apply(days_ago_to_date))
    .drop(columns=["days_ago"])
    .loc[:, struct_column_order]
)

# Sequential ids T00000, T00001, ... in row order, placed as the first column
sequential_ids = [f"T{i:05d}" for i in range(len(df_struct_all))]
df_struct_all.insert(0, "ticket_id", sequential_ids)

print("Combined structured dataset shape:", df_struct_all.shape)
df_struct_all.head()

In [None]:
# Ticket text is generated by the LLM only when RUN_LLM is True. With RUN_LLM
# False the CSV written by an earlier LLM run is reloaded instead, so this cell
# no longer fails on the "LLM not enabled" assertion in generate_reply_only().
FINAL_CSV = "synthetic_support_tickets_unique.csv"

rows = []

PROMPT_TEMPLATE = """You are a customer writing a support ticket for an e-commerce platform.
Tone: {tone}
Context: {issue_type} issue for a {product_category} item.
Write a 1–2 sentence realistic ticket (avoid placeholders like [Your Name] or [Order number])."""

# Leftover placeholders in [..] or {..}, matched regardless of letter case
NAME_PLACEHOLDER = re.compile(r"[\[{]\s*your name\s*[\]}]", re.IGNORECASE)
ORDER_PLACEHOLDER = re.compile(r"[\[{]\s*order number\s*[\]}]", re.IGNORECASE)

if RUN_LLM:
    for _, r in df_struct_all.reset_index(drop=True).iterrows():
        urgent = int(r["urgent_flag"])
        tone = "urgent and frustrated" if urgent == 1 else "calm and non-urgent"
        prompt = PROMPT_TEMPLATE.format(tone=tone, issue_type=r["issue_type"], product_category=r["product_category"])

        # Generate ticket_text per row
        ticket_text = generate_reply_only(prompt, max_new_tokens=100)

        # Replace any leftover placeholders; draws use the seeded generator `rng`
        # and the first_names list defined with the other helper lists.
        name = str(rng.choice(first_names))
        order_no = f"ORD{rng.integers(100000,999999)}"
        ticket_text = NAME_PLACEHOLDER.sub(name, ticket_text)
        ticket_text = ORDER_PLACEHOLDER.sub(order_no, ticket_text)
        ticket_text = " ".join(ticket_text.split())

        rows.append({
            "ticket_id": r["ticket_id"],
            "customer_id": r["customer_id"],
            "ticket_date": r["ticket_date"],
            "account_age_days": int(r["account_age_days"]),
            "num_prev_tickets": int(r["num_prev_tickets"]),
            "avg_response_time_prev": float(r["avg_response_time_prev"]),
            "product_category": r["product_category"],
            "issue_type": r["issue_type"],
            "priority": r["priority"],
            "ticket_text": ticket_text,
            "urgent_flag": urgent
        })

    df_final = pd.DataFrame(rows)
    df_final.to_csv(FINAL_CSV, index=False)
    # Report the real row count and how many texts are distinct, rather than a fixed claim.
    print(f"Saved {len(df_final)} tickets ({df_final['ticket_text'].nunique()} distinct texts) in {FINAL_CSV}")
else:
    # Raises FileNotFoundError if no earlier run has written the file yet.
    df_final = pd.read_csv(FINAL_CSV)
    print(f"RUN_LLM is False: loaded {len(df_final)} tickets from {FINAL_CSV}")

In [None]:
# Preview the first rows of the final ticket dataset
df_final.head()

# Simple EDA (Sanjida)

# Feature Engineering 1 (Syarifah)

In [3]:
# Colab-only helper for uploading a local file into the runtime
from google.colab import files

# Upload the ticket CSV; `uploaded` holds the return value of files.upload()
uploaded = files.upload()

Saving synthetic_support_tickets_unique.csv to synthetic_support_tickets_unique.csv


In [4]:
import pandas as pd

# Load the ticket CSV (same file name the text-generation cell writes)
df = pd.read_csv('synthetic_support_tickets_unique.csv')

# Preview the first rows to confirm the columns loaded as expected
df.head()

Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,ticket_text,urgent_flag
0,T00000,C4896,2025-11-06 02:10:45,1721,1,46.88,Beauty,Delivery,Medium,Subject: Beauty item delivery issue Dear Suppo...,0
1,T00001,C5618,2025-10-30 19:14:40,282,1,44.63,Electronics,Refund,Low,I purchased an Electronics item on your websit...,0
2,T00002,C5908,2025-11-27 21:36:16,903,0,27.51,Electronics,Other,Medium,"""I recently purchased an Electronics item from...",0
3,T00003,C3490,2025-10-18 08:31:02,1274,5,23.47,Clothing,Delivery,High,Subject: Urgent Delivery Issue for Clothing It...,1
4,T00004,C7689,2025-11-15 15:26:34,1375,3,23.6,Clothing,Other,High,Subject: Urgent: Problem with Clothing item - ...,1


In [5]:
# imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    roc_auc_score,
    classification_report
)

# Sentiment analysis
from textblob import TextBlob


In [6]:
# FE Task 1 — Sentiment Score

def compute_sentiment(text):
    """
    Sentiment polarity of `text` as computed by TextBlob.

    Missing values and blank/whitespace-only strings short-circuit to 0.0;
    otherwise the TextBlob polarity is returned.
    """
    has_content = not pd.isna(text) and text.strip() != ""
    if has_content:
        return TextBlob(text).sentiment.polarity
    return 0.0

# Score every ticket text element-wise
df["sentiment_score"] = df["ticket_text"].map(compute_sentiment)

# Summary statistics of the new feature
df["sentiment_score"].describe()

Unnamed: 0,sentiment_score
count,1000.0
mean,-0.071125
std,0.183227
min,-0.75
25%,-0.181027
50%,-0.072222
75%,0.0
max,0.775


In [7]:
# FE Task 1 — Urgent Keyword Flag

import re

urgent_keywords = [
    "urgent", "asap", "immediately", "critical",
    "important", "failure", "system down",
    "error", "cannot access", "crash", "down"
]

# One pre-compiled pattern for all keywords. A keyword must start at a word
# boundary and end at one, optionally after a common suffix, so "crashed" and
# "errors" still match while "download" or "terror" do not (plain substring
# search flagged those as urgent).
_URGENT_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in urgent_keywords) + r")(?:s|es|ed|ing|ly)?\b"
)

def urgent_flag(text):
    """
    Detect presence of urgency-related keywords.

    Matching is case-insensitive and works on whole words (plus the suffixes
    s/es/ed/ing/ly), not on arbitrary substrings.

    Returns 1 if an urgent keyword is found, else 0. Missing values return 0.
    """
    if pd.isna(text):
        return 0
    return int(_URGENT_PATTERN.search(str(text).lower()) is not None)

# Flag every ticket text element-wise
df["urgent_keywords_flag"] = df["ticket_text"].map(urgent_flag)

# How many tickets were flagged vs. not flagged
df["urgent_keywords_flag"].value_counts()


Unnamed: 0_level_0,count
urgent_keywords_flag,Unnamed: 1_level_1
0,656
1,344


In [8]:
# Target & Feature Selection

# Binary target: "High" priority is the positive class, "Low"/"Medium" the negative
priority_to_binary = {"Low": 0, "Medium": 0, "High": 1}
df["priority_binary"] = df["priority"].map(priority_to_binary)

# The two text-derived features used by the baseline model
features = ["sentiment_score", "urgent_keywords_flag"]

X = df[features]
y = df["priority_binary"]

# Show the first feature rows and the class balance
X.head(), y.value_counts()


(   sentiment_score  urgent_keywords_flag
 0         0.111111                     0
 1        -0.070000                     0
 2        -0.033333                     1
 3         0.058333                     1
 4        -0.051042                     1,
 priority_binary
 0    532
 1    468
 Name: count, dtype: int64)

# Feature Engineering 2 (Bushra)

In [11]:
# Account Age Bucket

def bucket_account_age(days):
    """Map an account age in days to "New" (<90), "Medium" (90..365) or "Long" (otherwise)."""
    if days < 90:
        return "New"
    return "Medium" if days <= 365 else "Long"

# Bucket every account age element-wise
df["account_age_category"] = df["account_age_days"].map(bucket_account_age)

# Customer Activity Risk
def activity_risk(row):
    """Classify a row as "High", "Medium" or "Low" activity risk.

    Reads `num_prev_tickets` and `avg_response_time_prev` from `row`. A tier is
    assigned as soon as either value reaches that tier's threshold, checking
    the highest tier first; rows reaching no threshold are "Low".
    """
    n_tickets = row["num_prev_tickets"]
    resp_time = row["avg_response_time_prev"]

    # (label, minimum ticket count, minimum response time), highest tier first
    tiers = (("High", 4, 36), ("Medium", 2, 24))
    for label, min_tickets, min_response in tiers:
        if n_tickets >= min_tickets or resp_time >= min_response:
            return label
    return "Low"

# Row-wise apply (axis=1) because activity_risk reads two columns of each row
df["activity_level"] = df.apply(activity_risk, axis=1)


# Preview the frame with both engineered columns added
df.head()

Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,ticket_text,urgent_flag,sentiment_score,urgent_keywords_flag,priority_binary,account_age_category,activity_level
0,T00000,C4896,2025-11-06 02:10:45,1721,1,46.88,Beauty,Delivery,Medium,Subject: Beauty item delivery issue Dear Suppo...,0,0.111111,0,0,Long,High
1,T00001,C5618,2025-10-30 19:14:40,282,1,44.63,Electronics,Refund,Low,I purchased an Electronics item on your websit...,0,-0.07,0,0,Medium,High
2,T00002,C5908,2025-11-27 21:36:16,903,0,27.51,Electronics,Other,Medium,"""I recently purchased an Electronics item from...",0,-0.033333,1,0,Long,Medium
3,T00003,C3490,2025-10-18 08:31:02,1274,5,23.47,Clothing,Delivery,High,Subject: Urgent Delivery Issue for Clothing It...,1,0.058333,1,1,Long,High
4,T00004,C7689,2025-11-15 15:26:34,1375,3,23.6,Clothing,Other,High,Subject: Urgent: Problem with Clothing item - ...,1,-0.051042,1,1,Long,Medium


# Decision Tree (Sanjida)

# Logistic Regression (Syarifah)

In [12]:
# Feature Selection & Target
# Baseline model uses only the two text-derived features.
features = ["sentiment_score", "urgent_keywords_flag"]
X = df[features]

# Binary target: High priority vs others
# (same Low/Medium -> 0, High -> 1 mapping that produced df["priority_binary"])
y = df["priority"].map({"Low": 0, "Medium": 0, "High": 1})

In [13]:
# Train-Test Split
# 80/20 split, stratified on y so both parts keep the same class proportions;
# random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Feature Scaling
# The scaler is fitted on the training part only and then applied unchanged to
# the test part, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit Logistic Regression
# Solver iterations are capped at 1000; random_state is fixed at 42.
log_reg = LogisticRegression(
    max_iter=1000,
    random_state=42
)

# Train on the scaled training features
log_reg.fit(X_train_scaled, y_train)


In [None]:
# Predictions
# Hard class labels for the test set
y_pred = log_reg.predict(X_test_scaled)
# Second column of predict_proba, i.e. the probability of class 1 (High priority)
y_proba = log_reg.predict_proba(X_test_scaled)[:, 1]


In [None]:
# Evaluation Metrics
# Accuracy and F1 use the hard labels; ROC-AUC uses the class-1 probabilities.
metrics = {
    "Model": "Logistic Regression",
    "Accuracy": accuracy_score(y_test, y_pred),
    "F1 Score": f1_score(y_test, y_pred),
    "ROC-AUC": roc_auc_score(y_test, y_proba),
}

# Keep the individual scores available under their own names as well
accuracy = metrics["Accuracy"]
f1 = metrics["F1 Score"]
roc_auc = metrics["ROC-AUC"]

metrics


{'Model': 'Logistic Regression',
 'Accuracy': 0.525,
 'F1 Score': 0.40993788819875776,
 'ROC-AUC': np.float64(0.5183159373745484)}

In [None]:
# Classification Report
# Per-class precision / recall / F1 and support on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.68      0.60       106
           1       0.49      0.35      0.41        94

    accuracy                           0.53       200
   macro avg       0.52      0.52      0.51       200
weighted avg       0.52      0.53      0.51       200



### Logistic Regression Results

The Logistic Regression model performed at roughly chance level: its accuracy of 52.5% is marginally below the 53% obtained by always predicting the majority class (106 of the 200 test tickets), and its ROC-AUC of 0.518 is only slightly above the 0.5 random baseline. This indicates that sentiment polarity and the urgent-keyword flag, on their own, capture very little signal for predicting high-priority support tickets.

As this model serves as a baseline, the results highlight the need for more advanced features and models to improve predictive performance in subsequent experiments.

In [None]:
# Saving Metrics (for report & GitHub)

# One-row frame built from the metrics dict, written without the index column
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv("logistic_regression_metrics.csv", index=False)

metrics_df


Unnamed: 0,Model,Accuracy,F1 Score,ROC-AUC
0,Logistic Regression,0.525,0.409938,0.518316


# Random Forest (Bushra)

# XGBoost (Adhia)

In [17]:
# Encode categorical engineered features for XGBoost
# One-hot encode the two engineered categoricals. With drop_first=True the first
# level of each is dropped, leaving the *_Medium / *_New and *_Low / *_Medium
# columns visible in the preview of df_encoded.

df_encoded = pd.get_dummies(
    df,
    columns=["account_age_category", "activity_level"],
    drop_first=True
)

In [19]:
# Preview the encoded frame, including the new dummy columns
df_encoded.head()

Unnamed: 0,ticket_id,customer_id,ticket_date,account_age_days,num_prev_tickets,avg_response_time_prev,product_category,issue_type,priority,ticket_text,urgent_flag,sentiment_score,urgent_keywords_flag,priority_binary,account_age_category_Medium,account_age_category_New,activity_level_Low,activity_level_Medium
0,T00000,C4896,2025-11-06 02:10:45,1721,1,46.88,Beauty,Delivery,Medium,Subject: Beauty item delivery issue Dear Suppo...,0,0.111111,0,0,False,False,False,False
1,T00001,C5618,2025-10-30 19:14:40,282,1,44.63,Electronics,Refund,Low,I purchased an Electronics item on your websit...,0,-0.07,0,0,True,False,False,False
2,T00002,C5908,2025-11-27 21:36:16,903,0,27.51,Electronics,Other,Medium,"""I recently purchased an Electronics item from...",0,-0.033333,1,0,False,False,False,True
3,T00003,C3490,2025-10-18 08:31:02,1274,5,23.47,Clothing,Delivery,High,Subject: Urgent Delivery Issue for Clothing It...,1,0.058333,1,1,False,False,False,False
4,T00004,C7689,2025-11-15 15:26:34,1375,3,23.6,Clothing,Other,High,Subject: Urgent: Problem with Clothing item - ...,1,-0.051042,1,1,False,False,False,True


In [20]:
# Feature selection

# Numeric and text-derived features
xgb_base_features = [
    "sentiment_score",
    "urgent_keywords_flag",
    "num_prev_tickets",
    "avg_response_time_prev",
    "account_age_days",
]

# Every dummy column produced for the two engineered categoricals. Selecting
# only the *_Medium columns would make "New" accounts and "Low" activity
# indistinguishable from the dropped reference levels.
xgb_dummy_features = [
    col for col in df_encoded.columns
    if col.startswith(("account_age_category_", "activity_level_"))
]

xgb_features = xgb_base_features + xgb_dummy_features

X_xgb = df_encoded[xgb_features]
# NOTE(review): this target is urgent_flag, whereas the logistic-regression cells
# predict the High-priority label; confirm which target the comparison should use.
y_xgb = df_encoded["urgent_flag"]

In [1]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Gradient-boosted tree classifier: 200 trees of depth <= 5, learning rate 0.1,
# with 80% row subsampling and 80% column subsampling per tree; seed fixed at 42.
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [14]:
# Fit on the XGBoost feature set (X_xgb / y_xgb), not on the logistic-regression
# split X_train / y_train, which holds only two features and the priority target.
# Same 80/20 stratified split settings as the logistic-regression cell.
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X_xgb, y_xgb,
    test_size=0.2,
    random_state=42,
    stratify=y_xgb
)

xgb_model.fit(X_train_xgb, y_train_xgb)

# Hard labels and class-1 probabilities; evaluate these against y_test_xgb
y_pred_xgb = xgb_model.predict(X_test_xgb)

y_prob_xgb = xgb_model.predict_proba(X_test_xgb)[:, 1]

# k-NN (Hani)

# Model Comparison (Adhia)

# LLM Interpretation & Insights (Hani)