# Ticket Classification — Supervised Models
This notebook demonstrates how to classify support tickets using traditional and transformer-based models.

### Available Options:
- Logistic Regression with TF-IDF
- XLM-RoBERTa (via Hugging Face Transformers)

_All input goes through privacy filtering and optional augmentation._

## Environment Setup and Imports

In [None]:
# === Optional: Install Dependencies (if not already installed) ===
# %pip install pandas numpy matplotlib seaborn scikit-learn plotly wordcloud torch transformers datasets

In [None]:
# === General Purpose ===
import inspect
import os

import numpy as np
import pandas as pd

# === Visualization ===
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from wordcloud import WordCloud

# === Scikit-learn ===
from sklearn.calibration import calibration_curve
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score, accuracy_score,
    roc_curve, auc, precision_recall_curve
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelEncoder

# === Hugging Face Transformers ===
import torch
from datasets import Dataset
from transformers import (
    AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding, EarlyStoppingCallback
)

# === Miscellaneous ===
os.environ["TOKENIZERS_PARALLELISM"] = "false"


## Configuration

In [None]:
# --- Run-wide settings; later cells read these names directly ---
DATA_PATH = "cleaned.csv"          # CSV read by the data-loading cell
MODEL_NAME = "xlm-roberta-base"    # checkpoint id passed to the Hugging Face loaders
MAX_LENGTH = 256                   # truncation length handed to the tokenizer
BATCH_SIZE = 16                    # per-device train and eval batch size
TRAIN_EPOCHS = 10                  # upper bound on training epochs
USE_TFIDF_MODEL = True  # Set False to skip logistic regression
# NOTE(review): no cell visible in this notebook reads FAST_RUN — confirm whether it is still needed.
FAST_RUN = False

# --- Model configuration: checkpoint defaults with both dropout rates set to 0.3 ---
config = AutoConfig.from_pretrained(
    MODEL_NAME,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)

## Load and Prepare Data

In [None]:
# Read the ticket table from DATA_PATH.
# (The file is expected to be privacy-filtered, and optionally augmented, beforehand.)
df = pd.read_csv(DATA_PATH, sep=',')

# Target is the issue-type column; features are the combined ticket text cast to str.
y = df["Issue Type"]
X = df["combined_text"].astype(str)

# Stratified 80/20 hold-out split with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Train/Test Split (for Augmented Datasets)

If your dataset already includes a `split` column (e.g., from backtranslation or manual splitting), use the code below to separate training and testing data accordingly.


In [None]:
# Optional override of the random split: when the dataset carries a pre-defined
# 'split' column, rebuild the train/test sets from its 'train' / 'test' rows.
# Without the column this cell is a no-op, so "Run All" keeps the stratified
# split created in the previous cell instead of failing with a KeyError.
if 'split' in df.columns:
    train_df = df[df['split'] == 'train']
    test_df = df[df['split'] == 'test']

    X_train = train_df["combined_text"].astype(str)
    y_train = train_df["Issue Type"]

    X_test = test_df["combined_text"].astype(str)
    y_test = test_df["Issue Type"]

    print(f"Using pre-defined split: {len(train_df)} train rows, {len(test_df)} test rows.")
else:
    print("No 'split' column found; keeping the stratified random split.")


## Word Cloud Shape (Optional)

This creates a circular mask for the word cloud. You can later replace it with a custom image if needed.


In [None]:
# Build a 400x400 uint8 mask: 0 inside the centred circle, 255 outside it.
# (Presumably WordCloud leaves the 255 region empty — confirm against its docs.)
h, w = 400, 400
center_y, center_x = h // 2, w // 2
radius = w // 2

# Open row/column grids broadcast to an (h, w) array of squared distances
# from the centre pixel.
sy, sx = np.ogrid[:h, :w]
dist_sq = (sx - center_x) ** 2 + (sy - center_y) ** 2

# Pixels strictly beyond the radius become 255; everything else stays 0.
circle = dist_sq > radius ** 2
mask = np.where(circle, 255, 0).astype(np.uint8)


## Label Distribution Check

Quickly verify how labels are distributed across the training and test sets.


In [None]:
# Tally how often each label occurs in the training and the test targets.
train_label_counts, test_label_counts = (
    labels.value_counts() for labels in (y_train, y_test)
)

# Print each tally under its heading (the second heading starts with a blank line).
for heading, counts in (
    ("Training Set Label Counts:", train_label_counts),
    ("\nTest Set Label Counts:", test_label_counts),
):
    print(heading)
    print(counts)


## TF-IDF + Logistic Regression Baseline

This baseline uses a traditional machine learning pipeline:

- **TF-IDF vectorization** on Dutch ticket text (with unigrams and bigrams)
- **Logistic Regression** with class weighting to handle label imbalance

Use this as a benchmark for comparing with transformer-based models.


In [None]:
if USE_TFIDF_MODEL:
    print("Training TF-IDF + Logistic Regression baseline...")

    # Vectorize input text using unigrams + bigrams; the vocabulary is learned
    # on the training texts only and then applied to the test texts.
    tfidf = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # Fit logistic regression with class balancing
    clf = LogisticRegression(
        max_iter=5000,
        class_weight='balanced',
        C=4
    )
    clf.fit(X_train_tfidf, y_train)

    # Predict and evaluate
    y_pred_tfidf = clf.predict(X_test_tfidf)

    print("\nTF-IDF Model Evaluation:")
    print(classification_report(y_test, y_pred_tfidf))

    # Plot confusion matrix. Rows/columns are pinned to clf.classes_ and the
    # ticks show the class names, so the figure can be read without knowing
    # the internal label ordering.
    sns.heatmap(
        confusion_matrix(y_test, y_pred_tfidf, labels=clf.classes_),
        annot=True, fmt='d', cmap='Blues',
        xticklabels=clf.classes_, yticklabels=clf.classes_
    )
    plt.title("TF-IDF + Logistic Regression Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()


## TF-IDF Model Visualizations

This section provides a deeper analysis of the TF-IDF + Logistic Regression model:

- ROC and Precision-Recall curves (multiclass)
- Class-specific feature importance (top TF-IDF coefficients)
- Word clouds per class
- Calibration curves to evaluate probability reliability


### ROC and Precision-Recall Curves (TF-IDF Model)


In [None]:
# The curves need the fitted baseline, so the cell is skipped when the
# TF-IDF model is disabled in the configuration cell.
if USE_TFIDF_MODEL:
    # Binarize y for one-vs-rest evaluation; columns follow clf.classes_,
    # which is also the column order of decision_function.
    # NOTE(review): the [:, i] indexing below assumes three or more classes — confirm.
    classes = clf.classes_
    y_test_bin = label_binarize(y_test, classes=classes)
    y_score = clf.decision_function(X_test_tfidf)

    # === Multiclass ROC ===
    plt.figure(figsize=(8, 6))
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_score[:, i])
        plt.plot(fpr, tpr, label=f"{cls} (AUC={auc(fpr, tpr):.2f})")
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Multiclass ROC Curves')
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()

    # === Multiclass Precision-Recall ===
    # The legend value is the trapezoidal area under the PR curve, so it is
    # labelled "PR AUC" rather than "AP" (average precision is a different metric).
    plt.figure(figsize=(8, 6))
    for i, cls in enumerate(classes):
        precision, recall, _ = precision_recall_curve(y_test_bin[:, i], y_score[:, i])
        plt.plot(recall, precision, label=f"{cls} (PR AUC={auc(recall, precision):.2f})")
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Multiclass Precision-Recall Curves')
    plt.legend(loc='lower left')
    plt.tight_layout()
    plt.show()
else:
    print("USE_TFIDF_MODEL is False; skipping ROC / Precision-Recall curves.")


### Feature Importance and Word Clouds (TF-IDF Model)


In [None]:
# Visualize most influential TF-IDF features per class.
# Requires the fitted vectorizer/classifier, so the cell is skipped when the
# TF-IDF model is disabled in the configuration cell.
if USE_TFIDF_MODEL:
    features = tfidf.get_feature_names_out()
    coefs = clf.coef_

    for i, cls in enumerate(classes):
        # Row i of coef_ holds the per-feature weights for class i.
        imp_df = pd.DataFrame({'feature': features, 'coef': coefs[i]})
        top_pos = imp_df.nlargest(10, 'coef')
        top_neg = imp_df.nsmallest(10, 'coef')

        # === Barplot ===
        plt.figure(figsize=(8, 5))
        sns.barplot(
            data=pd.concat([top_pos, top_neg]),
            x='coef', y='feature'
        )
        plt.title(f"{cls} — Top Positive/Negative TF-IDF Coefficients")
        plt.tight_layout()
        plt.show()

        # === Word Cloud (Positive Features) ===
        # The cloud is drawn from the ten most positive features joined into one string.
        text = ' '.join(top_pos['feature'])
        wc = WordCloud(
            width=400, height=200,
            background_color='white',
            mask=mask
        ).generate(text)
        plt.figure(figsize=(6, 3))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"{cls} — Word Cloud (Positive Features)")
        plt.tight_layout()
        plt.show()
else:
    print("USE_TFIDF_MODEL is False; skipping feature-importance plots.")


### Calibration Curves (TF-IDF Model)


In [None]:
# Calibration of the TF-IDF baseline, one curve per class (one-vs-rest view).
# Skipped when the TF-IDF model is disabled in the configuration cell.
if USE_TFIDF_MODEL:
    # Use the classifier's own predicted probabilities. A per-class sigmoid of
    # the decision scores is not what a multinomial logistic regression
    # predicts, so it would measure the calibration of the wrong quantity.
    # predict_proba columns follow clf.classes_, matching `classes` / `y_test_bin`.
    proba_tfidf = clf.predict_proba(X_test_tfidf)

    plt.figure(figsize=(10, 8))
    for i, cls in enumerate(classes):
        frac_pos, mean_pred_val = calibration_curve(
            y_test_bin[:, i], proba_tfidf[:, i], n_bins=10
        )
        plt.plot(mean_pred_val, frac_pos, "s-", label=f"{cls}")

    plt.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
    plt.xlabel("Mean Predicted Probability")
    plt.ylabel("Fraction of Positives")
    plt.title("Calibration Curves per Class")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()
else:
    print("USE_TFIDF_MODEL is False; skipping calibration curves.")


## Visualizing TF-IDF Model in 3D

This section reduces the high-dimensional TF-IDF features to 3 components using TruncatedSVD, and visualizes:

- Class distributions in reduced space
- Logistic regression decision surfaces (predicted probabilities)

>**Note:** This is a projection of the original feature space. While useful for interpretation, it does **not fully represent** how the classifier behaves in the full-dimensional space.



In [None]:
# The projection is computed from the TF-IDF test matrix, so the cell is
# skipped when the TF-IDF model is disabled in the configuration cell.
if USE_TFIDF_MODEL:
    # === Dimensionality Reduction ===
    # Project the test-set TF-IDF matrix onto 3 SVD components (visualisation only).
    svd = TruncatedSVD(n_components=3, random_state=42)
    X_vis_3d = svd.fit_transform(X_test_tfidf)

    # === Train Logistic Regression on 3D projection ===
    # A separate, illustrative model fitted on the projected test points; it is
    # not the `clf` baseline. The explicit `multi_class` argument is omitted:
    # recent scikit-learn releases deprecate it, and the lbfgs solver already
    # fits a multinomial model for multiclass targets.
    clf_3d = LogisticRegression(
        solver='lbfgs',
        max_iter=10000
    )
    clf_3d.fit(X_vis_3d, y_test)

    # === Label encoding for coloring ===
    label_encoder = LabelEncoder().fit(y_test)
    y_encoded = label_encoder.transform(y_test)
    class_names = label_encoder.classes_
    colors = px.colors.qualitative.Light24

    # === Plot setup ===
    fig = go.Figure()

    # Plot points per class
    for i, class_name in enumerate(class_names):
        idx = (y_encoded == i)
        fig.add_trace(go.Scatter3d(
            x=X_vis_3d[idx, 0], y=X_vis_3d[idx, 1], z=X_vis_3d[idx, 2],
            mode='markers',
            name=class_name,
            marker=dict(size=3, color=colors[i % len(colors)], opacity=0.9)
        ))

    # Create a 30x30 grid over components 1 and 2; component 3 is held at zero.
    x_range = np.linspace(X_vis_3d[:, 0].min(), X_vis_3d[:, 0].max(), 30)
    y_range = np.linspace(X_vis_3d[:, 1].min(), X_vis_3d[:, 1].max(), 30)
    xx, yy = np.meshgrid(x_range, y_range)
    zz_base = np.zeros_like(xx)
    grid_flat = np.c_[xx.ravel(), yy.ravel()]
    grid_3d = np.c_[grid_flat, zz_base.ravel()]

    # Add predicted probability surfaces (1 per class). The grid predictions do
    # not depend on the class, so they are computed once outside the loop.
    # Surfaces start hidden ("legendonly") and can be toggled from the legend.
    grid_probs = clf_3d.predict_proba(grid_3d)
    for i in range(len(class_names)):
        probs = grid_probs[:, i].reshape(xx.shape)
        fig.add_trace(go.Surface(
            x=xx, y=yy, z=probs,
            opacity=0.4,
            colorscale='Viridis',
            name=f"Prob Surface: {class_names[i]}",
            showlegend=True,
            visible="legendonly",
            colorbar=dict(x=0.9)
        ))

    # Layout and export
    fig.update_layout(
        title="3D Probability Surfaces (TF-IDF + Logistic Regression)",
        scene=dict(
            xaxis_title="SVD Component 1",
            yaxis_title="SVD Component 2",
            zaxis_title="Predicted Probability",
            aspectmode='manual',
            aspectratio=dict(x=1, y=1, z=0.8)
        ),
        legend=dict(
            font=dict(size=10),
            itemsizing='constant',
            itemdoubleclick="toggle"
        )
    )

    fig.write_html("tfidf_logreg_3d_decision_planes.html")
    fig.show()
else:
    print("USE_TFIDF_MODEL is False; skipping 3D visualization.")


## Transformer-Based Classification (XLM-R)

This section uses the multilingual **XLM-RoBERTa** model (`xlm-roberta-base`) for fine-tuning on support ticket data.

Key steps:
- Tokenization and padding using Hugging Face's `AutoTokenizer`
- Fine-tuning with `Trainer` and early stopping
- Evaluation on the same test split used for the TF-IDF baseline


### Data Preparation

In [None]:
# TOKENIZERS_PARALLELISM is already set to "false" in the imports cell, so it
# is not set again here.
print("Preparing data for transformer model...")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Map labels to integers (ids follow the sorted order of the labels in `y`)
label2id = {label: i for i, label in enumerate(sorted(y.unique()))}
id2label = {i: label for label, i in label2id.items()}

# Prepare DataFrames for Hugging Face Datasets
df_train = pd.DataFrame({"text": X_train, "label": y_train.map(label2id)})
df_test = pd.DataFrame({"text": X_test, "label": y_test.map(label2id)})

# preserve_index=False keeps the pandas index from becoming an extra dataset column.
ds_train = Dataset.from_pandas(df_train, preserve_index=False)
ds_test = Dataset.from_pandas(df_test, preserve_index=False)


# Tokenization function
def tokenize(batch):
    """Tokenize a batch of examples, truncating each text to MAX_LENGTH tokens.

    No padding is applied here: the Trainer cell uses DataCollatorWithPadding,
    which pads each training/eval batch to its own longest sequence. Padding
    inside `map` would instead pad every mapped batch to its longest text and
    store those pad tokens in the dataset.

    Args:
        batch: mapping with a "text" entry holding a list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)


# Apply tokenization
ds_train = ds_train.map(tokenize, batched=True)
ds_test = ds_test.map(tokenize, batched=True)


### Model and Metric Setup

In [None]:
# Load the checkpoint with a classification head sized to the label set.
# The dropout values prepared on `config` in the Configuration cell are
# forwarded as keyword overrides; otherwise the model is built with the
# checkpoint's default dropout and those settings have no effect.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    hidden_dropout_prob=config.hidden_dropout_prob,
    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
)

# Evaluation callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute macro-F1 and accuracy from a (logits, labels) pair.

    Args:
        eval_pred: two-item sequence unpacked as (logits, labels); the
            predicted class is the argmax of the logits along axis 1.

    Returns:
        dict with keys "f1" (macro-averaged F1) and "accuracy".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    macro_f1 = f1_score(gold, predicted, average='macro')
    acc = accuracy_score(gold, predicted)
    return {"f1": macro_f1, "accuracy": acc}


### Trainer Setup

In [None]:
# Two keyword names differ between transformers releases:
#   TrainingArguments: `evaluation_strategy` (older) vs `eval_strategy` (newer)
#   Trainer:           `tokenizer` (older)           vs `processing_class` (newer)
# Pick whichever spelling the installed version accepts so the cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

# Define training hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    seed=42,
    num_train_epochs=TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    save_strategy="epoch",          # must match the evaluation strategy for load_best_model_at_end
    save_total_limit=1,
    load_best_model_at_end=True,
    logging_steps=10,
    report_to="none",
    metric_for_best_model="f1",     # key returned by compute_metrics
    greater_is_better=True,
    weight_decay=0.01,
    **{_eval_strategy_key: "epoch"},
)

# Initialize trainer
# NOTE(review): eval_dataset is the test split, so early stopping and
# best-checkpoint selection are driven by the same data that is later reported
# as test performance. Consider carving a validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    data_collator=DataCollatorWithPadding(tokenizer),
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_key: tokenizer},
)


## Train and Evaluate XLM-R

After defining the model and trainer, we now:

- Fine-tune the transformer on the training set
- Evaluate predictions on the test set
- Visualize model performance with a confusion matrix and loss curves


### Training + Evaluation + Confusion Matrix

In [None]:
print("Training transformer model...")
trainer.train()

print("\nEvaluating transformer model...")
preds = trainer.predict(ds_test)
y_pred = np.argmax(preds.predictions, axis=1)

# Fix the label order explicitly. Without `labels`, classification_report
# infers the classes from the data and raises when a class is missing from
# both the test labels and the predictions, because `target_names` then has
# the wrong length.
label_ids = list(range(len(id2label)))
label_names = [id2label[i] for i in label_ids]

# Print classification metrics
print(classification_report(
    df_test['label'],
    y_pred,
    labels=label_ids,
    target_names=label_names
))

# Plot confusion matrix with class names on both axes
sns.heatmap(
    confusion_matrix(df_test['label'], y_pred, labels=label_ids),
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=label_names,
    yticklabels=label_names
)
plt.title("XLM-R Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.tight_layout()
plt.show()


### Training and Validation Loss Plot

In [None]:
# Turn the Trainer's log history (a list of dicts) into a table.
log_df = pd.DataFrame(trainer.state.log_history)

# For each loss column keep the rows where it was logged, then take the last
# logged value for every distinct `epoch` value.
train_loss_df = (
    log_df.loc[:, ['epoch', 'loss']]
    .dropna()
    .groupby('epoch')
    .last()
    .reset_index()
)
val_loss_df = (
    log_df.loc[:, ['epoch', 'eval_loss']]
    .dropna()
    .groupby('epoch')
    .last()
    .reset_index()
)

# Draw both curves on one set of axes.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(train_loss_df['epoch'], train_loss_df['loss'], label='Training Loss', marker='o')
ax.plot(val_loss_df['epoch'], val_loss_df['eval_loss'], label='Validation Loss', marker='o')
ax.set_title("Training and Validation Loss (XLM-R)")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


## Extra Visuals: XLM-R Model

The following cells provide deeper insight into the transformer model:

1. Embedding projection with t-SNE
2. Calibration curves per class
3. Most confident misclassifications
4. Word clouds per class (based on raw token frequency)

> Note: Embedding plots are based on a reduced-dimensional view of transformer output and may not fully represent real model decision boundaries.


### Embedding Projection (t-SNE)

In [None]:
# Generate one embedding per test text from the encoder (classification head excluded).
# The texts are encoded in chunks of BATCH_SIZE so the whole test set never
# has to fit through the model in a single forward pass.
model.eval()  # make sure dropout is disabled while extracting embeddings
texts = df_test['text'].tolist()
embed_chunks = []

with torch.no_grad():
    for start in range(0, len(texts), BATCH_SIZE):
        enc2 = tokenizer(
            texts[start:start + BATCH_SIZE],
            return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH
        )
        enc2 = {k: v.to(model.device) for k, v in enc2.items()}
        out2 = model.base_model(**enc2)

        # Prefer the encoder's pooled output when it provides one.
        pooled = getattr(out2, 'pooler_output', None)
        if pooled is None:
            # Mean-pool the last hidden state over real tokens only: padding
            # positions are zeroed via the attention mask and excluded from
            # the token count, so the result does not depend on padding length.
            hidden = out2.last_hidden_state
            token_mask = enc2['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            pooled = (hidden * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)

        embed_chunks.append(pooled.cpu().numpy())

embeds = np.concatenate(embed_chunks, axis=0)

# Project with t-SNE. Perplexity must be smaller than the number of samples,
# so the default of 30 is reduced for very small test sets.
tsne = TSNE(
    n_components=2,
    random_state=42,
    perplexity=min(30.0, max(1.0, len(embeds) - 1))
)
proj = tsne.fit_transform(embeds)

# Plot
plt.figure(figsize=(6, 6))
sns.scatterplot(x=proj[:, 0], y=proj[:, 1], hue=df_test['label'].map(id2label), palette='tab10', alpha=0.7)
plt.title('t-SNE of XLM-R Embeddings')
plt.tight_layout()
plt.show()


### Calibration Curves

In [None]:
# Softmax over the last axis turns the prediction logits into class probabilities.
logits_tensor = torch.tensor(preds.predictions)
probs = torch.softmax(logits_tensor, dim=-1).numpy()

# One calibration curve per class id, treating that class as the positive label.
fig, ax = plt.subplots(figsize=(6, 4))
for class_id in range(len(label2id)):
    true_labels = df_test['label'].eq(class_id).astype(int)
    class_probs = probs[:, class_id]
    prob_true, prob_pred = calibration_curve(true_labels, class_probs, n_bins=10)
    ax.plot(prob_pred, prob_true, marker='o', label=id2label[class_id])

# Dashed grey diagonal as the reference line.
ax.plot([0, 1], [0, 1], '--', color='gray')
ax.set_title('Calibration Curves per Class')
ax.set_xlabel('Mean Predicted Probability')
ax.set_ylabel('Fraction of Positives')
ax.legend()
fig.tight_layout()
plt.show()


### Top Misclassifications

In [None]:
# Confidence of a prediction = the largest class probability in its row.
conf = probs.max(axis=1)

# Positions where the predicted id differs from the true id, reordered by
# descending confidence and cut to the first five.
mis_idx = np.flatnonzero(y_pred != df_test['label'].values)
by_confidence = np.argsort(-conf[mis_idx])
top_mis = mis_idx[by_confidence][:5]

# Collect text, true/predicted label names and confidence for those rows.
true_ids = df_test['label'].iloc[top_mis]
mis_df = pd.DataFrame({
    'text': df_test['text'].iloc[top_mis].values,
    'true': [id2label[i] for i in true_ids],
    'pred': [id2label[i] for i in y_pred[top_mis]],
    'conf': conf[top_mis]
})

display(mis_df)


### Word Clouds per Class

In [None]:
# One word cloud per class, built from the test texts whose true label is that
# class; classes with no test texts are skipped.
for class_id, label_str in id2label.items():
    is_class = df_test['label'] == class_id
    texts = df_test.loc[is_class, 'text']

    if not texts.empty:
        combined = " ".join(texts.tolist())
        cloud = WordCloud(width=400, height=200, background_color='white')
        wc = cloud.generate(combined)

        plt.figure(figsize=(6, 3))
        plt.imshow(wc, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"Word Cloud for Class: {label_str}")
        plt.tight_layout()
        plt.show()
