# Customer Support Chatbot — Notebook

This notebook documents the whole workflow: data loading, preprocessing, training, evaluation, and a small demo.

## 1. Environment & quick checks

In [None]:
import os
import sys

# Only the first line of sys.version is shown; the remainder is build detail.
python_banner = sys.version.splitlines()[0]
print('Python:', python_banner)

# The deep-learning stack is optional in this cell: a failed import is
# reported and the cell keeps going so the remaining versions still print.
try:
    import tensorflow as tf
    tf_version = tf.__version__
    print('TensorFlow:', tf_version)
except Exception as tf_err:
    print('TensorFlow import error:', tf_err)

try:
    import transformers
    hf_version = transformers.__version__
    print('transformers:', hf_version)
except Exception as hf_err:
    print('transformers import error:', hf_err)

# pandas is imported unguarded, so a missing install stops the cell here.
import pandas as pd
pandas_version = pd.__version__
print('pandas:', pandas_version)

## 2. Load dataset

Make sure your CSV is at `data/customer_support_data.csv`. The notebook will show a few rows. If your CSV has a different name, update the path below.

In [None]:
# Path to the raw support-conversation CSV; edit this if your file lives elsewhere.
DATA_CSV = 'data/customer_support_data.csv'

import os
import pandas as pd

# Validate with an explicit exception instead of `assert`: assertions are
# removed under `python -O` and are meant for internal invariants, not for
# checking user-supplied paths. isfile() also rejects a directory of that name.
if not os.path.isfile(DATA_CSV):
    raise FileNotFoundError(f"Dataset file not found: {DATA_CSV}")

# Read every column as text. keep_default_na=False stops pandas from turning
# literal message text such as "NA", "null" or "None" into missing values;
# fillna('') still blanks any cells that are missing (e.g. short rows).
df = pd.read_csv(
    DATA_CSV,
    encoding='utf-8',
    dtype=str,
    keep_default_na=False,
).fillna('')
df.head(20)

## 3. Preprocessing

We will try to use the project's `src/data_prep.py` utilities if present; otherwise run a small fallback cleaning here.

In [None]:
# Prefer the project's own preprocessing module when it is present on disk.
USE_SRC = os.path.exists('src/data_prep.py')
print('src/data_prep.py exists:', USE_SRC)

if USE_SRC:
    from src.data_prep import load_and_prepare
    # NOTE(review): the four-value return shape is taken from this call site
    # only - confirm it against src/data_prep.py.
    df_preview, encodings, decodings, tokenizer = load_and_prepare(DATA_CSV)
    print('Loaded and tokenized via src.data_prep.load_and_prepare')
    print('Sample inputs from df_preview:')
    display(df_preview.head(10))
else:
    # Fallback: build simple input_text and target_text columns on df.
    # Check the expected columns up front so a differently-shaped CSV fails
    # with a message naming what is missing, not a bare KeyError on one name.
    required_cols = ('context', 'user', 'agent')
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"{DATA_CSV} is missing required column(s) {missing_cols}; "
            f"found columns: {list(df.columns)}"
        )
    df['input_text'] = ('context: ' + df['context'].astype(str) + ' user: ' + df['user'].astype(str)).str.strip()
    df['target_text'] = df['agent'].astype(str)
    print('Built input_text and target_text columns in-place')
    display(df.head(10))

## 4. Fine-tune the model

This notebook includes an optional training cell. Training T5 locally can be slow. If you already trained using `src/train.py`, skip this cell.

In [None]:
# Optional training cell - commented out by default. Uncomment the lines below to run it.
# from transformers import T5Tokenizer, TFT5ForConditionalGeneration
# import tensorflow as tf
#
# MODEL_NAME = 't5-small'
# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# model = TFT5ForConditionalGeneration.from_pretrained(MODEL_NAME)
#
# # Prepare small dataset example (use df)
# inputs = tokenizer((df['input_text'][:200].tolist()), truncation=True, padding='max_length', max_length=128, return_tensors='tf')
# targets = tokenizer((df['target_text'][:200].tolist()), truncation=True, padding='max_length', max_length=64, return_tensors='tf')
#
# dataset = tf.data.Dataset.from_tensor_slices(({'input_ids':inputs['input_ids'],'attention_mask':inputs['attention_mask']}, targets['input_ids'])).batch(4)
# optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# model.compile(optimizer=optimizer, loss=model.compute_loss)
# model.fit(dataset, epochs=1)
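# # Save the model and tokenizer if desired.
# # Note: this writes to ./checkpoints_notebook, while section 5 loads from 'checkpoints';
# # change one of the two paths if you want section 5 to pick up this model.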
# model.save_pretrained('./checkpoints_notebook')
# tokenizer.save_pretrained('./checkpoints_notebook')


## 5. Load fine-tuned checkpoint (or pre-trained model)

If you trained earlier and saved the model to `checkpoints/`, we'll load it from there. Otherwise we load the base `t5-small` model.

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

CKPT_DIR = 'checkpoints'

# Treat the checkpoint as usable only if it contains config.json, which
# save_pretrained() writes. A bare existence check would also accept an empty
# directory or a stray file named 'checkpoints' and then crash inside
# from_pretrained() instead of falling back to the base model.
if os.path.isfile(os.path.join(CKPT_DIR, 'config.json')):
    print('Loading model from', CKPT_DIR)
    model_source = CKPT_DIR
else:
    print('No checkpoints found; loading base t5-small from Hugging Face')
    model_source = 't5-small'

# Tokenizer and model are loaded from the same source so they stay in sync.
tokenizer = T5Tokenizer.from_pretrained(model_source)
model = TFT5ForConditionalGeneration.from_pretrained(model_source)

print('Model and tokenizer ready')

## 6. Sample inference

Run a few examples to see model outputs.

In [None]:
def generate_reply(text, max_length=64, num_beams=4):
    """Generate a reply for ``text`` with the notebook's loaded model.

    Uses the module-level ``tokenizer`` and ``model`` created by the
    checkpoint-loading cell, so that cell must have been run first.

    Args:
        text: Prompt string, tokenized as given; callers add any task prefix.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``. Defaults
            to 4, the value that was previously hard-coded.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens skipped.
    """
    inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True)
    gen = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=num_beams,
    )
    # generate() returns a batch; the input is a single prompt, so take row 0.
    return tokenizer.decode(gen[0], skip_special_tokens=True)

# Smoke-test the generator on a handful of typical support messages.
examples = [
    'Hi, I need help with my order.',
    'How can I return a product?',
    'My package arrived damaged.',
]
for prompt in examples:
    print('>>', prompt)
    # Each message is sent with the 'question: ' prefix.
    reply = generate_reply('question: ' + prompt)
    print('->', reply)
    print('')

## 7. Evaluation (BLEU & simple exact-match)

This computes a quick corpus-level BLEU score and exact-match accuracy on a held-out 20% validation split. For more reliable metrics, use sacrebleu or ROUGE in the scripts.

In [None]:
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu

# Build the text columns here if an earlier cell did not add them to df.
if 'input_text' not in df.columns:
    df['input_text'] = ('context: ' + df['context'].astype(str) + ' user: ' + df['user'].astype(str)).str.strip()
    df['target_text'] = df['agent'].astype(str)

# Fixed seed so the validation split is the same on every run.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# preds[i] holds the hypothesis tokens for row i; refs[i] is a one-element
# list holding the reference tokens, the nesting corpus_bleu expects.
preds = []
refs = []
for src, tgt in zip(val_df['input_text'], val_df['target_text']):
    # NOTE(review): 'question: ' is prepended to inputs that already start
    # with 'context: ' - confirm this matches the format used in training.
    out = generate_reply('question: ' + src, max_length=64)
    preds.append(out.split())
    refs.append([tgt.split()])

# corpus_bleu expects a list of references per sentence
bleu_score = corpus_bleu(refs, preds) if preds else 0.0

# Simple exact-match accuracy on whitespace-split tokens. The earlier version
# compared a joined prediction string with the reference *token list* and
# called .strip() on that list, which raised AttributeError as soon as there
# was one prediction. Comparing token lists gives the intended
# whitespace-insensitive match.
exact = sum(
    1
    for pred_tokens, ref_group in zip(preds, refs)
    if pred_tokens == ref_group[0]
)
acc = exact / len(preds) if preds else 0.0

print('BLEU (corpus):', bleu_score)
print('Exact-match accuracy:', acc)

## 8. Save sample predictions (for the report)

We'll save a small CSV of predictions vs references to `notebooks/predictions_sample.csv`.

In [None]:
import csv

# Generate a prediction for each of the first 50 validation rows and keep it
# next to its input and reference so the three can be compared side by side.
sample_df = val_df.head(50)
out_rows = [
    {
        'input': src,
        'reference': tgt,
        'prediction': generate_reply('question: ' + src, max_length=64),
    }
    for src, tgt in zip(sample_df['input_text'], sample_df['target_text'])
]

# Create the output folder on first use, then write one CSV row per record.
os.makedirs('notebooks', exist_ok=True)
with open('notebooks/predictions_sample.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['input', 'reference', 'prediction'])
    writer.writeheader()
    for record in out_rows:
        writer.writerow(record)
print('Saved sample predictions to notebooks/predictions_sample.csv')