# Core Topics AI LLM assignment

### Imports and loading the data

In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Added import after presentation to improve generative performance of the model
from transformers import pipeline


# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Read the CSV export into a DataFrame.
csv_file_path = "formatted_llm_data_cleaned_v2.csv"
data = pd.read_csv(filepath_or_buffer=csv_file_path)

# Show which distinct values the "Role" column contains.
print(data["Role"].unique())

### Labeling

In [None]:
# Normalise the raw role spellings to the display names used as class labels.
data["Role"] = data["Role"].replace({
    "end_user": "End-User",
    "admin": "Sales Admin",
})

# Check the unique values to verify
print(data["Role"].unique())

# Label mapping (role name -> class id) and its inverse (class id -> role name).
# The inverse is derived from the forward mapping so the two cannot drift apart.
label_mapping = {"End-User": 0, "Sales Admin": 1}
label_mapping_inv = {class_id: role for role, class_id in label_mapping.items()}

data["labels"] = data["Role"].map(label_mapping)

# Series.map leaves NaN for every role that is not a key of label_mapping.
# Fail fast with the offending values instead of carrying NaN labels forward.
unmapped_roles = data.loc[data["labels"].isna(), "Role"].unique()
if len(unmapped_roles) > 0:
    raise ValueError(f"Unmapped Role values: {unmapped_roles.tolist()}")

### Split and tokenizer

In [None]:
# Build one "Question: ... Answer: ..." string per row.
combined_texts = [
    f"Question: {question_text} Answer: {answer_text}"
    for question_text, answer_text in zip(data["Question"], data["Answer"])
]
label_ids = data["labels"].tolist()

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    combined_texts,
    label_ids,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pre-trained checkpoint name used below.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### PyTorch

In [None]:
# Define a Dataset class for PyTorch
class SalesforceDataset(Dataset):
    """Dataset that tokenizes a single text on each item lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments; its result must provide ``"input_ids"`` and
            ``"attention_mask"`` entries.
        max_length: Length handed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text at ``idx`` together with its label tensor."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # squeeze(0) drops a leading size-1 axis (if present) from each tensor.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(sample_label, dtype=torch.long)
        return sample


#### Datasets, dataloaders and model

In [None]:
# Wrap the train/validation splits in datasets and batch them.
train_dataset = SalesforceDataset(train_texts, train_labels, tokenizer)
val_dataset = SalesforceDataset(val_texts, val_labels, tokenizer)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Load pre-trained BERT with a classification head sized to the label set.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(label_mapping))
model.to(device)

# Use PyTorch's own AdamW: the AdamW class exported by transformers is
# deprecated and no longer shipped in recent releases. eps and weight_decay are
# set explicitly to the defaults of the transformers implementation so the
# optimisation hyperparameters stay the same.
# NOTE: to run on a transformers release without AdamW, the top-of-file
# `from transformers import ...` line must also stop importing it.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Passed into the train/evaluate functions, which take the loss from the model
# output rather than from this criterion.
criterion = torch.nn.CrossEntropyLoss()

### Training

In [None]:
# Training function
def train_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Model called with input ids, ``attention_mask`` and ``labels``;
            its output must expose ``loss`` and ``logits``.
        loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"``
            and ``"label"`` tensors.
        optimizer: Optimizer stepped once per batch.
        criterion: Accepted for interface compatibility; not used, because the
            loss is read from the model output.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses summed and divided by
        ``len(loader)``, and the accuracy of the argmax predictions collected
        during the pass.
    """
    model.train()
    running_loss = 0
    all_preds, all_targets = [], []

    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad()

        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Passing labels makes the model compute the loss itself.
        result = model(ids, attention_mask=mask, labels=labels)
        batch_loss = result.loss

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        all_preds += result.logits.argmax(dim=1).cpu().tolist()
        all_targets += labels.cpu().tolist()

    return running_loss / len(loader), accuracy_score(all_targets, all_preds)


### Final Training and Evaluation

In [None]:
# Evaluation function
def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Model called with input ids, ``attention_mask`` and ``labels``;
            its output must expose ``loss`` and ``logits``.
        loader: Iterable of batches with ``"input_ids"``, ``"attention_mask"``
            and ``"label"`` tensors.
        criterion: Accepted for interface compatibility; not used, because the
            loss is read from the model output.
        device: Device every batch tensor is moved to.

    Returns:
        ``(mean_loss, accuracy, report)`` where ``report`` is the text produced
        by ``classification_report`` for the classes in the module-level
        ``label_mapping``.
    """
    model.eval()
    epoch_loss = 0
    predictions, targets = [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            # Move data to device
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            # Forward pass
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            # Accumulate metrics
            epoch_loss += loss.item()
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            targets.extend(labels.cpu().numpy())

    accuracy = accuracy_score(targets, predictions)

    # Pass the label ids explicitly alongside their names. Without `labels`,
    # the report infers the classes from the data and raises when a class is
    # absent from both targets and predictions, because the number of names no
    # longer matches the number of classes found.
    report = classification_report(
        targets,
        predictions,
        labels=list(label_mapping.values()),
        target_names=list(label_mapping.keys()),
    )
    return epoch_loss / len(loader), accuracy, report

# Fine-tune for a fixed number of epochs, checkpointing on validation accuracy.
epochs = 15
best_accuracy = 0

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    train_loss, train_accuracy = train_epoch(
        model, train_loader, optimizer, criterion, device
    )
    val_loss, val_accuracy, val_report = evaluate(
        model, val_loader, criterion, device
    )

    print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print("\nValidation Classification Report:\n", val_report)

    # Nothing to save unless this epoch strictly beats the best accuracy so far.
    if not val_accuracy > best_accuracy:
        continue

    best_accuracy = val_accuracy
    # Model and tokenizer go to the same directory so they can be reloaded together.
    for artifact in (model, tokenizer):
        artifact.save_pretrained("best_model")
    print("Best model saved.")

print("Training completed.")

### Best model and tokenizer

In [None]:
# Restore the tokenizer and the fine-tuned classifier from the "best_model"
# directory, then move the classifier onto the selected device.
tokenizer = BertTokenizer.from_pretrained("best_model")
model = BertForSequenceClassification.from_pretrained("best_model")
model.to(device)

### Generating output answers

In [None]:
# Function to predict and retrieve answer from dataset
def predict_question_and_answer(question: str, max_length=128):
    """Classify ``question`` into a role and retrieve the best-matching answer.

    The module-level ``model`` and ``tokenizer`` predict a role for the
    question. Among the rows of the module-level ``data`` frame with that role,
    the row whose ``"Question"`` text is most similar to the input (by
    ``difflib.SequenceMatcher`` ratio, case-insensitive) supplies the answer.
    Ties go to the earliest row.

    Args:
        question: Free-text question to classify.
        max_length: Length passed to the tokenizer for padding/truncation.

    Returns:
        ``(predicted_role, relevant_answer)``: the role name from
        ``label_mapping_inv`` and the ``"Answer"`` value of the selected row.

    Raises:
        IndexError: If ``data`` contains no row for the predicted role.
    """
    # Local import: keeps this block self-contained without touching file-level imports.
    import difflib

    # NOTE(review): training texts were built as "Question: ... Answer: ...",
    # while only the bare question is tokenized here — confirm this is intended.
    inputs = tokenizer(
        question,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    # Move the tokenized tensors to the device the model lives on.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Make sure the model is in eval mode before inference, whatever mode the
    # calling code left it in.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    # Class with the highest logit, translated back to its role name.
    prediction = torch.argmax(logits, dim=1).item()
    predicted_role = label_mapping_inv[prediction]

    candidates = data[data["Role"] == predicted_role]
    if candidates.empty:
        raise IndexError(f"No dataset rows found for role {predicted_role!r}")

    # SequenceMatcher caches its analysis of the second sequence, so the query
    # is set once as seq2 and only seq1 changes per candidate.
    matcher = difflib.SequenceMatcher()
    matcher.set_seq2(question.strip().lower())

    best_score, relevant_answer = -1.0, None
    for known_question, answer in zip(candidates["Question"], candidates["Answer"]):
        matcher.set_seq1(str(known_question).strip().lower())
        score = matcher.ratio()
        # Strict comparison keeps the earliest row on ties.
        if score > best_score:
            best_score, relevant_answer = score, answer

    return predicted_role, relevant_answer

### Experimenting with prompt engineering

### User Interaction

In [None]:
# Interactive loop: read a question, classify it, and print a role-specific reply.
print("Welcome to the Salesforce Chatbot")
print("Type 'exit' to quit.")

while True:
    # Strip surrounding whitespace so inputs like " exit " still match below.
    question = input("\nEnter your question: ").strip()

    # Exit condition
    if question.lower() == 'exit':
        print("Exiting... Goodbye!")
        break

    # Nothing to classify on an empty line; ask again.
    if not question:
        continue

    # Get model's prediction (role and answer)
    predicted_role, relevant_answer = predict_question_and_answer(question)

    # Conditional role-playing prompt: wrap the answer in a role-specific lead-in.
    if predicted_role == "End-User":
        chatbot_response = f"As an end-user, here’s my response to your question: {relevant_answer}"
    elif predicted_role == "Sales Admin":
        chatbot_response = f"As a sales admin, here’s a more detailed response to your question: {relevant_answer}"
    else:
        # Unknown role: fall back to the plain answer so chatbot_response is
        # always bound and never carries over from a previous iteration.
        chatbot_response = relevant_answer

    # Display the result
    print(f"Question asked: {question}")
    print(f"Model predicted role: {predicted_role}")
    print(f"Chatbot response: {chatbot_response}")
