In [1]:
!pip install tqdm



In [48]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import time
from tqdm.auto import tqdm

In [80]:
# Configuration
MODEL_NAME = 'bert-base-uncased'
MAX_LEN = 96
BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 5e-5
PATIENCE = 4

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device} {'- ' + torch.cuda.get_device_name(0) if torch.cuda.is_available() else ''}")

Using device: cuda - Tesla T4


In [81]:
# Load dataset
print("Loading dataset...")
df = pd.read_csv('/content/combined_student_queries_all.csv')

# Print dataset stats
print(f"Dataset shape: {df.shape}")
print(f"Class distribution:\n{df['Intent'].value_counts()}")

# Convert labels to numerical values
df['label'] = df['Intent'].map({'genuine': 0, 'manipulative': 1})

# Split dataset
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Initialize tokenizer
print(f"Loading {MODEL_NAME} tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Loading dataset...
Dataset shape: (2000, 3)
Class distribution:
Intent
manipulative    1011
genuine          989
Name: count, dtype: int64
Loading bert-base-uncased tokenizer...


In [82]:
# Dataset class
class QueryDataset(Dataset):
    def __init__(self, queries, labels, tokenizer, max_len):
        self.queries = queries
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        query = str(self.queries[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            query,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Create data loaders
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = QueryDataset(
        queries=df['Query'].values, #change to Query later
        labels=df['label'].values,
        tokenizer=tokenizer,
        max_len=max_len
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=0,
        pin_memory=True,
        shuffle=True if df is train_df else False
    )

In [83]:
print("Creating data loaders...")
train_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)

Creating data loaders...


In [84]:
# Initialize model
print(f"Loading {MODEL_NAME} model...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)
model = model.to(device)

Loading bert-base-uncased model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
# Set up optimizer and scheduler with weight decay
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Scheduler with warmup
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

In [86]:
import torch.cuda.amp as amp

# Initialize gradient scaler for mixed precision training
scaler = torch.amp.GradScaler('cuda')

In [87]:
# Training function
def train_epoch(model, data_loader, optimizer, device, scheduler, epoch):
    model = model.train()
    losses = []
    correct_predictions = 0
    total_samples = 0

    # Progress bar with batch tracking
    progress_bar = tqdm(
        enumerate(data_loader),
        total=len(data_loader),
        desc=f"Epoch {epoch + 1}/{EPOCHS} [Train]",
        leave=True,
    )

    for batch_idx, batch in progress_bar:
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        # Clear gradients
        optimizer.zero_grad()

        # Mixed precision forward pass
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss
            logits = outputs.logits

        # Mixed precision backward pass
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_samples += labels.size(0)
        losses.append(loss.item())

        # Update progress less frequently to reduce overhead
        if batch_idx % 5 == 0 or batch_idx == len(data_loader) - 1:
            current_loss = np.mean(losses[-10:]) if losses else 0
            current_acc = (correct_predictions.double() / total_samples).item() if total_samples > 0 else 0

            progress_bar.set_postfix({
                "Loss": f"{current_loss:.4f}",
                "Acc": f"{current_acc:.4f}",
                "LR": f"{scheduler.get_last_lr()[0]:.6f}"
            })

    epoch_loss = np.mean(losses)
    epoch_acc = correct_predictions.double() / len(data_loader.dataset)

    return epoch_acc.item(), epoch_loss


In [88]:
# Evaluation function
def eval_model(model, data_loader, device, epoch):
    model.eval()
    losses = []
    correct_predictions = 0

    # Progress bar for validation
    progress_bar = tqdm(
        enumerate(data_loader),
        total=len(data_loader),
        desc=f"Epoch {epoch + 1}/{EPOCHS} [Val]",
        leave=False,
    )

    with torch.no_grad():
        for batch_idx, batch in progress_bar:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            labels = batch['label'].to(device, non_blocking=True)

            # Can still use mixed precision for inference
            with torch.amp.autocast(device_type='cuda'):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )

                loss = outputs.loss
                logits = outputs.logits

            _, preds = torch.max(logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

            # Update less frequently
            if batch_idx % 10 == 0 or batch_idx == len(data_loader) - 1:
                progress_bar.set_postfix({
                    "Val Loss": f"{np.mean(losses[-10:]) if losses else 0:.4f}"
                })

    epoch_loss = np.mean(losses)
    epoch_acc = correct_predictions.double() / len(data_loader.dataset)

    return epoch_acc.item(), epoch_loss

In [89]:
# Training loop with timing
best_accuracy = 0
best_val_loss = float('inf')
epochs_without_improvement = 0
total_start_time = time.time()

print(f"\n{'='*50}")
print("Starting training...")
print(f"{'='*50}\n")

for epoch in range(EPOCHS):
    epoch_start_time = time.time()

    train_acc, train_loss = train_epoch(model, train_loader, optimizer, device, scheduler, epoch)
    val_acc, val_loss = eval_model(model, val_loader, device, epoch)

    epoch_time = time.time() - epoch_start_time

    print(
        f"\nEpoch {epoch + 1}/{EPOCHS} Summary:\n"
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}\n"
        f"Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}\n"
        f"Epoch Time: {epoch_time:.2f}s | Total Time: {(time.time() - total_start_time):.2f}s\n"
    )

    # Save model if validation accuracy improves
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        torch.save(model.state_dict(), 'best_model_acc.bin')
        print(f"✓ New best accuracy: {best_accuracy:.4f} - Model saved!\n")

    # Early stopping based on validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model_loss.bin')
        print(f"✓ New best validation loss: {best_val_loss:.4f} - Model saved!\n")
    else:
        epochs_without_improvement += 1
        if epochs_without_improvement >= PATIENCE:
            print(f"❌ Early stopping at epoch {epoch+1}")
            break

# Final training statistics
total_time = time.time() - total_start_time
print(f"\n{'='*50}")
print(f"Training complete!")
print(f"Total Training Time: {total_time:.2f}s ({total_time/60:.2f} minutes)")
print(f"Best Validation Accuracy: {best_accuracy:.4f}")
print(f"Best Validation Loss: {best_val_loss:.4f}")
print(f"{'='*50}\n")


Starting training...



Epoch 1/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 1/20 Summary:
Train Loss: 0.6782 | Train Acc: 0.5425
Val Loss:   0.6469 | Val Acc:   0.5850
Epoch Time: 11.63s | Total Time: 11.63s

✓ New best accuracy: 0.5850 - Model saved!

✓ New best validation loss: 0.6469 - Model saved!



Epoch 2/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 2/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 2/20 Summary:
Train Loss: 0.6252 | Train Acc: 0.6075
Val Loss:   0.5941 | Val Acc:   0.6250
Epoch Time: 11.57s | Total Time: 30.86s

✓ New best accuracy: 0.6250 - Model saved!

✓ New best validation loss: 0.5941 - Model saved!



Epoch 3/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 3/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 3/20 Summary:
Train Loss: 0.5542 | Train Acc: 0.6750
Val Loss:   0.6644 | Val Acc:   0.6100
Epoch Time: 11.56s | Total Time: 49.31s



Epoch 4/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 4/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 4/20 Summary:
Train Loss: 0.4695 | Train Acc: 0.7063
Val Loss:   0.6538 | Val Acc:   0.6700
Epoch Time: 11.48s | Total Time: 60.78s

✓ New best accuracy: 0.6700 - Model saved!



Epoch 5/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 5/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 5/20 Summary:
Train Loss: 0.4189 | Train Acc: 0.7206
Val Loss:   0.8934 | Val Acc:   0.6100
Epoch Time: 11.57s | Total Time: 79.96s



Epoch 6/20 [Train]:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 6/20 [Val]:   0%|          | 0/25 [00:00<?, ?it/s]


Epoch 6/20 Summary:
Train Loss: 0.3686 | Train Acc: 0.7644
Val Loss:   0.9772 | Val Acc:   0.6300
Epoch Time: 11.54s | Total Time: 91.51s

❌ Early stopping at epoch 6

Training complete!
Total Training Time: 91.51s (1.53 minutes)
Best Validation Accuracy: 0.6700
Best Validation Loss: 0.5941



In [90]:
# Load best model
print("Loading best model for inference...")
model.load_state_dict(torch.load('best_model_acc.bin'))
model.eval()

Loading best model for inference...


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [91]:
# Inference function
def predict(query, max_len=MAX_LEN):
    # Clean the text first
    cleaned_query = query

    encoding = tokenizer.encode_plus(
        cleaned_query,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        with torch.amp.autocast(device_type='cuda'):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    _, prediction = torch.max(logits, dim=1)

    intent = 'genuine' if prediction.item() == 0 else 'manipulative'
    confidence = probabilities[0][prediction.item()].item()

    return {
        'intent': intent,
        'confidence': confidence,
        'genuine_prob': probabilities[0][0].item(),
        'manipulative_prob': probabilities[0][1].item()
    }

In [92]:
# Test prediction
test_queries = [
    "Why does my program keep crashing?",
    "Can you give me the answer to this question?",
    "I need help understanding this concept, can you explain it to me?",
    "Just give me the answer, I don't care about learning"
]

print("\nTesting model on example queries:")
for query in test_queries:
    result = predict(query)
    print(f"\nQuery: '{query}'")
    print(f"Prediction: {result['intent']} (Confidence: {result['confidence']:.4f})")
    print(f"Probabilities - Genuine: {result['genuine_prob']:.4f}, Manipulative: {result['manipulative_prob']:.4f}")


Testing model on example queries:

Query: 'Why does my program keep crashing?'
Prediction: genuine (Confidence: 0.9946)
Probabilities - Genuine: 0.9946, Manipulative: 0.0054

Query: 'Can you give me the answer to this question?'
Prediction: genuine (Confidence: 0.8735)
Probabilities - Genuine: 0.8735, Manipulative: 0.1265

Query: 'I need help understanding this concept, can you explain it to me?'
Prediction: genuine (Confidence: 0.9556)
Probabilities - Genuine: 0.9556, Manipulative: 0.0446

Query: 'Just give me the answer, I don't care about learning'
Prediction: genuine (Confidence: 0.9624)
Probabilities - Genuine: 0.9624, Manipulative: 0.0378
