In [58]:
# This script outlines the emergency process for fast classification fine-tuning.
# It uses a small non-quantized model and a pure PyTorch loop to avoid the
# 'accelerate' dependency and the resulting ImportError.

import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForSequenceClassification, 
    AutoTokenizer, 
)
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm.auto import tqdm 
import os 


# --- 0. Configuration ---
MODEL_NAME = "google/electra-small-discriminator" # FAST, SMALL model
DATASET_NAME = "tweet_eval" 
TASK_NAME = "sentiment"    
OUTPUT_DIR = "./results_sentiment_hackathon_emergency" 
LABELS = ['negative', 'neutral', 'positive'] 
NUM_EPOCHS = 5 
TRAIN_BATCH_SIZE = 16 
LEARNING_RATE = 2e-5 

# --- GPU/Device Configuration (Crucial for Local GPU Use) ---
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE_STR = "cuda" if torch.cuda.is_available() else "cpu" 
print(f"--- Running on device: {DEVICE} ({DEVICE_STR}) ---")

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)


# --- 1. Load Model and Tokenizer (Standard Load) ---
print("--- 1. Loading Model and Tokenizer (Standard Load) ---")
try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=len(LABELS),
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    
    # Move model to device
    model.to(DEVICE)
    print(f"Loaded {MODEL_NAME} successfully on {DEVICE_STR}.")
    
except Exception as e:
    print(f"FATAL ERROR LOADING MODEL: {e}")
    print("Could not load base model. Check internet or model name.")


# --- 2. Load and Prepare Dataset ---
print("\n--- 2. Loading and Preprocessing Dataset ---")
def load_and_prepare_dataset(tokenizer):
    try:
        dataset = load_dataset(DATASET_NAME, TASK_NAME)
        # FIX: Ensure no invisible characters here
        label_mapping = {0: 0, 1: 1, 2: 2} 
        
        def preprocess_function(examples):
            tokenized_examples = tokenizer(
                examples['text'], 
                truncation=True, 
                padding='max_length', 
                max_length=128
            )
            tokenized_examples["labels"] = [label_mapping[label] for label in examples["label"]]
            return tokenized_examples

        tokenized_datasets = dataset.map(preprocess_function, batched=True)
        
        # Use a small subset for a fast hackathon run
        train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000)).remove_columns(['text', 'label'])
        eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500)).remove_columns(['text', 'label'])
        
        # Set format for PyTorch DataLoader
        train_dataset.set_format("torch")
        eval_dataset.set_format("torch")

        print(f"Dataset splits loaded. Train size (subset): {len(train_dataset)}, Test size (subset): {len(eval_dataset)}")
        return train_dataset, eval_dataset

    except Exception as e:
        print(f"Error loading or preprocessing dataset: {e}")
        return None, None

train_data, eval_data = load_and_prepare_dataset(tokenizer)

# --- 3. PyTorch Training Loop (REPLACES Trainer) ---
if train_data and eval_data:
    
    # Create DataLoaders
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
    
    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    # Set model to training mode
    model.train()
    
    print(f"\n--- 3. Starting PyTorch Fine-Tuning for {NUM_EPOCHS} epochs ---")
    
    for epoch in range(NUM_EPOCHS):
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")
        
        for batch in progress_bar:
            # Move tensors to the device
            batch = {k: v.to(DEVICE) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            
            # Optimization step
            optimizer.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
            
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} finished. Average Loss: {avg_loss:.4f}")
        
    # --- 4. Save Fine-Tuned Model ---
    print("\n--- 4. Saving Model ---")
    model.save_pretrained(f"{OUTPUT_DIR}/final_fine_tuned_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_fine_tuned_model")
    print("\nFine-Tuning complete and saved successfully.")

--- Running on device: cuda (cuda) ---
--- 1. Loading Model and Tokenizer (Standard Load) ---


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded google/electra-small-discriminator successfully on cuda.

--- 2. Loading and Preprocessing Dataset ---


Map: 100%|██████████| 2000/2000 [00:00<00:00, 14185.74 examples/s]


Dataset splits loaded. Train size (subset): 5000, Test size (subset): 500

--- 3. Starting PyTorch Fine-Tuning for 5 epochs ---


Epoch 1: 100%|██████████| 313/313 [00:10<00:00, 29.01it/s, loss=0.815]


Epoch 1 finished. Average Loss: 0.9693


Epoch 2:  70%|██████▉   | 219/313 [00:07<00:03, 30.05it/s, loss=0.731]


KeyboardInterrupt: 

In [59]:
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import numpy as np

# --- 0. Configuration ---
OUTPUT_DIR = "./results_sentiment_hackathon_emergency/final_fine_tuned_model"
LABELS = ['negative', 'neutral', 'positive'] 
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"--- Running inference on device: {DEVICE} ---")

# --- 1. Load Model and Tokenizer ---
def load_inference_model(output_path):
    """Loads the fine-tuned model and tokenizer from the saved path."""
    try:
        # Load the tokenizer and model from the saved directory
        tokenizer = AutoTokenizer.from_pretrained(output_path)
        model = AutoModelForSequenceClassification.from_pretrained(output_path)
        model.to(DEVICE)
        model.eval() # Set model to evaluation mode
        print(f"Inference model loaded successfully from {output_path}.")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading inference model: {e}")
        print("Ensure the training script finished successfully and the directory path is correct.")
        return None, None

model, tokenizer = load_inference_model(OUTPUT_DIR)

# --- 2. Inference Function ---
def classify_text(texts, model, tokenizer, device, max_length=128):
    """Tokenizes a list of texts and returns the predicted label and score."""
    if model is None or tokenizer is None:
        return ["Error: Model not loaded"] * len(texts)

    # Tokenize the input texts
    encoding = tokenizer(
        texts, 
        padding=True, 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )

    # Move tensors to the appropriate device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    # Get probabilities (softmax) and predicted classes
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=1).cpu().numpy()
    
    predictions = []
    for probs in probabilities:
        # Get the index of the highest probability
        pred_index = np.argmax(probs)
        # Get the label and confidence score
        label = LABELS[pred_index]
        score = probs[pred_index]
        
        predictions.append({
            "text": texts[probabilities.tolist().index(probs.tolist())], # Retrieve the original text
            "prediction": label,
            "confidence": f"{score:.4f}",
            "probabilities": {l: f"{p:.4f}" for l, p in zip(LABELS, probs)}
        })
        
    return predictions

# --- 3. Example Execution ---
if model and tokenizer:
    example_texts = [
        "This hackathon is the absolute best; I've learned so much!",
        "The server crashed right before the deadline. So frustrating.",
        "I guess the results are fine, nothing spectacular, nothing bad.",
        "That's the worst error I've ever seen.",
    ]

    results = classify_text(example_texts, model, tokenizer, DEVICE)

    print("\n--- Inference Results ---")
    for result in results:
        print("-" * 50)
        print(f"Text: \"{result['text']}\"")
        print(f"CLASSIFICATION: {result['prediction'].upper()} (Confidence: {result['confidence']})")
        print(f"Detail: {result['probabilities']}")

--- Running inference on device: cuda ---
Inference model loaded successfully from ./results_sentiment_hackathon_emergency/final_fine_tuned_model.

--- Inference Results ---
--------------------------------------------------
Text: "This hackathon is the absolute best; I've learned so much!"
CLASSIFICATION: POSITIVE (Confidence: 0.9732)
Detail: {'negative': '0.0074', 'neutral': '0.0194', 'positive': '0.9732'}
--------------------------------------------------
Text: "The server crashed right before the deadline. So frustrating."
CLASSIFICATION: NEGATIVE (Confidence: 0.8886)
Detail: {'negative': '0.8886', 'neutral': '0.0934', 'positive': '0.0180'}
--------------------------------------------------
Text: "I guess the results are fine, nothing spectacular, nothing bad."
CLASSIFICATION: POSITIVE (Confidence: 0.9246)
Detail: {'negative': '0.0191', 'neutral': '0.0562', 'positive': '0.9246'}
--------------------------------------------------
Text: "That's the worst error I've ever seen."
CLASSI

In [66]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import os

MODEL_NAME = "distilbert-base-uncased"
OUTPUT_DIR = "./lora_finetuned_model"
NUM_LABELS = 2
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-4
LORA_R = 8
LORA_ALPHA = 16

def run_lora_fine_tuning():
    print("--- Running on device:", "cuda" if torch.cuda.is_available() else "cpu", "---")

    # --- 1. Load model and tokenizer ---
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # --- 2. Apply LoRA ---
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        target_modules=["q_lin", "v_lin"],
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.SEQ_CLS,
    )

    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # --- 3. Load dataset (example: IMDb) ---
    dataset = load_dataset("imdb")

    def tokenize(batch):
        return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)

    tokenized = dataset.map(tokenize, batched=True)
    tokenized = tokenized.remove_columns(["text"])
    tokenized.set_format("torch")

    train_data = tokenized["train"]
    eval_data = tokenized["test"]

    # --- 4. Metric ---
    metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        preds = np.argmax(eval_pred.predictions, axis=1)
        return metric.compute(predictions=preds, references=eval_pred.label_ids, average="weighted")

    # --- 5. Training setup ---
    training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    num_train_epochs=EPOCHS,
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=50,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    report_to=[],  # disables W&B or TensorBoard
)


    # --- 6. Trainer ---
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_data,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    model.save_pretrained(f"{OUTPUT_DIR}/final_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")

    print("✅ Fine-tuning complete!")
    return model, tokenizer


# --- Main ---
if __name__ == "__main__":
    if not os.path.exists(OUTPUT_DIR):
        trained_model, trained_tokenizer = run_lora_fine_tuning()
    else:
        print("Model already fine-tuned, loading from disk...")
        trained_model = AutoModelForSequenceClassification.from_pretrained(f"{OUTPUT_DIR}/final_model")
        trained_tokenizer = AutoTokenizer.from_pretrained(f"{OUTPUT_DIR}/final_model")


--- Running on device: cuda ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


Map: 100%|██████████| 50000/50000 [00:11<00:00, 4491.21 examples/s]


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'