In [None]:
# <xaiArtifact artifact_id="7fca53d3-eeb9-4e51-9992-096280baf133" title="train_sentiment_model.ipynb" contentType="text/x-python">
# Set memory allocation configuration for PyTorch
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Install required packages
!pip install transformers datasets pandas numpy torch accelerate

import pandas as pd
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
import os
import numpy as np
from tqdm import tqdm
import gc

# Configuration
class Config:
    """Central place for the hyperparameters used by the training pipeline.

    All values are class attributes and are read as ``Config.NAME``; the
    class is never instantiated in this file.
    """

    # --- Model and tokenisation ---
    MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"  # Twitter-specific model
    MAX_LENGTH = 128  # passed to the tokenizer as max_length

    # --- Optimisation ---
    NUM_EPOCHS = 3
    BASE_BATCH_SIZE = 8  # per-device batch size for both training and evaluation
    LEARNING_RATE = 2e-5
    WARMUP_STEPS = 500

    # --- Logging / checkpointing cadence (in optimiser steps) ---
    LOGGING_STEPS = 100  # also reused as the evaluation interval
    SAVE_STEPS = 1000

    # --- Early stopping ---
    EARLY_STOPPING_PATIENCE = 2
    EARLY_STOPPING_THRESHOLD = 0.01

# Enhanced data loading with validation for headerless CSV
def load_and_validate_data(file_path, has_headers=True):
    """Load a tweet-sentiment CSV, validate it, and attach integer labels.

    The file is first read with pandas' default encoding and re-read as
    ``latin1`` if that raises ``UnicodeDecodeError``.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.
        has_headers: When False the file is read with ``header=None`` and its
            columns are named 'Tweet ID', 'entity', 'sentiment',
            'Tweet content', in that order.

    Returns:
        A DataFrame holding the surviving rows plus a ``label`` column
        (Negative=0, Neutral=1, Positive=2). Rows are removed when their
        sentiment is 'Irrelevant', when 'Tweet content' or 'sentiment' is
        missing, or when their 'Tweet ID' repeats an earlier row.

    Raises:
        ValueError: If a required column is absent, a non-missing sentiment
            is outside the accepted set, or a sentiment cannot be mapped.
        Exception: Anything else raised while reading/processing is printed
            and re-raised unchanged.
    """
    expected_columns = ['Tweet ID', 'entity', 'sentiment', 'Tweet content']
    # Only the header handling differs between the two modes, so build the
    # keyword arguments once and reuse them for the encoding fallback.
    read_kwargs = {} if has_headers else {'header': None}

    try:
        try:
            df = pd.read_csv(file_path, **read_kwargs)
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='latin1', **read_kwargs)

        if not has_headers:
            # Assign column names for headerless CSV
            df.columns = expected_columns

        # Validate required columns
        missing = set(expected_columns) - set(df.columns)
        if missing:
            raise ValueError(f"Missing required columns: {missing}")

        # Validate sentiments. Missing values are excluded from this check:
        # they are dropped below, so they must not be reported as "invalid".
        valid_sentiments = {'Positive', 'Negative', 'Neutral', 'Irrelevant'}
        invalid_sentiments = set(df['sentiment'].dropna().unique()) - valid_sentiments
        if invalid_sentiments:
            raise ValueError(f"Invalid sentiment values found: {invalid_sentiments}")

        # Filter out irrelevant and missing data, then de-duplicate by tweet.
        # The trailing copy() gives an independent frame so that adding the
        # 'label' column does not write into a slice of the raw data.
        df = (
            df[df['sentiment'] != 'Irrelevant']
            .dropna(subset=['Tweet content', 'sentiment'])
            .drop_duplicates(subset=['Tweet ID'], keep='first')
            .copy()
        )

        # Map sentiments
        sentiment_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
        df['label'] = df['sentiment'].map(sentiment_map)

        # Validate label mapping
        if df['label'].isna().any():
            raise ValueError("Some sentiments couldn't be mapped to labels")

        return df

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

# Memory-optimized dataset class
class OptimizedTweetDataset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenised tweets.

    Only the 'input_ids' and 'attention_mask' entries of ``encodings`` are
    retained; any other keys are ignored. Tensors are created on access
    rather than up front.
    """

    _KEPT_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, encodings, labels):
        # Keep references to just the two fields that __getitem__ serves.
        self.encodings = {name: encodings[name] for name in self._KEPT_FIELDS}
        self.labels = labels

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the sample in the order: kept fields first, then 'labels'.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Enhanced training function with Twitter-RoBERTa model
def train_model(train_df, output_dir="sentiment_model"):
    """Fine-tune the Twitter-RoBERTa classifier on ``train_df`` and save it.

    The frame is tokenised, split 90/10 into train/validation sets, and
    trained with the HuggingFace ``Trainer`` using early stopping on the
    weighted F1 score. Intermediate checkpoints go to ``./results``; the
    final model and tokenizer are written to ``output_dir``.

    Args:
        train_df: DataFrame with a 'Tweet content' text column and an integer
            'label' column (as produced by ``load_and_validate_data``).
        output_dir: Directory that receives the saved model and tokenizer;
            created if it does not exist.

    Returns:
        True once training and saving have completed.

    Raises:
        Exception: Any failure is printed and re-raised unchanged.
    """
    model = None
    trainer = None
    try:
        # Clear memory before starting
        gc.collect()
        torch.cuda.empty_cache()

        # Initialize device
        use_cuda = torch.cuda.is_available()
        device = torch.device('cuda' if use_cuda else 'cpu')
        print(f"Using device: {device}")

        # Initialize tokenizer and model with Twitter-RoBERTa
        tokenizer = AutoTokenizer.from_pretrained(Config.MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(
            Config.MODEL_NAME,
            num_labels=3
        ).to(device)

        # Enable gradient checkpointing to save memory
        model.gradient_checkpointing_enable()

        # Convert to HuggingFace Dataset
        dataset = Dataset.from_pandas(train_df)

        # Tokenization function
        def tokenize_function(examples):
            return tokenizer(
                examples['Tweet content'],
                truncation=True,
                padding='max_length',
                max_length=Config.MAX_LENGTH
            )

        # Tokenize dataset
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=Config.BASE_BATCH_SIZE
        )

        # Split into train and validation (10% for validation).
        # A fixed seed keeps the split, and therefore the reported metrics
        # and early-stopping decisions, reproducible between runs.
        train_val_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
        train_dataset = train_val_split['train']
        val_dataset = train_val_split['test']

        # Compute metrics function
        def compute_metrics(p):
            predictions, labels = p
            predictions = np.argmax(predictions, axis=1)

            return {
                'accuracy': accuracy_score(labels, predictions),
                'f1': f1_score(labels, predictions, average='weighted')
            }

        # Training arguments with memory optimizations
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=Config.NUM_EPOCHS,
            per_device_train_batch_size=Config.BASE_BATCH_SIZE,
            per_device_eval_batch_size=Config.BASE_BATCH_SIZE,
            gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
            learning_rate=Config.LEARNING_RATE,
            warmup_steps=Config.WARMUP_STEPS,
            logging_dir='./logs',
            logging_steps=Config.LOGGING_STEPS,
            eval_strategy="steps",  # Use eval_strategy instead of evaluation_strategy
            eval_steps=Config.LOGGING_STEPS,
            save_steps=Config.SAVE_STEPS,
            save_total_limit=1,  # Limit retained checkpoints to save disk space
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            # Mixed precision is only requested when a GPU is present; asking
            # for fp16 on a CPU-only machine would break the CPU path that the
            # device selection above explicitly allows.
            fp16=use_cuda,
            gradient_checkpointing=True,  # Enable gradient checkpointing
            dataloader_num_workers=2,  # Parallel data loading
            report_to="none"
        )

        # Initialize Trainer with early stopping
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            callbacks=[
                EarlyStoppingCallback(
                    early_stopping_patience=Config.EARLY_STOPPING_PATIENCE,
                    early_stopping_threshold=Config.EARLY_STOPPING_THRESHOLD
                )
            ]
        )

        # Clear memory before training
        gc.collect()
        torch.cuda.empty_cache()

        # Train model
        print("Starting training...")
        trainer.train()

        # Save best model
        os.makedirs(output_dir, exist_ok=True)
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        return True

    except Exception as e:
        print(f"Error during training: {str(e)}")
        raise

    finally:
        # Clean up on success AND failure: drop this frame's references to
        # the model/trainer first so the cache flush can actually release
        # the memory they were holding.
        model = None
        trainer = None
        gc.collect()
        torch.cuda.empty_cache()

# Main execution
# Entry point: load the dataset, train the model, and package the result.
if __name__ == "__main__":
    # Imported here because it is only needed for the packaging step below.
    import shutil

    try:
        # Check GPU memory before starting
        if torch.cuda.is_available():
            print(f"GPU Memory before loading data: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
            print(f"Total GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

        # Load data (update path to your dataset)
        print("Loading and validating data...")
        train_df = load_and_validate_data("twitter_training.csv", has_headers=False)

        # Train model
        success = train_model(train_df)

        if success:
            print("Training completed successfully!")
            # Zip model for download. shutil is used instead of shelling out
            # to `zip` so that a packaging failure raises (and is reported by
            # the handler below) rather than being silently followed by the
            # success message. Produces sentiment_model.zip containing the
            # sentiment_model/ directory.
            shutil.make_archive(
                "sentiment_model",
                "zip",
                root_dir=".",
                base_dir="sentiment_model",
            )
            print("Model zipped and ready for download.")

    except Exception as e:
        # Top-level boundary: report the failure without a traceback.
        print(f"Error in main execution: {str(e)}")