# Extracts hidden states of QWEN model given different datasets

In [None]:
from Get_Go_Emo import get_go
from Get_Isear import get_isr

In [None]:
# Load the goEmo data and keep only its first 32 rows for a quick run
goEmo = get_go()[:32]
goEmo

In [None]:
# Load the isear data and keep only its first 32 rows for a quick run
isear = get_isr()[:32]
isear

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import json
import time
import os

def _collect_first_token_states(model_name, texts, batch_size, device, start_time):
    """Load ``model_name`` and return one ``{'layer_<k>': [floats]}`` dict per text.

    The vector stored for each layer is the hidden state of the first
    *attended* token of the example (position 0 with right padding, the first
    non-pad position with left padding).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        # Some tokenizers (e.g. gpt2) ship without a pad token; reuse EOS so
        # that batched padding works.
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
    model.eval()
    model.to(device)

    tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    dataset = TensorDataset(tokenized['input_ids'], tokenized['attention_mask'])
    dataloader = DataLoader(dataset, batch_size=batch_size)
    total_batches = len(dataloader)

    hidden_dicts = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            batch_start_time = time.time()

            input_ids_batch, attention_mask_batch = [t.to(device) for t in batch]
            outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)

            # argmax returns the first maximal index, i.e. the first position
            # whose mask is 1. This is 0 for right-padded batches and skips
            # the pad tokens of left-padded ones.
            first_token = attention_mask_batch.long().argmax(dim=1)
            rows = torch.arange(input_ids_batch.size(0), device=input_ids_batch.device)

            # One (batch, hidden) list per layer, gathered in a single
            # indexing op instead of one tensor->list conversion per example.
            per_layer = [
                layer[rows, first_token].float().cpu().tolist()
                for layer in outputs.hidden_states
            ]
            for example_vectors in zip(*per_layer):
                hidden_dicts.append({
                    f'layer_{layer_idx}': vector
                    for layer_idx, vector in enumerate(example_vectors)
                })

            batch_time = time.time() - batch_start_time
            print(
                f"Batch {batch_idx + 1}/{total_batches} | "
                f"Time: {batch_time:.2f}s | "
                f"Avg: {(time.time() - start_time)/(batch_idx + 1):.2f}s/batch", 
                end='\r'
            )

    return hidden_dicts


def extract_hidden_states(df, model_names, text_column='clean_text', batch_size=16, dataset_name="no_name", device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Extracts per-layer first-token hidden states for each text in the DataFrame
    using the specified models.

    For every model a file ``hidden_states/<model>_<dataset_name>.json`` is
    written containing one ``{'layer_<k>': [floats]}`` dict per row. Path
    separators in the model name are replaced by ``_`` in the file name.
    Finally the returned DataFrame is saved to
    ``hidden_states/all_hidden_states_<dataset_name>.json``.

    Args:
        df (pd.DataFrame): Input DataFrame containing the text data. It is not modified.
        model_names (list): List of model names to extract hidden states from.
        text_column (str): Name of the column containing text data.
        batch_size (int): Batch size for processing.
        dataset_name (str): Label used in log messages and output file names.
        device (str): Device to run the model on ('cuda' or 'cpu').

    Returns:
        pd.DataFrame: Copy of ``df`` with one added column per model, named
        ``<model_name>_hidden_states``, holding each row's layer dict.
    """
    os.makedirs('hidden_states', exist_ok=True)

    # Work on a copy so the caller's frame (often a slice) is never mutated.
    result_df = df.copy()
    texts = df[text_column].tolist()

    for model_name in model_names:
        print(f"\nProcessing model: {model_name} for Dataset : {dataset_name}")
        model_start_time = time.time()

        if texts:
            all_hidden_dicts = _collect_first_token_states(
                model_name, texts, batch_size, device, model_start_time
            )
            # The helper's model/tokenizer references are gone once it
            # returns, so cached GPU blocks can be released now.
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        else:
            # Nothing to encode: skip the model load and emit empty outputs.
            all_hidden_dicts = []

        # Hub ids such as "org/name" contain a path separator; flatten it so
        # the file lands directly inside hidden_states/.
        safe_model_name = model_name.replace('/', '_').replace(os.sep, '_')
        output_filename = os.path.join('hidden_states', f"{safe_model_name}_{dataset_name}.json")
        with open(output_filename, 'w') as f:
            json.dump(all_hidden_dicts, f, indent=2)

        result_df[f'{model_name}_hidden_states'] = all_hidden_dicts

        # Print summary (guard the per-sample rate against an empty input)
        total_time = time.time() - model_start_time
        print(f"\nCompleted {model_name} in {total_time:.2f}s ({total_time/max(len(texts), 1):.4f}s/sample)")
        print(f"Saved hidden states to {output_filename}")

    # Save complete DataFrame
    final_output_path = os.path.join('hidden_states', f'all_hidden_states_{dataset_name}.json')
    result_df.to_json(final_output_path, orient='records', indent=2)
    print(f"\nSaved complete DataFrame with all hidden states to {final_output_path}")

    return result_df

In [None]:
# Example usage: run the same set of models over both datasets
model_names = ['bert-base-uncased', 'gpt2']

# goEmo dataset
goEmo_with_hidden = extract_hidden_states(df=goEmo, model_names=model_names, dataset_name="goEmo")

# isear dataset
isear_with_hidden = extract_hidden_states(df=isear, model_names=model_names, dataset_name="isear")

# TODO: add a series of plots, one set per entry in model_names

In [None]:
from analysis import describe_hidden_states, analyze_hidden_states, describe_all_hidden_states

In [None]:
# Example usage: call analyze_hidden_states with its default arguments and
# keep whatever it returns in `analysis`.
# NOTE(review): presumably reads the JSON files under hidden_states/ — confirm in analysis.py
analysis = analyze_hidden_states()

In [None]:
# Describe all files in the default hidden_states directory
# (the default location is defined inside describe_all_hidden_states, not in this notebook)
describe_all_hidden_states()