# Extract hidden states from any model, given the model names and one or more datasets

In [1]:
from Get_Go_Emo import get_go
from Get_Isear import get_isr

# Row cap applied to each dataset loaded below.
num_samples = 1_000

In [2]:
# Load the GoEmotions frame and keep only its first `num_samples` rows.
goEmo = get_go()[:num_samples]
goEmo

Unnamed: 0,labels,clean_text
0,[27],my favourite food is anything i didnt have to ...
1,[27],"now if he does off himself, everyone will thin..."
2,[2],why the fuck is bayless isoing
3,[14],to make her feel threatened
4,[3],dirty southern wankers
...,...,...
995,[18],i love name
996,[20],woman here 50 per orgasm please! i could easil...
997,[27],a royal with creme
998,[ 4 15],oh god yes. top quality cringe. thank you for ...


In [3]:
# Load the ISEAR frame and keep only its first `num_samples` rows.
isear = get_isr()[:num_samples]
isear

Unnamed: 0,clean_text,labels
0,during the period of falling in love each time...,1
1,when i was involved in a traffic accident,2
2,when i was driving home after several days of ...,3
3,when i lost the person who meant the most to me,4
4,the time i knocked a deer down the sight of th...,5
...,...,...
995,when i came to know that a girl i was fond of ...,4
996,a too eager approach by a dirty drunken person...,5
997,i made a major mistake while learning how to u...,6
998,i reproached my mothers cooking and criticised...,7


In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import json
import time
import os  # Import os module for directory operations

def _pool_layers(hidden_states, attention_mask, pooling):
    """Collapse every layer's per-token output to one vector per example.

    Args:
        hidden_states: Iterable of per-layer tensors indexed as
            (example, token position, hidden unit).
        attention_mask: Tensor with 1 for real tokens and 0 for padding,
            indexed as (example, token position).
        pooling (str): 'first', 'last' or 'mean' (already validated by caller).

    Returns:
        torch.Tensor: Pooled vectors indexed as (example, layer, hidden unit).
    """
    if pooling == 'first':
        pooled = [layer[:, 0, :] for layer in hidden_states]
    elif pooling == 'last':
        # Index of the last real (non-padding) token; flipping the mask makes
        # this work for both left- and right-padded batches.
        seq_len = attention_mask.size(1)
        last_idx = seq_len - 1 - attention_mask.flip(dims=[1]).argmax(dim=1)
        rows = torch.arange(attention_mask.size(0), device=attention_mask.device)
        pooled = [layer[rows, last_idx, :] for layer in hidden_states]
    else:  # 'mean'
        mask = attention_mask.unsqueeze(-1)
        # clamp avoids a division by zero for a row with no real tokens
        counts = mask.sum(dim=1).clamp(min=1)
        pooled = [(layer * mask).sum(dim=1) / counts for layer in hidden_states]
    return torch.stack(pooled, dim=1)


def extract_hidden_states(df, model_names, text_column='clean_text', batch_size=16, dataset_name="no_dataset_selected", device='cuda' if torch.cuda.is_available() else 'cpu', pooling='first'):
    """
    Extracts per-layer hidden states for each text in the DataFrame using specified models.

    For every model, one vector per layer is kept for each text and written to
    ``hidden_states/<model>_<dataset_name>.json`` as a list (same order as
    ``df``) of ``{'layer_0': [...], 'layer_1': [...], ...}`` dictionaries.
    The same dictionaries are attached to a copy of ``df`` in a column named
    ``<model_name>_hidden_states``.

    Args:
        df (pd.DataFrame): Input DataFrame containing the text data. It is not modified.
        model_names (list): List of model names to extract hidden states from.
        text_column (str): Name of the column containing text data.
        batch_size (int): Batch size for processing.
        dataset_name (str): Label used in the per-model output file name.
        device (str): Device to run the model on ('cuda' or 'cpu').
        pooling (str): Which token(s) represent a text in each layer:
            'first' (default, position 0 -- the [CLS] token for BERT-style
            models), 'last' (last non-padding token) or 'mean' (average over
            non-padding tokens). For causal models such as GPT-2, position 0
            cannot attend to the rest of the text, so 'last' or 'mean' is
            usually the more meaningful choice.

    Returns:
        pd.DataFrame: Copy of ``df`` with one added column per model holding
        that model's hidden states.

    Raises:
        ValueError: If ``pooling`` is not one of 'first', 'last', 'mean'.

    Note:
        ``hidden_states/all_hidden_states.json`` has a fixed name, so every
        call overwrites the file written by the previous call.
    """
    if pooling not in ('first', 'last', 'mean'):
        raise ValueError(f"pooling must be 'first', 'last' or 'mean', got {pooling!r}")

    # Create hidden_states directory if it doesn't exist
    os.makedirs('hidden_states', exist_ok=True)

    # Work on a copy so the caller's frame (often a slice of a larger one) is never mutated.
    result_df = df.copy()
    # The texts are the same for every model, so read them once.
    texts = df[text_column].tolist()

    for model_name in model_names:
        print(f"\nProcessing model: {model_name}")
        model_start_time = time.time()

        # Load tokenizer and model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        model.eval()
        model.to(device)

        # Handle missing padding token
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Tokenize all texts
        tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

        # Create DataLoader
        dataset = TensorDataset(tokenized['input_ids'], tokenized['attention_mask'])
        dataloader = DataLoader(dataset, batch_size=batch_size)

        all_hidden_dicts = []
        total_batches = len(dataloader)

        with torch.no_grad():
            for batch_idx, (input_ids_batch, attention_mask_batch) in enumerate(dataloader):
                batch_start_time = time.time()

                input_ids_batch = input_ids_batch.to(device)
                attention_mask_batch = attention_mask_batch.to(device)

                # Get model outputs
                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)

                # Pool on the device and copy to the host once per batch
                # instead of once per example per layer.
                pooled = _pool_layers(outputs.hidden_states, attention_mask_batch, pooling)
                for example_layers in pooled.cpu().tolist():
                    all_hidden_dicts.append({
                        f'layer_{layer_idx}': embedding
                        for layer_idx, embedding in enumerate(example_layers)
                    })

                # Calculate batch processing time
                batch_time = time.time() - batch_start_time

                # Print progress with time information
                print(
                    f"Batch {batch_idx + 1}/{total_batches} | "
                    f"Time: {batch_time:.2f}s | "
                    f"Avg: {(time.time() - model_start_time)/(batch_idx + 1):.2f}s/batch",
                    end='\r'
                )

        # Hub ids such as "org/model" contain a path separator; flatten it so the
        # file lands directly inside hidden_states/ instead of a missing sub-folder.
        safe_model_name = model_name.replace('/', '__').replace(os.sep, '__')

        # Save individual model's hidden states to JSON file in hidden_states folder
        output_filename = os.path.join('hidden_states', f"{safe_model_name}_{dataset_name}.json")
        with open(output_filename, 'w') as f:
            json.dump(all_hidden_dicts, f, indent=2)  # indent for pretty-printing

        # Attach this model's hidden states so the returned frame matches the docstring
        result_df[f'{model_name}_hidden_states'] = all_hidden_dicts

        # Print final summary (max() guards the per-sample rate against an empty frame)
        total_time = time.time() - model_start_time
        print(f"\nCompleted {model_name} in {total_time:.2f}s ({total_time/max(len(df), 1):.4f}s/sample)")
        print(f"Saved hidden states to {output_filename}")

        # Cleanup
        del model, tokenizer
        torch.cuda.empty_cache()

    # Save complete DataFrame with all hidden states to a single JSON file
    final_output_path = os.path.join('hidden_states', 'all_hidden_states.json')
    result_df.to_json(final_output_path, orient='records', indent=2)
    print(f"\nSaved complete DataFrame with all hidden states to {final_output_path}")

    return result_df

In [5]:

# Example run: swap in whichever checkpoints you want to probe.
model_names = ['bert-base-uncased', 'gpt2']

# Run the extractor on goEmo first, then on isear (same order as before);
# each result is bound to its own `<dataset>_with_hidden` name.
goEmo_with_hidden, isear_with_hidden = (
    extract_hidden_states(frame, model_names, dataset_name=label)
    for frame, label in ((goEmo, "goEmo"), (isear, "isear"))
)



Processing model: bert-base-uncased
Batch 63/63 | Time: 0.32s | Avg: 0.74s/batch
Completed bert-base-uncased in 62.90s (0.0629s/sample)
Saved hidden states to hidden_states/bert-base-uncased_goEmo.json

Processing model: gpt2
Batch 63/63 | Time: 0.35s | Avg: 0.72s/batch
Completed gpt2 in 58.46s (0.0585s/sample)
Saved hidden states to hidden_states/gpt2_goEmo.json

Saved complete DataFrame with all hidden states to hidden_states/all_hidden_states.json

Processing model: bert-base-uncased
Batch 63/63 | Time: 0.56s | Avg: 1.08s/batch
Completed bert-base-uncased in 81.43s (0.0814s/sample)
Saved hidden states to hidden_states/bert-base-uncased_isear.json

Processing model: gpt2
Batch 63/63 | Time: 0.62s | Avg: 1.43s/batch
Completed gpt2 in 104.75s (0.1048s/sample)
Saved hidden states to hidden_states/gpt2_isear.json

Saved complete DataFrame with all hidden states to hidden_states/all_hidden_states.json


# Analysis: add a series of plots for each model name

In [6]:
from analysis import describe_hidden_states, analyze_hidden_states, describe_all_hidden_states

In [7]:
# Example usage: analyse the saved hidden-state files. The call prints one
# report per file (model, total samples, number of layers, hidden dimension
# size and two consistency checks) -- see the output below.
# NOTE(review): the "Dataset:" field is printed empty for every file;
# presumably the dataset name is not recovered from the file name -- verify
# in the analysis module.
analysis = analyze_hidden_states()


Analyzing gpt2_goEmo.json...
Model: gpt2
Dataset: 
Total samples: 1000
Number of layers: 13
Hidden dimension size: 768
Consistent layers across samples: True
Consistent dimensions: True

Analyzing bert-base-uncased_isear.json...
Model: bert-base-uncased
Dataset: 
Total samples: 1000
Number of layers: 13
Hidden dimension size: 768
Consistent layers across samples: True
Consistent dimensions: True

Analyzing gpt2_isear.json...
Model: gpt2
Dataset: 
Total samples: 1000
Number of layers: 13
Hidden dimension size: 768
Consistent layers across samples: True
Consistent dimensions: True

Analyzing bert-base-uncased_goEmo.json...
Model: bert-base-uncased
Dataset: 
Total samples: 1000
Number of layers: 13
Hidden dimension size: 768
Consistent layers across samples: True
Consistent dimensions: True


In [8]:
# Describe all files in the default hidden_states directory.
# For each file found, the call prints its sample count, layer count and
# hidden dimension size, followed by a few example samples (see output below).
describe_all_hidden_states()


Found 5 hidden state files to analyze:

File 1/5: gpt2_goEmo.json

=== Hidden States Data Structure ===
File: gpt2_goEmo.json
Total samples: 1000
Number of layers: 13
Hidden dimension size: 768

=== Structure Details ===
1. Top level: List of samples (order matches input DataFrame)
2. Each sample: Dictionary with layer-wise CLS token embeddings
3. Layer keys: 'layer_0' to 'layer_N' where N = num_layers-1
4. Each layer: List of floats (length = hidden_dimension)

=== Example Samples ===

Sample 1:
  layer_0: [0.10145839303731918, -0.17585629224777222, 0.11894845217466354, 0.10073922574520111, -0.03555705398321152, ...] (total 768 values)
  layer_1: [-0.12574833631515503, -0.9124440550804138, -0.830676794052124, -0.00699269026517868, -0.7556575536727905, ...] (total 768 values)
  layer_10: [-0.32650119066238403, -1.6769062280654907, -0.5606088638305664, -0.1685718595981598, 1.2347315549850464, ...] (total 768 values)
  layer_11: [-0.4182337522506714, -1.421185851097107, -0.8745468854904