# Extract per-layer hidden states from pretrained language models (e.g. Qwen, BERT, GPT-2) for the GoEmotions and ISEAR datasets

In [1]:
from Get_Go_Emo import get_go
from Get_Isear import get_isr

In [2]:
# Load the Go-Emo frame through the project helper `get_go` (imported from Get_Go_Emo).
goEmo = get_go()
# Bare expression as the last line so the notebook renders the frame as this cell's output.
goEmo

Unnamed: 0,text,labels,id,clean_text,emotions,sentiment
0,My favourite food is anything I didn't have to...,[27],eebbqej,my favourite food is anything i didnt have to ...,[neutral],ambiguous
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i,"now if he does off himself, everyone will thin...",[neutral],ambiguous
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj,why the fuck is bayless isoing,[anger],negative
3,To make her feel threatened,[14],ed7ypvh,to make her feel threatened,[fear],negative
4,Dirty Southern Wankers,[3],ed0bdzj,dirty southern wankers,[annoyance],negative
...,...,...,...,...,...,...
54258,It's pretty dangerous when the state decides w...,[14],edyrazk,its pretty dangerous when the state decides wh...,[fear],negative
54259,I filed for divorce this morning. Hoping he mo...,[20],edi2z3y,i filed for divorce this morning. hoping he mo...,[optimism],positive
54260,"The last time it happened I just said, ""No"" an...",[10],eewbqtx,"the last time it happened i just said, no and ...",[disapproval],negative
54261,I can’t stand this arrogant prick he’s no bett...,[3],eefx57m,i cant stand this arrogant prick hes no better...,[annoyance],negative


In [3]:
# Load the ISEAR frame through the project helper `get_isr` (imported from Get_Isear).
isear = get_isr()
# Bare expression as the last line so the notebook renders the frame as this cell's output.
isear

Unnamed: 0,ID,CITY,COUN,SUBJ,SEX,AGE,RELI,PRAC,FOCC,MOCC,...,RELA,VERBAL,NEUTRO,EMOT_T,Field3,Field2,MYKEY,SIT,STATE,clean_text
0,11001,1,1,1,1,33,1,2,6,1,...,3,2,0,joy,4,3,110011,"During the period of falling in love, each tim...",1,during the period of falling in love each time...
1,11001,1,1,1,1,33,1,2,6,1,...,2,0,0,fear,3,2,110012,When I was involved in a traffic accident.,1,when i was involved in a traffic accident
2,11001,1,1,1,1,33,1,2,6,1,...,1,0,0,anger,1,3,110013,When I was driving home after several days of...,1,when i was driving home after several days of ...
3,11001,1,1,1,1,33,1,2,6,1,...,1,0,2,sadness,4,4,110014,When I lost the person who meant the most to me.,1,when i lost the person who meant the most to me
4,11001,1,1,1,1,33,1,2,6,1,...,2,0,0,disgust,4,4,110015,The time I knocked a deer down - the sight of ...,1,the time i knocked a deer down the sight of th...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7661,331062,1,33,62,2,21,2,1,7,7,...,2,3,0,anger,1,2,3310623,Two years back someone invited me to be the tu...,1,two years back someone invited me to be the tu...
7662,331062,1,33,62,2,21,2,1,7,7,...,0,1,1,sadness,4,3,3310624,I had taken the responsibility to do something...,1,i had taken the responsibility to do something...
7663,331062,1,33,62,2,21,2,1,7,7,...,2,0,0,disgust,1,2,3310625,I was at home and I heard a loud sound of spit...,1,i was at home and i heard a loud sound of spit...
7664,331062,1,33,62,2,21,2,1,7,7,...,0,2,0,shame,1,3,3310626,I did not do the homework that the teacher had...,1,i did not do the homework that the teacher had...


In [4]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd

def _pool_token_embeddings(hidden_states, attention_mask, use_last_token):
    """
    Reduce every layer's (batch, seq, dim) activations to one vector per example.

    Args:
        hidden_states (tuple[torch.Tensor]): One tensor per layer, as returned in
            ``outputs.hidden_states``.
        attention_mask (torch.Tensor): (batch, seq) mask, 1 for real tokens and 0 for padding.
        use_last_token (bool): If True, pool the last non-padding token of each example;
            otherwise pool the first token (position 0).

    Returns:
        list[dict]: One dict per example mapping ``'layer_{k}'`` to that layer's pooled
        embedding as a plain Python list of floats.
    """
    n_examples = attention_mask.size(0)
    rows = torch.arange(n_examples, device=attention_mask.device)

    if use_last_token:
        # Highest position whose mask is 1. Unlike `mask.sum() - 1`, this is correct for
        # both right-padded and left-padded batches.
        positions = torch.arange(attention_mask.size(1), device=attention_mask.device)
        token_idx = (attention_mask * positions).max(dim=1).values
    else:
        token_idx = torch.zeros_like(rows)

    # One device->host transfer per layer instead of one per (example, layer) pair.
    per_layer = [layer[rows, token_idx].cpu().tolist() for layer in hidden_states]

    return [
        {f'layer_{layer_idx}': vectors[i] for layer_idx, vectors in enumerate(per_layer)}
        for i in range(n_examples)
    ]


def extract_hidden_states(df, model_names, text_column='text', batch_size=8, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """
    Extracts hidden states for each text in the DataFrame using specified models.

    For every model, each text is reduced to one embedding per layer:
    the first token's hidden state when the tokenizer defines a CLS token
    (BERT-style models), otherwise the last non-padding token's hidden state
    (GPT-2 and other models without a CLS token, where the first position
    cannot summarise the sequence).

    Note:
        `df` is modified in place: one column per model name is added, and the
        same DataFrame object is returned. Each cell of that column is a dict
        ``{'layer_0': [...], 'layer_1': [...], ...}`` of Python float lists, which
        can be large for big datasets.

    Args:
        df (pd.DataFrame): Input DataFrame containing the text data.
        model_names (list): List of model names to extract hidden states from.
        text_column (str): Name of the column containing text data.
        batch_size (int): Batch size for processing (must be >= 1).
        device (str): Device to run the model on ('cuda' or 'cpu').

    Returns:
        pd.DataFrame: DataFrame with added columns for each model's hidden states.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # The text column is the same for every model, so read it once.
    texts = df[text_column].tolist()

    for model_name in model_names:
        print(f"Processing model: {model_name}")

        # Load tokenizer and handle padding token
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as PAD token for models like GPT-2

        model = AutoModel.from_pretrained(model_name, output_hidden_states=True)
        model.eval()
        model.to(device)

        # The 'gpt' name check is kept for backward compatibility; any other tokenizer
        # without a CLS token is also pooled on its last real token.
        use_last_token = (
            'gpt' in model_name.lower()
            or getattr(tokenizer, 'cls_token', None) is None
        )

        all_hidden_dicts = []

        with torch.no_grad():
            # Tokenize per batch so padding only extends to the longest text in the batch,
            # not the longest text in the whole dataset. The loop variable is deliberately
            # not named `batch_size`, so the caller's value is used for every model.
            for start in range(0, len(texts), batch_size):
                encoded = tokenizer(
                    texts[start:start + batch_size],
                    padding=True,
                    truncation=True,
                    return_tensors='pt',
                )
                input_ids_batch = encoded['input_ids'].to(device)
                attention_mask_batch = encoded['attention_mask'].to(device)

                outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch)

                all_hidden_dicts.extend(
                    _pool_token_embeddings(outputs.hidden_states, attention_mask_batch, use_last_token)
                )

        # Add hidden states as a new column
        df[model_name] = all_hidden_dicts

        # Cleanup to free memory before the next model is loaded
        del model, tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return df

In [5]:
# Example usage: swap in any other model identifiers here.
model_names = ['bert-base-uncased', 'gpt2']

# Each call adds one column per model to the frame it is given and returns that same frame.
# GoEmo first ...
goEmo_with_hidden = extract_hidden_states(df=goEmo, model_names=model_names)

# ... then ISEAR.
isear_with_hidden = extract_hidden_states(df=isear, model_names=model_names)

Processing model: bert-base-uncased


In [None]:
# Persist both frames as JSON Lines (one record per line).
json_lines_opts = {'orient': 'records', 'lines': True}
goEmo_with_hidden.to_json('goEmo_hidden_states.json', **json_lines_opts)
isear_with_hidden.to_json('isear_hidden_states.json', **json_lines_opts)

# TODO: add a series of plots for each model in `model_names`.