Creativity Development

In [14]:
import pandas as pd
import numpy as np
import torch
from scipy.spatial.distance import cosine
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer

# CSV holding the prompt/completion pairs. The final cell writes the scored
# frame back to this same path.
path_to_dataset_file = 'dataset/gpt-4/writing_prompts_train_subset.csv'

df = pd.read_csv(path_to_dataset_file)
# Sentence encoder looked up (as the global `model`) by
# compute_cosine_similarity. The perplexity cell later rebinds `model` to GPT-2.
model = SentenceTransformer('all-MiniLM-L6-v2')
# GPT-2 tokenizer used by compute_perplexity.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
# Force both text columns to plain Python strings so the metric functions
# below can call string methods on every row.
for text_column in ('prompt', 'completion'):
    df[text_column] = df[text_column].astype(str)

novelty (cosine similarity)

In [3]:
def compute_cosine_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded with the module-level ``model`` (looked up at call
    time), the resulting tensors are moved to the CPU, and SciPy's cosine
    *distance* is converted to a similarity by subtracting it from one.

    Args:
        text1: First text to embed.
        text2: Second text to embed.

    Returns:
        ``1 - cosine_distance`` of the two embeddings.
    """
    first_vec, second_vec = (
        model.encode(passage, convert_to_tensor=True).cpu()
        for passage in (text1, text2)
    )
    distance = cosine(first_vec, second_vec)
    return 1 - distance

# (Re)create the sentence encoder under the global name `model`, which
# compute_cosine_similarity resolves when it is called.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Row-wise similarity between each prompt and its completion.
df['cosine'] = df.apply(
    lambda record: compute_cosine_similarity(record['prompt'], record['completion']),
    axis=1,
)

surprise (perplexity)

In [4]:
# Causal language model used by compute_perplexity.
# NOTE: this rebinds the global `model`, which up to this point referred to the
# SentenceTransformer encoder that compute_cosine_similarity relies on.
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

def compute_perplexity(text):
    """Return the perplexity of ``text`` under the module-level GPT-2 ``model``.

    The text is tokenised with the module-level ``tokenizer`` and scored in
    consecutive, non-overlapping windows of at most ``model.config.n_positions``
    tokens, so inputs longer than the model's position limit no longer fail.
    Per-window mean losses are re-weighted by the number of predicted tokens so
    the result is the exponential of the mean negative log-likelihood over all
    scored tokens. For texts that fit in one window this matches a single
    forward pass over the whole sequence.

    Caveat: tokens at the start of every window after the first are predicted
    with less left context than a sliding window would give them, and a
    trailing window that holds a single token is skipped because it has no
    prediction target.

    Args:
        text (str): Text to score.

    Returns:
        float: Perplexity of the text, or ``nan`` when the text encodes to
        fewer than two tokens (there is nothing to predict).
    """
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    seq_len = input_ids.size(1)
    if seq_len < 2:
        # The language-model loss needs at least one (context, target) pair.
        return float('nan')

    max_len = model.config.n_positions
    nll_sum = 0.0
    predicted_tokens = 0
    with torch.no_grad():
        for start in range(0, seq_len, max_len):
            window = input_ids[:, start:start + max_len]
            targets = window.size(1) - 1
            if targets < 1:
                continue
            # output[0] is the mean loss over this window's predicted tokens.
            loss = model(window, labels=window)[0]
            nll_sum += loss.item() * targets
            predicted_tokens += targets

    # torch.exp saturates to inf instead of raising on very large losses.
    return torch.exp(torch.tensor(nll_sum / predicted_tokens)).item()

# GPT-2 perplexity of every completion, one forward pass per row.
df['ppl'] = df['completion'].map(compute_perplexity)

originality and flexibility (n-gram overlap)

In [5]:
from collections import Counter

def compute_n_gram_overlap(text1, text2, n=3):
    """Return the multiset Jaccard overlap of the word n-grams of two texts.

    Both texts are split on whitespace. N-grams are counted with
    multiplicity; the score is the size of the multiset intersection divided
    by the size of the multiset union.

    Args:
        text1 (str): First text.
        text2 (str): Second text.
        n (int): N-gram order (default 3).

    Returns:
        float: Overlap ratio, or 0.0 when either text has fewer than ``n``
        tokens or when no n-grams exist at all.
    """
    words_a, words_b = text1.split(), text2.split()
    if min(len(words_a), len(words_b)) < n:
        return 0.0

    def count_ngrams(words):
        # Zip n progressively shifted views of the token list into n-grams.
        return Counter(zip(*(words[offset:] for offset in range(n))))

    grams_a = count_ngrams(words_a)
    grams_b = count_ngrams(words_b)

    shared = sum((grams_a & grams_b).values())
    combined = sum((grams_a | grams_b).values())

    return shared / combined if combined else 0.0

# Trigram overlap between each prompt and its completion.
df['n-gram_overlap'] = df.apply(
    lambda record: compute_n_gram_overlap(record['prompt'], record['completion']),
    axis=1,
)

fluency (n-gram transition probabilities)

In [6]:
from collections import defaultdict

def compute_n_gram_transition_probs(text, n=3):
    """Map each word n-gram of ``text`` to its relative frequency.

    The text is split on whitespace and every window of ``n`` consecutive
    tokens is counted. Each count is divided by the total number of windows,
    so the returned values sum to one.

    Args:
        text (str): Text to analyse.
        n (int): N-gram order (default 3).

    Returns:
        dict: ``{ngram_tuple: relative_frequency}`` in first-seen order, or an
        empty dict when the text has fewer than ``n`` tokens.
    """
    words = text.split()
    if len(words) < n:
        return {}

    window_total = len(words) - n + 1
    occurrences = {}
    for start in range(window_total):
        gram = tuple(words[start:start + n])
        occurrences[gram] = occurrences.get(gram, 0) + 1

    return {gram: seen / window_total for gram, seen in occurrences.items()}

def average_transition_probability(text, n=3):
    """Return the mean relative frequency over the distinct n-grams of ``text``.

    The frequencies come from ``compute_n_gram_transition_probs`` and are
    normalised by the total number of n-gram windows, so their sum is one and
    this mean is effectively ``1 / number_of_distinct_n_grams``.

    Args:
        text (str): Text to analyse.
        n (int): N-gram order (default 3).

    Returns:
        The mean frequency, or ``0`` when the text yields no n-grams.
    """
    probs = compute_n_gram_transition_probs(text, n)
    if not probs:
        return 0
    return sum(probs.values()) / len(probs)

# Mean trigram relative frequency of every completion.
df['n-gram_transition'] = df['completion'].map(average_transition_probability)

elaboration (Self-BLEU)

In [15]:
import torch
from collections import Counter
from typing import List, Tuple, Dict
import numpy as np
from nltk.util import ngrams

def compute_self_bleu(text: List[str], ngram_weights: List[float] = [0.25, 0.25, 0.25, 0.25]) -> float:
    """
    Compute a self-BLEU style score for a collection of sentences.

    Every sentence is treated in turn as the hypothesis and all remaining
    sentences as its references. For each n-gram order the hypothesis counts
    are clipped by the maximum count of that n-gram in any single reference
    (BLEU's modified precision), and the per-sentence precisions are averaged
    over all sentences. The final score is the weight-normalised arithmetic
    mean of those per-order averages; no brevity penalty is applied.

    Tensors are placed on the GPU when CUDA is available and on the CPU
    otherwise.

    Args:
        text (List[str]): Sentences to compare with each other. A single
            string is also accepted and is split into sentences after
            ``.``, ``!`` or ``?`` followed by whitespace, and on line breaks;
            without this a string would be iterated character by character.
        ngram_weights (List[float]): Weights for n-gram orders 1..len(weights)
            (default: equal weights for 1-4 grams). Orders with weight 0 are
            skipped.

    Returns:
        float: Score in [0, 1] for non-negative weights; 0.0 when fewer than
        two sentences are available or all weights are zero.
    """
    import re  # local import so the function does not depend on another cell

    if isinstance(text, str):
        pieces = re.split(r'(?<=[.!?])\s+|\n+', text.strip())
        sentences = [piece for piece in pieces if piece and piece.strip()]
    else:
        sentences = list(text)

    num_sentences = len(sentences)
    if num_sentences < 2:
        # A hypothesis needs at least one reference to be compared against.
        return 0.0

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Tokenise once; every n-gram order reuses the same token lists.
    tokenized = [sentence.lower().split() for sentence in sentences]

    def count_ngrams(tokens: List[str], n: int) -> Counter:
        """Count the n-grams of one token list (empty if it is too short)."""
        return Counter(zip(*(tokens[offset:] for offset in range(n))))

    def build_count_matrix(n: int) -> torch.Tensor:
        """Return a (num_sentences, unique_ngrams) tensor of n-gram counts."""
        per_sentence = [count_ngrams(tokens, n) for tokens in tokenized]

        ngram_to_idx: Dict[tuple, int] = {}
        for counts in per_sentence:
            for gram in counts:
                ngram_to_idx.setdefault(gram, len(ngram_to_idx))

        # Fill on the CPU and transfer once; element-wise writes to a GPU
        # tensor would launch one tiny kernel per entry.
        matrix = torch.zeros((num_sentences, len(ngram_to_idx)), dtype=torch.float32)
        for row, counts in enumerate(per_sentence):
            for gram, count in counts.items():
                matrix[row, ngram_to_idx[gram]] = count
        return matrix.to(device)

    weighted_total = 0.0
    weight_sum = 0.0

    for n, weight in enumerate(ngram_weights, start=1):
        if weight == 0:
            continue

        # The order always contributes its weight, even when its precision is 0.
        weight_sum += weight

        counts = build_count_matrix(n)
        if counts.shape[1] == 0:
            # No sentence is long enough to contain an n-gram of this order.
            continue

        hypothesis_totals = counts.sum(dim=1).tolist()
        precision_sum = 0.0
        for idx in range(num_sentences):
            total = hypothesis_totals[idx]
            if total == 0:
                continue
            references = torch.cat([counts[:idx], counts[idx + 1:]], dim=0)
            # Clip by the best single reference, not by the sum over references.
            ceiling = references.max(dim=0).values
            matches = torch.minimum(counts[idx], ceiling).sum().item()
            precision_sum += matches / total

        weighted_total += weight * (precision_sum / num_sentences)

    if device.type == 'cuda':
        # Release cached blocks so repeated per-row calls do not pile up memory.
        torch.cuda.empty_cache()

    if weight_sum > 0:
        return weighted_total / weight_sum
    return 0.0


# Self-BLEU of every completion.
# NOTE(review): each completion reaches compute_self_bleu as one string, while
# that function is annotated as taking a list of sentences - confirm how a bare
# string is handled there.
df['self-bleu'] = df['completion'].map(compute_self_bleu)

elaboration (length by number of words and chars)

In [8]:
# Completion length in whitespace-separated words and in characters.
completion_text = df['completion']
df['len_word'] = completion_text.str.split().str.len()
df['len_char'] = completion_text.str.len()

In [17]:
# Persist the frame together with the metric columns added above.
# NOTE: this overwrites the CSV that was read at the top of the notebook, so a
# later run loads the previously written metric columns along with the data.
df.to_csv(path_to_dataset_file, index=False)