In [1]:
import random
from prediction_analysis import *
from getting_examples import get_activation_data_for_feature
import concurrent.futures

ModuleNotFoundError: No module named 'prediction_analysis'

In [79]:
import numpy as np
from openai import OpenAI
import re
import pprint
from getting_examples import get_activation_data_for_feature

def find_first_number(text):
    # Return the first number in a string
    match = re.search(r'\b\d+(\.\d+)?', text)
    return float(match.group(0)) if match else None

client = OpenAI()

def predict_activations(feature_index, test_number=20, show_examples=0):
    # Get and parse JSON data from the url corresponding to the requested feature
    url = f"https://www.neuronpedia.org/api/feature/gpt2-small/9-res-jb/{feature_index}"
    data = get_activation_data_for_feature(url)
    explanation = data['explanations'][0]['description']

    assert (len(data['examples']) >= (test_number + show_examples))

    # Randomly select some sentences to use as examples and test data
    random_indices = np.random.choice(len(data['examples']), size=test_number + show_examples, replace=False)
    sentences = [
        {
            'sentence_string': ''.join(data['examples'][i]['tokens']), 
            'activation':  data['examples'][i]['maxValue'], 
            'max_index': data['examples'][i]['maxValueTokenIndex'],
            'max_token': data['examples'][i]['tokens'][data['examples'][i]['maxValueTokenIndex']]
         } 
        for i in random_indices]
    example_sentences = sentences[:show_examples]
    test_sentences = sentences[show_examples:] 

    highest_activation = data['examples'][0]['maxValue']

    # Create a system prompt dependning on how many example sentences are provided
    # system_prompt = f'You are evaluating an english description of an autoencoder feature. The description should correspond to sentences which result in high activation. The english description of the feature is: "{explanation}"\n'
    system_prompt = f'You are evaluating an english description of an autoencoder feature. The description should correspond to sentences which result in high activation. The english description of the feature is: "{explanation}"\n'

    if show_examples:
        system_prompt += f'Here are {show_examples} examples of sentences and their corresponding activations:\n '
        for sentence in example_sentences:
            sentence_string = sentence['sentence_string']
            activation = sentence['activation']
            system_prompt += f'Example: "{sentence_string}" had an activation of {activation:.2f} on token "{sentence["max_token"]}"\n'
        system_prompt += 'Use the provided samples and the provided description to predict the activation on a new sentence.'

    else:
        system_prompt += f'The value of the highest activation on the dataset is {highest_activation:.2f}. You must predict the activation on a new sentence based off of the provided description. If the description matches the provided sentence, the activation may be closer to {highest_activation:.2f}, while if it does not match the activation will be nearly 0.'

    system_prompt += '\nYou MUST respond with ONLY a number and NO OTHER content.'

    predictions = []

    # Have the model predict activations on each test sentence
    for sentence in test_sentences:
        sentence_string = sentence['sentence_string']
        user_message = f'Please predict the activation on this sentence, responding with a number between 0 and {highest_activation:.2f}.\n\nSentence: "{sentence_string}" Remember, the english description of the feature is: "{explanation}" and the most common words that resulted in a high activation were {", ".join(["\"" + s["max_token"] + "\"" for s in example_sentences])}'
        print(user_message)
        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message}
            ]
        )
        
        predicted = find_first_number(completion.choices[0].message.content)
        # (true, pred)
        predictions.append((sentence['activation'], predicted))

    pprint.pprint(system_prompt)
    # pprint.pprint(test_sentences)

    for i in range(len(predictions)):
        print(predictions[i], test_sentences[i]['sentence_string'])

    return predictions



In [90]:
def get_predictions(feature_num):
    # activation_data = get_activation_data_for_feature(f"https://www.neuronpedia.org/api/feature/gpt2-small/9-res-jb/{feature_num}")
    predictions = predict_activations(feature_num, test_number=10, show_examples=8)
    return predictions

def run():
    data = get_predictions(991) #806
    # for i in range(len(all_data)):
        # data = all_data[i]
        # print(feature_nums[i])
    print()
    pprint.pprint(data)
    custom = custom_accuracy(data)

    print(custom)

    # custom = [custom_accuracy(data, eps = 0.1) for data in all_data]

for _ in range(1):
    run()

Please predict the activation on this sentence, responding with a number between 0 and 7.48.

Sentence: " the words "I'll set this plane on fire" and a seat number.Ċ" Remember, the english description of the feature is: "words related to technical processes or actions" and the most common words that resulted in a high activation were " on", " and", " If", " is", " a", "'s", " knowledge", " to"
Please predict the activation on this sentence, responding with a number between 0 and 7.48.

Sentence: "," Watts said.ĊĊRubio's disclosure sheds new light on his comments in" Remember, the english description of the feature is: "words related to technical processes or actions" and the most common words that resulted in a high activation were " on", " and", " If", " is", " a", "'s", " knowledge", " to"
Please predict the activation on this sentence, responding with a number between 0 and 7.48.

Sentence: ".ĊĊRubio's disclosure sheds new light on his comments in October, when" Remember, the englis

In [73]:
import matplotlib.pyplot as plt

### Losses
def mse(data, normalize = False):
    values = ([((elem[0]-elem[1])/(elem[0] if normalize else 1))**2 for elem in data])
    return sum(values)/len(values)

def nll_variant(data, eps = 1e-1):
    values = ([np.log((min(elem) + eps)/(max(elem) + eps)) for elem in data])
    return -sum(values)/len(values)

def l1(data, normalize = True, eps = 0.1):
    values = ([((eps + abs(elem[0]-elem[1]))/((max(elem) if normalize else 1) + eps))  for elem in data])
    return sum(values)/len(values)

### Plots
def plot_mses_cdf(mses):
    # Plotting the Mean Squared Errors (MSE) for each dataset
    mses_sorted = np.sort(mses)
    cdf = np.arange(1, len(mses_sorted)+1) / len(mses_sorted)
    plt.plot(mses_sorted, cdf)
    plt.title('Cumulative Distribution Function of MSEs')
    plt.xlabel('MSE')
    plt.ylabel('CDF')
    plt.grid(True)
    plt.show()

def plot_probability_distribution(data, bins='auto', density=True, title = "Default Title"):
    """
    Plots the probability distribution of the given data using a histogram.

    Parameters:
    - data (list or numpy array): The floating point numbers whose distribution you want to plot.
    - bins (int, sequence or str, optional): The method for calculating histogram bins. Default is 'auto'.
    - density (bool, optional): If True, the histogram is normalized to form a probability density,
                                i.e., the area under the histogram will sum to 1. Default is True.
    """
    # Calculate the histogram
    counts, bin_edges = np.histogram(data, bins=bins, density=density)

    # Calculate bin centers
    bin_centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])

    # Plotting the histogram
    plt.figure(figsize=(8, 6))
    plt.bar(bin_centers, counts*np.diff(bin_edges), align='center', width=np.diff(bin_edges), edgecolor='black', alpha=0.7)
    plt.xlabel('Value')
    plt.ylabel('Probability Density')
    plt.title('Probability Distribution of Data')
    plt.title(title)
    plt.grid(True)
    plt.show()

def analyze_data(all_data):
    mses = [mse(data, normalize = False) for data in all_data]
    nlls = [nll_variant(data) for data in all_data]
    l1s = [l1(data, normalize = True) for data in all_data]

    print('l1s', sorted(l1s))
    plot_probability_distribution(mses, title = "Distribution of MSEs")
    plot_probability_distribution(nlls, title = "Distribution of NLL variant")
    plot_probability_distribution(l1s, title = "Distribution of l1s variant")

In [74]:
feature_nums = [806]#random.sample(range(0, 1000), 10)

def get_predictions(feature_num):
    # activation_data = get_activation_data_for_feature(f"https://www.neuronpedia.org/api/feature/gpt2-small/9-res-jb/{feature_num}")
    predictions = predict_activations(feature_num, test_number=10, show_examples=8)
    return predictions


with concurrent.futures.ThreadPoolExecutor() as executor:
    all_data = list(executor.map(get_predictions, feature_nums))

mses = [mse(data, normalize = False) for data in all_data]
nlls = [nll_variant(data) for data in all_data]
l1s = [l1(data, normalize = True) for data in all_data]

# print('l1s', sorted(l1s))
plot_probability_distribution(mses, title = "Distribution of MSEs")
plot_probability_distribution(nlls, title = "Distribution of NLL variant")
plot_probability_distribution(l1s, title = "Distribution of l1s variant")

NameError: name 'concurrent' is not defined

In [None]:
def custom_accuracy(data):
    eps = max([elem[0] for elem in data]) / 10
    values = []
    for elem in data:
        true, pred = elem
        ## Add eps to avoid zero case
        true, pred = true + eps, pred + eps
        # Scale values
        true, pred = true ** 0.75, pred ** 0.75
        # Calculate difference
        difference = abs(true - pred)
        # Take ratio
        error = difference / max(true, pred)
        
        accuracy = 1 - error
        values.append(accuracy)
    return sum(values)/len(values)

In [None]:
feature_nums

NameError: name 'feature_nums' is not defined

In [None]:
def run():
    data = get_predictions(806)
    # for i in range(len(all_data)):
        # data = all_data[i]
        # print(feature_nums[i])
    pprint.pprint(data)
    custom = custom_accuracy(data)
    print(custom)

    # custom = [custom_accuracy(data, eps = 0.1) for data in all_data]

for _ in range(1):
    run()

# plot_probability_distribution(custom, title = "Distribution of custom accuracy")

('You are evaluating an english description of an autoencoder feature. The '
 'description should correspond to sentences which result in high activation. '
 'The english description of the feature is: " past tense verbs"\n'
 'Here are 8 examples of sentences and their corresponding activations:\n'
 ' Example: " economy\'s cooled off enough, but it wasn\'t always so. Back in '
 'the mid", Activation: 19.96\n'
 'Example: " NL<|endoftext|>," Watts said.ĊĊRubio\'s disclosure sheds new '
 'light on his", Activation: 0.00\n'
 'Example: " in their NL<|endoftext|>," Watts said.ĊĊRubio\'s disclosure sheds '
 'new light", Activation: 0.00\n'
 'Example: "ĊĊRubio\'s disclosure sheds new light on his comments in October, '
 'when he", Activation: 0.00\n'
 'Example: " be sure to add a great feel and glitz to any game. These '
 'wonderful futuristic", Activation: 0.00\n'
 'Example: " their NL<|endoftext|>," Watts said.ĊĊRubio\'s disclosure sheds '
 'new light on", Activation: 0.00\n'
 'Example: " of

In [None]:
analyze_data(all_data)

TypeError: 'int' object is not subscriptable