In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
import json
import os

import torch.nn.functional as F
import pandas as pd

In [2]:
# Directory of the Qwen2.5-0.5B-Instruct checkpoint that the next cell hands to from_pretrained.
# NOTE(review): absolute, machine-specific path — adjust it when running on another machine.
model_path = "/Volumes/BCross/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"

In [3]:
# Load the tokenizer and the causal language model from the same checkpoint directory,
# so the token ids produced by the tokenizer match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
def get_conditional_probabilities(text, model, tokenizer):
    """
    Score every token of a text under a causal language model.

    The text is tokenized and run through the model exactly once. For a causal
    model the logits at position ``i - 1`` are the prediction for token ``i``,
    so that single pass provides P(token_i | tokens before i) for every
    ``i >= 1`` without re-running the model on each prefix.

    The first token of the encoded sequence has no preceding context inside the
    sequence, so no conditional probability can be read off the model for it.
    Its entries are reported as NaN (a NaN scalar and a NaN-filled distribution)
    to keep all outputs aligned one-to-one with the tokens.

    Args:
        text (str): The input text to analyze.
        model (PreTrainedModel): The language model to use for predictions.
        tokenizer (PreTrainedTokenizer): The tokenizer corresponding to the model.

    Returns:
        dict: A dictionary with three entries:
            - "conditional_probabilities": list of floats, one per token.
            - "token_probabilities": dict mapping each decoded token string to
              its probability. Tokens that occur more than once share a key, so
              only the probability of the last occurrence is kept.
            - "full_distributions": list of 1-D numpy arrays (one per token)
              holding the model's distribution over the vocabulary at that
              position.
        For an empty token sequence all three containers are empty.
    """

    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Initialize the dictionary to store the results
    conditional_probs = {
        "conditional_probabilities": [],  # List of conditional probabilities
        "token_probabilities": {},       # Dictionary of token-to-probability mappings
        "full_distributions": []         # List of full probability distributions for each token
    }

    # Nothing to score: calling the model on an empty sequence would fail.
    if input_ids.size(1) == 0:
        return conditional_probs

    # Keep the ids on the same device as the model when the model exposes one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # One forward pass; float32 so half-precision outputs convert to numpy cleanly.
    with torch.no_grad():
        logits = model(input_ids).logits[0].float()  # (num_tokens, vocab_size)

    # Row i of `context_logits` is the prediction for token i + 1.
    context_logits = logits[:-1]
    targets = input_ids[0, 1:].unsqueeze(1)
    target_logits = context_logits.gather(1, targets).squeeze(1)

    # p = exp(logit - logsumexp(logits)); avoids a second vocab-sized tensor
    # when only the observed tokens' probabilities are needed.
    token_probs = (target_logits - torch.logsumexp(context_logits, dim=-1)).exp()
    probabilities = [float("nan")] + token_probs.tolist()

    tokens = [tokenizer.decode([token_id]) for token_id in input_ids[0].tolist()]

    # Placeholder distribution for the context-free first token.
    undefined = torch.full((logits.size(-1),), float("nan")).numpy()
    distributions = F.softmax(context_logits, dim=-1).cpu().numpy()

    conditional_probs["conditional_probabilities"] = probabilities
    conditional_probs["token_probabilities"] = dict(zip(tokens, probabilities))
    conditional_probs["full_distributions"] = [undefined, *distributions]

    # Return the dictionary with both the list and the dictionary of probabilities
    return conditional_probs


In [5]:
def get_conditional_probabilities(text, model, tokenizer, 
                                   include_conditional_probabilities=True,
                                   include_token_probabilities=True,
                                   include_full_distributions=True):
    """
    Score every token of a text under a causal language model.

    The text is tokenized and run through the model exactly once. For a causal
    model the logits at position ``i - 1`` are the prediction for token ``i``,
    so that single pass provides P(token_i | tokens before i) for every
    ``i >= 1`` without re-running the model on each prefix.

    The first token of the encoded sequence has no preceding context inside the
    sequence, so no conditional probability can be read off the model for it.
    Its entries are reported as NaN (a NaN scalar and a NaN-filled distribution)
    to keep all outputs aligned one-to-one with the tokens.

    Args:
        text (str): The input text to analyze.
        model (PreTrainedModel): The language model to use for predictions.
        tokenizer (PreTrainedTokenizer): The tokenizer corresponding to the model.
        include_conditional_probabilities (bool): Whether to include the list of conditional probabilities.
        include_token_probabilities (bool): Whether to include the dictionary of token probabilities.
        include_full_distributions (bool): Whether to include the list of full probability distributions.

    Returns:
        dict: Contains only the entries whose flag is True:
            - "conditional_probabilities": list of floats, one per token.
            - "token_probabilities": dict mapping each decoded token string to
              its probability. Tokens that occur more than once share a key, so
              only the probability of the last occurrence is kept.
            - "full_distributions": list of 1-D numpy arrays (one per token)
              holding the model's distribution over the vocabulary at that
              position.
        For an empty token sequence the requested containers are empty.
    """

    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Create every requested container up front so the result has the same
    # keys whether or not there is anything to score.
    conditional_probs = {}
    if include_conditional_probabilities:
        conditional_probs["conditional_probabilities"] = []
    if include_token_probabilities:
        conditional_probs["token_probabilities"] = {}
    if include_full_distributions:
        conditional_probs["full_distributions"] = []

    # Nothing to score: calling the model on an empty sequence would fail.
    if input_ids.size(1) == 0:
        return conditional_probs

    # Keep the ids on the same device as the model when the model exposes one.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    # One forward pass; float32 so half-precision outputs convert to numpy cleanly.
    with torch.no_grad():
        logits = model(input_ids).logits[0].float()  # (num_tokens, vocab_size)

    # Row i of `context_logits` is the prediction for token i + 1.
    context_logits = logits[:-1]

    if include_conditional_probabilities or include_token_probabilities:
        targets = input_ids[0, 1:].unsqueeze(1)
        target_logits = context_logits.gather(1, targets).squeeze(1)
        # p = exp(logit - logsumexp(logits)); avoids materialising a second
        # vocab-sized tensor when the full distributions are not requested.
        token_probs = (target_logits - torch.logsumexp(context_logits, dim=-1)).exp()
        probabilities = [float("nan")] + token_probs.tolist()

        if include_conditional_probabilities:
            conditional_probs["conditional_probabilities"] = probabilities

        if include_token_probabilities:
            tokens = [tokenizer.decode([token_id]) for token_id in input_ids[0].tolist()]
            conditional_probs["token_probabilities"] = dict(zip(tokens, probabilities))

    if include_full_distributions:
        # Placeholder distribution for the context-free first token.
        undefined = torch.full((logits.size(-1),), float("nan")).numpy()
        distributions = F.softmax(context_logits, dim=-1).cpu().numpy()
        conditional_probs["full_distributions"] = [undefined, *distributions]

    # Return the dictionary with the requested components
    return conditional_probs

In [6]:
# Example input text (short sentence used as a quick smoke test)
text = "The quick brown fox jumps over the lazy dog."

# Call the function to get the conditional probabilities.
# No include_* flags are passed, so all three outputs are returned.
conditional_probs = get_conditional_probabilities(text, model, tokenizer)

In [7]:
def list_files(location, exact_name=None):
    """
    Recursively lists files under a directory, optionally keeping one file name.

    Parameters:
    - location (str): The directory to search in; all sub-directories are walked.
    - exact_name (str, optional): If given (and non-empty), only files whose
      name is exactly this string are returned. If omitted, every file is returned.

    Returns:
    - list: Full paths of the matching files, sorted so that the order (and
      therefore indexing such as ``result[0]``) is reproducible rather than
      dependent on the order in which the filesystem yields entries.
    """
    # A single filter covers both modes: no name given -> keep everything,
    # name given -> keep exact matches only.
    matches = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(location)
        for file_name in files
        if not exact_name or file_name == exact_name
    ]

    return sorted(matches)

In [8]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Each non-blank line is parsed as one JSON value. A line holding a
    single-element list is unwrapped so that the element itself becomes the
    record; any other value is used as the record unchanged. Blank lines
    (for example a trailing empty line at the end of the file) are skipped.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame containing the data from the JSONL file.

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # json.loads("") raises, so blank lines must be skipped rather than parsed.
            if not line:
                continue
            # Parse the line as JSON
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    # Convert to a DataFrame
    return pd.DataFrame(records)

In [9]:
# Root directory of the author-verification datasets.
# NOTE(review): absolute, machine-specific path — adjust it when running on another machine.
base_loc = "/Volumes/BCross/datasets/author_verification"

# Name of the sub-directory (data split) to read from.
test_or_training = "test"

# Directory that is searched for data files: <base_loc>/<split>/
base_file_type_loc = f"{base_loc}/{test_or_training}/"

In [10]:
# Collect every file named exactly "known_raw.jsonl" anywhere beneath the split directory.
file_list = list_files(base_file_type_loc, "known_raw.jsonl")

In [11]:
# Show the first path in the list (the file that is loaded next).
file_list[0]

'/Volumes/BCross/datasets/author_verification/test/StackExchange/known_raw.jsonl'

In [12]:
# Load the first matching JSONL file into a DataFrame.
df = read_jsonl(file_list[0])

In [13]:
# Display the loaded frame to inspect its columns and rows.
df

Unnamed: 0,doc_id,text,corpus,author,texttype
0,known [271958 stats] [ 7.39 kb].txt,Your classifier gives you a probability for ea...,StackExchange,271958,known
1,known [2736153 stats] [ 17.67 kb].txt,Moving average is what you get when you are UR...,StackExchange,2736153,known
2,known [2852150 stats] [ 12.08 kb].txt,Both logit and probit models provide statistic...,StackExchange,2852150,known
3,known [2875509 stats] [ 10.69 kb].txt,This is the extent of the knowledge I am famil...,StackExchange,2875509,known
4,known [298433 stats] [ 19.97 kb].txt,URL has some good free online tutorials for mu...,StackExchange,298433,known
...,...,...,...,...,...
109,known [9437855 stats] [ 19.9 kb].txt,The problem would be simple when the dataset d...,StackExchange,9437855,known
110,known [9466756 stats] [ 11.95 kb].txt,1) If we want to make an assumption about the ...,StackExchange,9466756,known
111,known [947356 stats] [ 3.05 kb].txt,As already pointed out by HANDLE at the end of...,StackExchange,947356,known
112,known [9722956 stats] [ 19.93 kb].txt,"You are talking about Bayesian analysis, not B...",StackExchange,9722956,known


In [14]:
# Take the value at row 0, column position 1 and use it as the text to score.
# NOTE(review): positional index — presumably the "text" column per the frame shown above;
# verify against the frame's column order. This also rebinds `text` from the earlier example cell.
text = df.iloc[0,1]

In [15]:
# Display the selected document text.
text

"Your classifier gives you a probability for each class. as a result. Hence, in a classification problem with MATH accuracy. I am currently developing a mathematical symbol classifier (see URL for my bachelors thesis in computer science. I do this by VALUE -fold cross-validation: I am now wondering to how many digits I should publish the accuracy. This is one form of transfer learning. So you can transfer some of the knowledge obtained from dataset MATH. See my URL for this and more terms explained in very few words. LDA is a dimensionality reduction method, not a classifier. In SKlearn, CODE seems to be a naive bayes classifier after LDA, see docs. One quality indicator for a clustering is the silhouette coefficient: Get a distance metric MATH for two objects in your space. For example, the euclidean distance. Let MATH be the average distance of o to the second-closest cluster: MATH s You want this value to be as big as possible. Everything below 0 is bad. To answer your questions, I 

In [16]:
# Score the selected document, keeping only the list of per-token probabilities;
# the token-to-probability dict and the per-token full distributions are switched off.
results = get_conditional_probabilities(text, model, tokenizer,
                                        include_conditional_probabilities=True,
                                        include_token_probabilities=False,
                                        include_full_distributions=False)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f82f9ac3fd0>>
Traceback (most recent call last):
  File "/Users/user/my_env/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# Display the dictionary returned by the scoring call.
results