In [130]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch
import json
import os

import torch.nn.functional as F
import pandas as pd

In [131]:
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("CUDA version:", torch.version.cuda)
    print("GPU device:", torch.cuda.get_device_name(0))

CUDA available: True
CUDA version: 12.4
GPU device: NVIDIA GeForce GTX 1660 Ti


In [132]:
windows_base_loc = "//bc_nas_storage/BCross/"
mac_base_loc = "/Volumes/BCross/"

In [134]:
if os.path.exists(windows_base_loc):
    base_loc = windows_base_loc
    print("Using Windows PC")
elif os.path.exists(mac_base_loc):
    base_loc = mac_base_loc
    print("Using Mac")
else:
    print("Check location exists")

Using Windows PC


In [135]:
model_path = f"{base_loc}models/Qwen 2.5/Qwen2.5-0.5B-Instruct"

In [136]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [8]:
def get_conditional_probabilities(text, model, tokenizer):
    """
    Given a text, model, and tokenizer, return a dictionary with:
    - List of all conditional probabilities.
    - A dictionary mapping each token to its conditional probability.
    - A list of full probability distributions for each token.
    
    Args:
        text (str): The input text to analyze.
        model (PreTrainedModel): The language model to use for predictions.
        tokenizer (PreTrainedTokenizer): The tokenizer corresponding to the model.
    
    Returns:
        dict: A dictionary containing the conditional probabilities as a list,
              a dictionary of token probabilities, and a list of full distributions.
    """
    
    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Initialize the dictionary to store the results
    conditional_probs = {
        "conditional_probabilities": [],  # List of conditional probabilities
        "token_probabilities": {},       # Dictionary of token-to-probability mappings
        "full_distributions": []         # List of full probability distributions for each token
    }

    # Compute the probability for the first token
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # Compute the probability of the first token given no prior context
    first_token_prob = F.softmax(logits[:, 0, :], dim=-1).max().item()
    
    # Store the first token's probability (usually context-independent)
    first_token = tokenizer.decode([input_ids[0, 0]])
    conditional_probs["conditional_probabilities"].append(first_token_prob)
    conditional_probs["token_probabilities"][first_token] = first_token_prob

    # Store the full probability distribution for the first token
    full_distribution_first_token = F.softmax(logits[:, 0, :], dim=-1).squeeze().cpu().numpy()
    conditional_probs["full_distributions"].append(full_distribution_first_token)

    # Iterate through each token in the sequence starting from the second token
    for i in range(1, input_ids.size(1)):
        prefix = input_ids[:, :i]  # Context: all tokens before the current one
        next_token_id = input_ids[0, i]  # Current token to predict

        # Get logits for the context
        with torch.no_grad():
            outputs = model(prefix)
            logits = outputs.logits

        # Compute probabilities for the next token
        log_probs = F.log_softmax(logits[:, -1, :], dim=-1)  # Log probs for last step
        prob_distribution = log_probs.exp()  # Convert log probs to probabilities

        # Extract the current token's probability
        next_token_prob = prob_distribution[0, next_token_id].item()

        # Decode the full distribution for readability
        full_distribution = prob_distribution.squeeze().cpu().numpy()

        # Store the conditional probability and the full distribution
        token = tokenizer.decode([next_token_id])
        conditional_probs["conditional_probabilities"].append(next_token_prob)
        conditional_probs["token_probabilities"][token] = next_token_prob
        conditional_probs["full_distributions"].append(full_distribution)

    # Return the dictionary with both the list and the dictionary of probabilities
    return conditional_probs


In [9]:
def get_conditional_probabilities(text, model, tokenizer, 
                                   include_conditional_probabilities=True,
                                   include_token_probabilities=True,
                                   include_full_distributions=True):
    """
    Given a text, model, and tokenizer, return a dictionary with:
    - List of all conditional probabilities (optional).
    - A dictionary mapping each token to its conditional probability (optional).
    - A list of full probability distributions for each token (optional).
    
    Args:
        text (str): The input text to analyze.
        model (PreTrainedModel): The language model to use for predictions.
        tokenizer (PreTrainedTokenizer): The tokenizer corresponding to the model.
        include_conditional_probabilities (bool): Whether to include the list of conditional probabilities.
        include_token_probabilities (bool): Whether to include the dictionary of token probabilities.
        include_full_distributions (bool): Whether to include the list of full probability distributions.
    
    Returns:
        dict: A dictionary containing the conditional probabilities, token-to-probability mappings,
              and full distributions based on the flags provided.
    """
    
    # Tokenize the input text
    input_ids = tokenizer.encode(text, return_tensors="pt")

    # Initialize the dictionary to store the results
    conditional_probs = {}

    # Compute the probability for the first token
    with torch.no_grad():
        outputs = model(input_ids)
        logits = outputs.logits

    # Compute the probability of the first token given no prior context
    first_token_prob = F.softmax(logits[:, 0, :], dim=-1).max().item()
    
    # Store the first token's probability (usually context-independent)
    first_token = tokenizer.decode([input_ids[0, 0]])

    # Add components to the dictionary if requested
    if include_conditional_probabilities:
        conditional_probs["conditional_probabilities"] = [first_token_prob]
    
    if include_token_probabilities:
        conditional_probs["token_probabilities"] = {first_token: first_token_prob}

    if include_full_distributions:
        full_distribution_first_token = F.softmax(logits[:, 0, :], dim=-1).squeeze().cpu().numpy()
        conditional_probs["full_distributions"] = [full_distribution_first_token]

    # Iterate through each token in the sequence starting from the second token
    for i in range(1, input_ids.size(1)):
        prefix = input_ids[:, :i]  # Context: all tokens before the current one
        next_token_id = input_ids[0, i]  # Current token to predict

        # Get logits for the context
        with torch.no_grad():
            outputs = model(prefix)
            logits = outputs.logits

        # Compute probabilities for the next token
        log_probs = F.log_softmax(logits[:, -1, :], dim=-1)  # Log probs for last step
        prob_distribution = log_probs.exp()  # Convert log probs to probabilities

        # Extract the current token's probability
        next_token_prob = prob_distribution[0, next_token_id].item()

        # Decode the full distribution for readability
        full_distribution = prob_distribution.squeeze().cpu().numpy()

        # Store the conditional probability and the full distribution if requested
        token = tokenizer.decode([next_token_id])

        if include_conditional_probabilities:
            conditional_probs["conditional_probabilities"].append(next_token_prob)
        
        if include_token_probabilities:
            conditional_probs["token_probabilities"][token] = next_token_prob
        
        if include_full_distributions:
            conditional_probs["full_distributions"].append(full_distribution)

    # Return the dictionary with the requested components
    return conditional_probs

In [101]:
import torch.nn.functional as F
import torch

def get_conditional_probabilities(known_text, unknown_text=None, model=None, tokenizer=None, 
                                   include_conditional_probabilities=True,
                                   include_token_probabilities=True,
                                   include_full_distributions=True,
                                   return_as_probs=False):
    """
    Given a known text and an optional unknown text, return a dictionary with:
    - List of all conditional probabilities or log probabilities (optional).
    - A dictionary mapping each token to its conditional probability or log probability (optional).
    - A list of full probability or log probability distributions for each token (optional).
    
    Args:
        known_text (str): The known input text to analyze.
        unknown_text (str, optional): The unknown text to concatenate with known_text. Defaults to None.
        model (PreTrainedModel, optional): The language model to use for predictions. Defaults to None.
        tokenizer (PreTrainedTokenizer, optional): The tokenizer corresponding to the model. Defaults to None.
        include_conditional_probabilities (bool): Whether to include the list of probabilities/log-probs.
        include_token_probabilities (bool): Whether to include the dictionary of token probabilities/log-probs.
        include_full_distributions (bool): Whether to include the list of full distributions (probs/log-probs).
        return_as_probs (bool): Whether to return probabilities (True) or log-probabilities (False). Defaults to False.
    
    Returns:
        dict: A dictionary containing the selected components based on the flags provided.
    """
    # Check for GPU availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Move model to GPU
    model = model.to(device)

    # Tokenize the input text and move to GPU
    known_input_ids = tokenizer.encode(known_text, return_tensors="pt").to(device)

    # Concatenate known and unknown text if provided
    if unknown_text:
        unknown_input_ids = tokenizer.encode(unknown_text, return_tensors="pt").to(device)
        input_ids = torch.cat([known_input_ids, unknown_input_ids], dim=1)
    else:
        input_ids = known_input_ids

    # Calculate known and unknown token offsets
    known_token_count = known_input_ids.size(1)
    total_token_count = input_ids.size(1)

    # Initialize the dictionary to store the results
    conditional_probs = {}

    if include_conditional_probabilities:
        conditional_probs["conditional_probabilities"] = []
    if include_token_probabilities:
        conditional_probs["token_probabilities"] = {}
    if include_full_distributions:
        conditional_probs["full_distributions"] = []

    # Compute probabilities for all tokens
    with torch.no_grad():
        outputs = model(input_ids)
        log_probs = F.log_softmax(outputs.logits, dim=-1)  # Log probs for all tokens
        if return_as_probs:
            probs = log_probs.exp()

    # If unknown_text is provided, process only its tokens
    start_index = known_token_count if unknown_text else 0

    for i in range(start_index, total_token_count):
        next_token_id = input_ids[0, i]
        next_token_value = (probs if return_as_probs else log_probs)[0, i, next_token_id].item()
        token = tokenizer.decode([next_token_id])

        if include_conditional_probabilities:
            conditional_probs["conditional_probabilities"].append(next_token_value)
        
        if include_token_probabilities:
            conditional_probs["token_probabilities"][token] = next_token_value
        
        if include_full_distributions:
            full_distribution = (probs if return_as_probs else log_probs)[0, i].cpu().numpy()
            conditional_probs["full_distributions"].append(full_distribution)

    # Return the dictionary with the requested components
    return conditional_probs


In [74]:
from tqdm import tqdm  # For progress bar
import torch.nn.functional as F
import torch

def get_conditional_probabilities(known_text, unknown_text=None, model=None, tokenizer=None, 
                                   include_conditional_probabilities=True,
                                   include_token_probabilities=True,
                                   include_full_distributions=True,
                                   return_as_probs=False):
    """
    Given a known text and an optional unknown text, return a dictionary with:
    - List of all conditional probabilities or log probabilities (optional).
    - A dictionary mapping each token to its conditional probability or log probability (optional).
    - A list of full probability or log probability distributions for each token (optional).
    
    Args:
        known_text (str): The known input text to analyze.
        unknown_text (str, optional): The unknown text to concatenate with known_text. Defaults to None.
        model (PreTrainedModel, optional): The language model to use for predictions. Defaults to None.
        tokenizer (PreTrainedTokenizer, optional): The tokenizer corresponding to the model. Defaults to None.
        include_conditional_probabilities (bool): Whether to include the list of probabilities/log-probs.
        include_token_probabilities (bool): Whether to include the dictionary of token probabilities/log-probs.
        include_full_distributions (bool): Whether to include the list of full distributions (probs/log-probs).
        return_as_probs (bool): Whether to return probabilities (True) or log-probabilities (False). Defaults to False.
    
    Returns:
        dict: A dictionary containing the selected components based on the flags provided.
    """
    # Check for GPU availability
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Move model to GPU
    model = model.to(device)

    # Tokenize the input text and move to GPU
    known_input_ids = tokenizer.encode(known_text, return_tensors="pt").to(device)

    # Concatenate known and unknown text if provided
    if unknown_text:
        unknown_input_ids = tokenizer.encode(unknown_text, return_tensors="pt").to(device)
        input_ids = torch.cat([known_input_ids, unknown_input_ids], dim=1)
    else:
        input_ids = known_input_ids

    # Calculate known and unknown token offsets
    known_token_count = known_input_ids.size(1)
    total_token_count = input_ids.size(1)

    # Initialize the dictionary to store the results
    conditional_probs = {}

    if include_conditional_probabilities:
        conditional_probs["conditional_probabilities"] = []
    if include_token_probabilities:
        conditional_probs["token_probabilities"] = {}
    if include_full_distributions:
        conditional_probs["full_distributions"] = []

    # Compute probabilities for all tokens
    with torch.no_grad():
        outputs = model(input_ids)
        log_probs = F.log_softmax(outputs.logits, dim=-1)  # Log probs for all tokens
        if return_as_probs:
            probs = log_probs.exp()

    # If unknown_text is provided, process only its tokens
    start_index = known_token_count if unknown_text else 0

    for i in tqdm(range(start_index, total_token_count), desc="Processing tokens"):
        next_token_id = input_ids[0, i]
        next_token_value = (probs if return_as_probs else log_probs)[0, i, next_token_id].item()
        token = tokenizer.decode([next_token_id])

        if include_conditional_probabilities:
            conditional_probs["conditional_probabilities"].append(next_token_value)
        
        if include_token_probabilities:
            conditional_probs["token_probabilities"][token] = next_token_value
        
        if include_full_distributions:
            full_distribution = (probs if return_as_probs else log_probs)[0, i].cpu().numpy()
            conditional_probs["full_distributions"].append(full_distribution)

    # Return the dictionary with the requested components
    return conditional_probs


In [104]:
# Example input text
text = "The quick brown fox jumps over the lazy dog."
unknown_text = "Then it jumps over your mum."

# Call the function to get the conditional probabilities
#conditional_probs = get_conditional_probabilities(text, unknown_text, model=model, tokenizer=tokenizer)
conditional_probs = get_conditional_probabilities(text, model=model, tokenizer=tokenizer, return_as_probs=True)

Using device: cuda


In [105]:
conditional_probs

{'conditional_probabilities': [2.453970466831379e-07,
  1.0851216103446859e-07,
  1.7008306940624607e-07,
  5.5850136959634256e-06,
  1.0068577438460125e-07,
  2.7807447622762993e-05,
  3.0830833566142246e-05,
  1.399473512719851e-05,
  1.5797680816831416e-06,
  4.125164210222465e-09],
 'token_probabilities': {'The': 2.453970466831379e-07,
  ' quick': 1.0851216103446859e-07,
  ' brown': 1.7008306940624607e-07,
  ' fox': 5.5850136959634256e-06,
  ' jumps': 1.0068577438460125e-07,
  ' over': 2.7807447622762993e-05,
  ' the': 3.0830833566142246e-05,
  ' lazy': 1.399473512719851e-05,
  ' dog': 1.5797680816831416e-06,
  '.': 4.125164210222465e-09},
 'full_distributions': [array([4.2672150e-07, 7.7248828e-08, 2.3432511e-08, ..., 7.1411708e-09,
         7.1398092e-09, 7.1412258e-09], dtype=float32),
  array([1.1352657e-05, 3.8999551e-07, 2.3041113e-09, ..., 3.5843069e-12,
         3.5850797e-12, 3.5849086e-12], dtype=float32),
  array([1.8934730e-07, 9.8799694e-07, 3.2154337e-09, ..., 1.57355

In [106]:
def list_files(location, exact_name=None):
    """
    Lists all files in the specified location, optionally filtering by file type.

    Parameters:
    - location (str): The directory to search in.
    - file_type (str, optional): The file extension to filter by (e.g., ".jsonl").

    Returns:
    - list: A list of full file paths that match the file type.
    """
    # Initialize an empty list to store file paths
    file_list = []
    
    # Walk through the directory
    for root, dirs, files in os.walk(location):
        for file_name in files:
            # Match exact file name if specified
            if exact_name and file_name == exact_name:
                file_list.append(os.path.join(root, file_name))
            # If no exact_name is provided, include all files
            elif not exact_name:
                file_list.append(os.path.join(root, file_name))
    
    return file_list

In [107]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame containing the data from the JSONL file.
    """
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Parse the line as JSON
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                data.append(parsed_line[0])
            else:
                data.append(parsed_line)
    
    # Convert to a DataFrame
    data = pd.DataFrame(data)
    return data

In [108]:
dataset_base_loc = f"{base_loc}datasets/author_verification"

test_or_training = "test"

base_file_type_loc = f"{dataset_base_loc}/{test_or_training}/"

In [109]:
file_list = list_files(base_file_type_loc, "known_raw.jsonl")

In [110]:
file_list

['//bc_nas_storage/BCross/datasets/author_verification/test/ACL\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/All-the-news\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/Amazon\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/Enron\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/IMDB\\known_raw.jsonl',
 "//bc_nas_storage/BCross/datasets/author_verification/test/Koppel's Blogs\\known_raw.jsonl",
 '//bc_nas_storage/BCross/datasets/author_verification/test/Perverted Justice\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/Reddit\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/StackExchange\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/The Apricity\\known_raw.jsonl',
 '//bc_nas_storage/BCross/datasets/author_verification/test/The Telegraph\\known_raw.jsonl',
 '//bc_nas_stor

In [111]:
file_list[4]

'//bc_nas_storage/BCross/datasets/author_verification/test/IMDB\\known_raw.jsonl'

In [112]:
df = read_jsonl(file_list[4])

In [113]:
df

Unnamed: 0,doc_id,text,corpus,author,texttype
0,1000497_known,I liked this movie. I did have a nitpick thoug...,IMDB,1000497,known
1,10044932_known,"That's the story, right? Going back to the day...",IMDB,10044932,known
2,10074111_known,I was as excited as a young kid at Christmas w...,IMDB,10074111,known
3,10118021_known,"Picture it... it's a Saturday night, and there...",IMDB,10118021,known
4,10125890_known,This film is a definite 10 for me. I have alwa...,IMDB,10125890,known
...,...,...,...,...,...
950,994570_known,What starts out as an art house experiment tur...,IMDB,994570,known
951,9945874_known,I would not spend 1 minute in a room with some...,IMDB,9945874,known
952,994939_known,This film is pretty bad even by Woody Allen's ...,IMDB,994939,known
953,9975201_known,As far as sports movie go this is a great one....,IMDB,9975201,known


In [114]:
text = df.iloc[0,1]

In [115]:
text

'I liked this movie. I did have a nitpick though, I don\'t like nor see the need for the foolish russian accented english most of the actors used. Either speak Russian and subtitle it for me (not going to happen with a Hollywood movie) or speak just english. Adding a poorly done accent as if the characters spoke english with bad accents back in the day in the Motherland is not adding much in the way of atmosphere. I think the movies that have handled foreign settings best run it in the native tongue of the setting, then have a sort of magical transition into english as if the audience has just gained fluency in the language and now doesn\'t notice that it\'s listening to a movie in another language other than english. I think u571 handled the german that way. Over all the movie is fine, what big hearts those guys had who went into the reactor space to save their crew and their ship. Some gave all so that others may live, and that\'s a tearjerker and heroism no matter what nation you\'r

In [116]:
unknown_text = df.iloc[1,1]

In [117]:
unknown_text



In [120]:
results = get_conditional_probabilities(text, model=model, tokenizer=tokenizer,
                                        include_conditional_probabilities=False,
                                        include_token_probabilities=True,
                                        include_full_distributions=False)

Using device: cuda


In [121]:
results

{'token_probabilities': {'I': -15.330498695373535,
  ' liked': -15.648114204406738,
  ' this': -8.485764503479004,
  ' movie': -9.267634391784668,
  '.': -19.569072723388672,
  ' I': -9.42391300201416,
  ' did': -12.262917518615723,
  ' have': -8.230719566345215,
  ' a': -7.582947731018066,
  ' nit': -9.63819408416748,
  'pick': -15.85287857055664,
  ' though': -8.325688362121582,
  ',': -15.26042652130127,
  ' don': -15.284036636352539,
  "'t": -10.718945503234863,
  ' like': -8.241666793823242,
  ' nor': -7.776481628417969,
  ' see': -10.934805870056152,
  ' the': -7.041484832763672,
  ' need': -10.339701652526855,
  ' for': -11.636407852172852,
  ' foolish': -8.614945411682129,
  ' russian': -10.137657165527344,
  ' acc': -14.771279335021973,
  'ented': -20.407821655273438,
  ' english': -11.122166633605957,
  ' most': -7.9602179527282715,
  ' of': -11.434659004211426,
  ' actors': -9.97344970703125,
  ' used': -8.000399589538574,
  ' Either': -13.542048454284668,
  ' speak': -9.355

In [123]:
len(results['token_probabilities'])

395

In [128]:
results_2 = get_conditional_probabilities(text, unknown_text, model=model, tokenizer=tokenizer,
                                        include_conditional_probabilities=True,
                                        include_token_probabilities=True,
                                        include_full_distributions=False)

Using device: cuda


In [125]:
results_2

{'token_probabilities': {'That': -17.56633758544922,
  "'s": -14.153806686401367,
  ' the': -10.219949722290039,
  ' story': -9.122652053833008,
  ',': -14.240501403808594,
  ' right': -7.857714653015137,
  '?': -6.398604869842529,
  ' Going': -14.081364631652832,
  ' back': -11.48012924194336,
  ' to': -9.663529396057129,
  ' days': -12.426008224487305,
  ' of': -8.246127128601074,
  ' "': -3.1023805141448975,
  ' China': -5.821417808532715,
  ' Town': -10.677091598510742,
  ' with': -10.919380187988281,
  ' Jack': -9.254035949707031,
  ' Nicholson': -14.363350868225098,
  ' chasing': -10.012381553649902,
  ' lead': -9.91736125946045,
  ' after': -8.573844909667969,
  ' turning': -11.355340957641602,
  ' corners': -11.955739974975586,
  ' left': -5.499736785888672,
  ' and': -9.112051010131836,
  ' only': -8.981474876403809,
  ' find': -11.782228469848633,
  ' more': -9.090285301208496,
  '.': -18.431865692138672,
  ' Who': -12.350269317626953,
  ' dup': -14.02192497253418,
  'ing': -

In [129]:
len(results_2['conditional_probabilities'])

3086