In [1]:
import numpy as np
import pandas as pd
import os
import requests
from time import sleep
from pathlib import Path

import torch
from transformers import GPT2Tokenizer

In [2]:
local_dir = Path(os.getcwd())
data_dir = local_dir / 'llm-detect-ai-generated-text'
external_data_dir = local_dir / 'external-data'

### Load Data

In [3]:
train_essays = pd.read_csv(data_dir / 'train_essays.csv')
train_prompts = pd.read_csv(data_dir / 'train_prompts.csv')

train_essays = train_essays.merge(train_prompts, on='prompt_id', how='left')

In [4]:
# Persuade data
persuade_data = pd.read_csv(external_data_dir / 'Persuade' / 'persuade_2.csv')
persuade_data.rename(columns={'prompt_name': 'prompt_id', 'holistic_essay_score': 'score'}, inplace=True)
persuade_data['generated'] = 0
persuade_data['dataset'] = 'persuade'
persuade_data['source'] = persuade_data['prompt_id']

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
def predict_words_gpt2(sentence):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')

    tokens = tokenizer.encode(sentence, return_tensors='pt')
    for i in range(tokens.size(1) - 1):
        input_tokens = tokens[:, :i+1]
        target_token = tokens[:, i+1].tolist()  # Convert tensor to list

        with torch.no_grad():
            outputs = model(input_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, -1])
        top_50 = torch.topk(probabilities, 50, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token[0]].item()  # Access the first element of the list

        print(f'Target word: {tokenizer.decode(target_token)}')  # No need for list brackets here
        print('Probability of actual word:', actual_word_prob)

sentence = persuade_data['text'][0]
predict_words_gpt2(sentence)

In [None]:
from transformers import BertForMaskedLM, BertTokenizer

def predict_words_bert(sentence):
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    model = BertForMaskedLM.from_pretrained('bert-base-cased')

    tokens = tokenizer.encode(sentence, return_tensors='pt')
    for i in range(tokens.size(1)):
        masked_index = i
        masked_tokens = tokens.clone()
        masked_tokens[0, masked_index] = tokenizer.mask_token_id

        with torch.no_grad():
            outputs = model(masked_tokens, labels=tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=0)
        probabilities = softmax(logits[0, masked_index])
        top_50 = torch.topk(probabilities, 50, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[tokens[0, masked_index]].item()

        print(f'Masked word: {tokenizer.decode([tokens[0, masked_index]])}')
        print('Top 50 predictions:', top_50_words)
        print('Probability of actual word:', actual_word_prob)

sentence = 'Your example sentence here.'
predict_words_bert(sentence)


In [None]:
from transformers import DebertaV2Tokenizer, DebertaV2ForMaskedLM

def predict_words_deberta(sentence):
    tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
    model = DebertaV2ForMaskedLM.from_pretrained('microsoft/deberta-v3-base')

    tokens = tokenizer.encode(sentence, return_tensors='pt')
    for i in range(1, tokens.size(1) - 1):  # Skipping the first and last tokens ([CLS] and [SEP])
        masked_tokens = tokens.clone()
        masked_tokens[0, i] = tokenizer.mask_token_id
        target_token = tokens[0, i]

        with torch.no_grad():
            outputs = model(masked_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, i])
        top_50 = torch.topk(probabilities, 50, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Top 50 predictions:', top_50_words)
        print('Probability of actual word:', actual_word_prob)

sentence = 'Your example sentence here.'
predict_words_deberta(sentence)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

def predict_words_deberta(sentence):
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    model = AutoModelForMaskedLM.from_pretrained('roberta-base')

    tokens = tokenizer.encode(sentence, return_tensors='pt')
    for i in range(1, tokens.size(1) - 1):  # Skipping the first and last tokens ([CLS] and [SEP])
        masked_tokens = tokens.clone()
        masked_tokens[0, i] = tokenizer.mask_token_id
        target_token = tokens[0, i]

        with torch.no_grad():
            outputs = model(masked_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, i])
        top_50 = torch.topk(probabilities, 10, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()
        top_word_prob = probabilities[top_50.indices[0]].item()
        choice_ratio = actual_word_prob / top_word_prob

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Probability of actual word:', actual_word_prob)
        print('Choice ratio:', choice_ratio)
        print('Top 10 predictions:', top_50_words, end='\n\n')

sentence = persuade_data['text'][0]
predict_words_deberta(sentence)

In [None]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
test = '   test '
trimmed_test = test.strip()
print(trimmed_test)

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

def predict_words_deberta(sentence):
    tokenizer = AutoTokenizer.from_pretrained('roberta-base')
    model = AutoModelForMaskedLM.from_pretrained('roberta-base')

    tokens = tokenizer.encode(sentence, return_tensors='pt')
    for i in range(1, tokens.size(1) - 1):  # Skipping the first and last tokens ([CLS] and [SEP])
        masked_tokens = tokens.clone()
        masked_tokens[0, i] = tokenizer.mask_token_id
        target_token = tokens[0, i]

        with torch.no_grad():
            outputs = model(masked_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, i])
        top_50 = torch.topk(probabilities, 10, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()
        top_word_prob = probabilities[top_50.indices[0]].item()
        choice_ratio = actual_word_prob / top_word_prob

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Probability of actual word:', actual_word_prob)
        print('Choice ratio:', choice_ratio)
        print('Top 10 predictions:', top_50_words, end='\n\n')

In [43]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')

text = train_essays['text'][0]
sentence_list = text.split('.')

In [44]:
def find_sublist(sub, larger):
    """Find the start and end index of a sublist in a larger list."""
    if not isinstance(larger, list):
        larger = larger.tolist()
        
    if not isinstance(sub, list):
        sub = sub.tolist()
        
    for i in range(len(larger)-len(sub)+1):
        if larger[i:i+len(sub)] == sub:
            return i, i+len(sub)
    return -1, -1

In [88]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import concurrent.futures

# Initialize tokenizer and model for GPT-2
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading (…)lve/main/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [89]:
text = train_essays['text'][0]  # Your text data

# Process text
sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]
section = '. '.join(sentences) + '.'

tokens = tokenizer.encode(section, return_tensors='pt')

# Define context window size
context_window_size = 128

def predict_context(i):
    # Define context window
    start_index = max(0, i - context_window_size)
    end_index = i + 1

    # Extract context tokens
    context_tokens = tokens[:, start_index:end_index]

    with torch.no_grad():
        outputs = model(context_tokens)
        logits = outputs.logits

    # Process logits
    softmax = torch.nn.Softmax(dim=-1)
    probabilities = softmax(logits[0, -1])  # Only the last token's probabilities

    # Top 10 predictions
    top_10_indices = torch.topk(probabilities, 5).indices.tolist()
    top_10_words = tokenizer.decode(top_10_indices)

    # Print information
    context_text = tokenizer.decode(context_tokens[0])
    return f'Context: {context_text}\nTop 10 predictions: {top_10_words}\n'

# Create a ThreadPoolExecutor with a specified number of threads (adjust as needed)
num_threads = 5 # You can adjust this based on your available CPU/GPU resources
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    results = list(executor.map(predict_context, range(tokens.size(1))))

# Print the results
for result in results:
    print(result)


Context: C
Top 10 predictions:  The. A
The

Context: Cars
Top 10 predictions: ., and are in

Context: Cars.
Top 10 predictions: com
orgnet The

Context: Cars. Cars
Top 10 predictions: . are, and can

Context: Cars. Cars have
Top 10 predictions:  been a to the become

Context: Cars. Cars have been
Top 10 predictions:  in banned used built on

Context: Cars. Cars have been around
Top 10 predictions:  for since in the forever

Context: Cars. Cars have been around since
Top 10 predictions:  the they 18 2000 9

Context: Cars. Cars have been around since they
Top 10 predictions:  were first started began came

Context: Cars. Cars have been around since they became
Top 10 predictions:  popular a the extinct so

Context: Cars. Cars have been around since they became famous
Top 10 predictions: ., in and for

Context: Cars. Cars have been around since they became famous in
Top 10 predictions:  the their America a Hollywood

Context: Cars. Cars have been around since they became famous in the
Top

In [45]:
sentence_list = text.split('.')
sentence_list = [sentence.strip() for sentence in sentence_list]
sentence_list = [sentence for sentence in sentence_list if sentence]

for i, sentence in enumerate(sentence_list):
    print('New Sentence!', end='\n\n')
    section_start = max(i - 2, 0)
    section_end = min(i + 1, len(sentence_list))
    section = '. '.join(sentence_list[section_start:section_end]).strip()
    if not section.endswith('. '):
        section += '.'

    # Tokenize the section
    tokens = tokenizer(section, return_tensors='pt', add_special_tokens=False)
    if i == 0:
        sentence_tokens = tokenizer.encode(sentence_list[i] + '.', add_special_tokens=False)
    else:
        sentence_tokens = tokenizer.encode(' ' + sentence_list[i] + '.', add_special_tokens=False)
    
    # Find the index of the target sentence
    sentence_start, sentence_end = find_sublist(sentence_tokens, tokens['input_ids'][0])
    
    # Mask each token from sentence_start to sentence_end
    for i in range(sentence_start, sentence_end):
        masked_tokens = tokens['input_ids'].clone()
        masked_tokens[0, i] = tokenizer.mask_token_id
        masked_tokens = masked_tokens[0, 0:i+2].unsqueeze(0)
        target_token = tokens['input_ids'][0, i]

        with torch.no_grad():
            outputs = model(masked_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, i])
        top_50 = torch.topk(probabilities, 10, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()
        top_word_prob = probabilities[top_50.indices[0]].item()
        choice_ratio = actual_word_prob / top_word_prob

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Probability of actual word:', actual_word_prob)
        print('Choice ratio:', choice_ratio)
        print('Top 10 predictions:', top_50_words, end='\n\n')

New Sentence!

Masked word: C
Probability of actual word: 1.5746464462201848e-14
Choice ratio: 1.5746475724958953e-14
Top 10 predictions: ['<s>', '</s>', '.', '<mask>', '?', ',', '!', '...', ']', ' (']

Masked word: ars
Probability of actual word: 7.012762637259584e-08
Choice ratio: 6.509847680793971e-07
Top 10 predictions: ['6', '4', '3', '5', '2', '7', '10', '8', '9', 'III']

Masked word: .
Probability of actual word: 0.007346807979047298
Choice ratio: 0.007405862983616421
Top 10 predictions: ['</s>', '.', '<s>', ']', ').', ' (', ' "', ')', ':', '."']

New Sentence!

Masked word:  Cars
Probability of actual word: 7.274016297742492e-06
Choice ratio: 4.090514278049207e-05
Top 10 predictions: ['com', '</s>', 'net', 'js', 'txt', 'org', 'xx', 'xxx', 'gov', 'jpg']

Masked word:  have
Probability of actual word: 0.03110678307712078
Choice ratio: 0.05142232917036161
Top 10 predictions: ['.', ' have', '?', '...', ':', ' Edit', ',', '!', ' and', ' […]']

Masked word:  been
Probability of actua

In [35]:
        target_token = tokens['input_ids'][0, i]

        with torch.no_grad():
            outputs = model(masked_tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, i])
        top_50 = torch.topk(probabilities, 10, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()
        top_word_prob = probabilities[top_50.indices[0]].item()
        choice_ratio = actual_word_prob / top_word_prob

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Probability of actual word:', actual_word_prob)
        print('Choice ratio:', choice_ratio)
        print('Top 10 predictions:', top_50_words, end='\n\n')

IndexError: index is out of bounds for dimension with size 0

In [36]:
masked_tokens

tensor([], size=(1, 0), dtype=torch.int64)

In [140]:
text = train_essays['text'][0]

sentence_list = text.split('.')
sentence_list = [sentence.strip() for sentence in sentence_list]
sentence_list = [sentence for sentence in sentence_list if sentence]

tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')

for i, sentence in enumerate(sentence_list):
    print('New Sentence!', end='\n\n')

    # Split the sentence into words
    words = sentence.split()

    for j, word in enumerate(words):
        # Context is the part of the sentence before the current word
        context = ' '.join(words[:j])
        masked_sentence = context + ' ' + tokenizer.mask_token if context else tokenizer.mask_token

        # Tokenize the masked sentence
        tokens = tokenizer(masked_sentence, return_tensors='pt', add_special_tokens=False)
        
        masked_index = tokens['input_ids'][0].tolist().index(tokenizer.mask_token_id)
        target_token = tokenizer.encode(word, add_special_tokens=False)[0]

        with torch.no_grad():
            outputs = model(tokens)
            logits = outputs.logits

        softmax = torch.nn.Softmax(dim=-1)
        probabilities = softmax(logits[0, masked_index])
        top_50 = torch.topk(probabilities, 10, dim=0)
        top_50_words = [tokenizer.decode([idx]) for idx in top_50.indices.tolist()]
        actual_word_prob = probabilities[target_token].item()
        top_word_prob = probabilities[top_50.indices[0]].item()
        choice_ratio = actual_word_prob / top_word_prob

        print(f'Masked word: {tokenizer.decode([target_token])}')
        print('Probability of actual word:', actual_word_prob)
        print('Choice ratio:', choice_ratio)
        print('Top 10 predictions:', top_50_words, end='\n\n')


New Sentence!



TypeError: list indices must be integers or slices, not tuple

In [None]:
# Assuming train_essays is a DataFrame with a column 'text'
text = train_essays['text'][0]

# Split and clean sentences
sentence_list = [sentence.strip() for sentence in text.split('.') if sentence.strip()]

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')

In [None]:
text = train_essays['text']

sentence_list = text.split('.')
sentence_list

for i, sentence in enumerate(sentence_list):
    section_start = i - 2
    section_end = i + 2
    if section_start < 0:
        section_start = 0
    if section_end > len(sentence_list):
        section_end = len(sentence_list)
        
    section = ''
    for j in range(section_start, section_end):
        section += sentence_list[j].strip() + '. '
    section = section.strip()
    
    sentence_num = i - section_start + 1