In [1]:
import os
import tempfile
import pandas as pd
from tqdm import tqdm

In [2]:
# Allocator setting read by PyTorch from the environment.
# NOTE(review): presumably this has to be in place before CUDA is first initialised — confirm.
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")

In [3]:
def load_parquet(df: pd.DataFrame) -> "pd.DataFrame | None":
    """Round-trip ``df`` through a temporary parquet file and return the reloaded frame.

    The frame is written to ``intermediate_data.parquet`` inside a fresh
    temporary directory, read back with ``pd.read_parquet`` and the directory
    is removed again. Progress messages are printed along the way.

    Args:
        df: Frame to persist and reload.

    Returns:
        The frame read back from parquet, or ``None`` when any step raised
        (the error is printed rather than propagated).
    """
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            print(f"Created temporary directory: {temp_dir}")
            temp_file_path = os.path.join(temp_dir, 'intermediate_data.parquet')

            # Save intermediate results
            df.to_parquet(temp_file_path)
            print(f"Saved intermediate data to {temp_file_path}")

            # Load and perform further processing...
            loaded_data = pd.read_parquet(temp_file_path)
            print("Loaded intermediate data for next step.")

        # Outside the 'with' block: the directory has been cleaned up by now.
        # Returning from inside the 'with' made this check unreachable, so the
        # return is deferred until after it.
        print(f"Temporary directory {temp_dir} exists? {os.path.exists(temp_dir)}")
        return loaded_data

    except Exception as e:
        print(f"An error occurred during processing: {e}")
        # Explicit so callers can see that a failure yields None, not a frame.
        return None

In [4]:
# Read the raw dataset from a CSV path relative to the notebook's working directory.
df = pd.read_csv('../data/AI_Human.csv')

In [5]:
# Round-trip the frame through a temporary parquet file.
# NOTE: load_parquet prints the error and yields None if any step fails,
# in which case `df` is overwritten with None.
df = load_parquet(df)

Created temporary directory: /tmp/tmp0pc9gm1o
Saved intermediate data to /tmp/tmp0pc9gm1o/intermediate_data.parquet
Loaded intermediate data for next step.


In [6]:
# Class balance of the `generated` label column.
# NOTE(review): presumably 1.0 marks AI-generated text and 0.0 human-written — confirm.
df['generated'].value_counts()

generated
0.0    305797
1.0    181438
Name: count, dtype: int64

In [7]:
# Fraction of missing values in each column.
df.isna().mean()

text         0.0
generated    0.0
dtype: float64

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1. Model selection: Hugging Face repo id, a display name used in log
#    messages, and the dtype the weights are loaded in.
MODEL_CONFIG = dict(
    id="meta-llama/Llama-3.2-1B",
    name="Llama 3.2 1B",
    torch_dtype=torch.float16,
)

def load_model_and_tokenizer(config):
    """Fetch the causal LM and tokenizer described by ``config``.

    Args:
        config: Mapping with the keys ``"id"`` (passed to ``from_pretrained``),
            ``"name"`` (used only in the error message) and ``"torch_dtype"``.

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` if anything raised
        while loading (the error is printed).
    """
    model_id = config["id"]
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Reuse the EOS token for padding when the tokenizer ships without one.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # device_map="auto" delegates hardware placement (GPU/CPU) to the library.
        load_kwargs = {
            "torch_dtype": config["torch_dtype"],
            "device_map": "auto",
            "low_cpu_mem_usage": True,
        }
        model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    except Exception as e:
        print(f"Error loading {config['name']}: {e}")
        return None, None
    return model, tokenizer

def calculate_perplexity(text, model, tokenizer):
    """Calculates the perplexity of a given text using a specified model.

    Perplexity is ``exp(loss)``, where ``loss`` is what the model reports when
    the input ids are also passed as labels.

    Args:
        text: Text to score. Anything that is not a non-blank ``str``
            (``None``, NaN, empty/whitespace) is not scored.
        model: Causal language model, called as ``model(input_ids, labels=input_ids)``;
            its output must expose ``.loss``.
        tokenizer: Tokenizer, called as ``tokenizer(text, return_tensors="pt")``;
            its output must expose ``.input_ids``.

    Returns:
        The perplexity as a Python float, or ``float('inf')`` when the input
        cannot be scored or an error occurs (the error is printed).
    """
    # Explicit None checks: container-like objects can be falsy while valid.
    if model is None or tokenizer is None:
        return float('inf')
    # Guard before .strip() so a missing value in a DataFrame column does not
    # raise outside the try block below.
    if not isinstance(text, str) or not text.strip():
        return float('inf')

    try:
        # Find the device the model's weights live on so inputs can follow them
        device = next(model.parameters()).device

        # Tokenize the text
        encodings = tokenizer(text, return_tensors="pt")
        input_ids = encodings.input_ids.to(device)

        # Next-token prediction needs at least one (context, target) pair.
        if input_ids.shape[-1] < 2:
            return float('inf')

        # Calculate the loss, which is the negative log-likelihood
        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            neg_log_likelihood = outputs.loss

        # Exponentiate in float32: a half-precision loss overflows to inf once
        # the perplexity exceeds the float16 range.
        perplexity = torch.exp(neg_log_likelihood.float()).item()
        torch.cuda.empty_cache()
        return perplexity
    except Exception as e:
        print(f"Error in perplexity calculation: {e}")
        return float('inf')

In [9]:
# --- Main Execution ---
print("Loading model... This might take a moment.")
model, tokenizer = load_model_and_tokenizer(MODEL_CONFIG)

if model and tokenizer:
    print("Model loaded successfully. Calculating perplexity...")

    # Two phrasings of the same sentence: an everyday one and an ornate,
    # thesaurus-heavy paraphrase.
    human_text = "The quick brown fox jumps over the lazy dog."
    ai_text = "The rapid, cinnamon-colored vulpine creature elegantly leaps across the indolent canine."

    # Score both samples in order (plain first, ornate second).
    human_perplexity, ai_perplexity = (
        calculate_perplexity(sample, model, tokenizer)
        for sample in (human_text, ai_text)
    )

    print(
        f"\nHuman Text Perplexity: {human_perplexity:.2f}",
        f"AI Text Perplexity (Thesaurus version): {ai_perplexity:.2f}",
        sep="\n",
    )

Loading model... This might take a moment.
Model loaded successfully. Calculating perplexity...

Human Text Perplexity: 7.05
AI Text Perplexity (Thesaurus version): 417.04


In [61]:
import pickle
import glob
import re
def save_checkpoint(filename, pyobject, checkpoint_dir='../data/checkpoint'):
    """Pickle ``pyobject`` to ``<checkpoint_dir>/<filename>.pkl``.

    The directory is created when missing. The data is first written to a
    sibling ``.tmp`` file and then moved into place, so an interrupted run
    (e.g. a KeyboardInterrupt mid-write) cannot leave a truncated ``.pkl``
    behind for the next resume to pick up.

    Args:
        filename: Checkpoint name without the ``.pkl`` extension.
        pyobject: Any picklable object.
        checkpoint_dir: Target directory; defaults to the notebook's
            ``../data/checkpoint`` location.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)
    target_path = os.path.join(checkpoint_dir, f'{filename}.pkl')
    # The '.tmp' suffix keeps the partial file out of any '*.pkl' glob.
    partial_path = f'{target_path}.tmp'
    with open(partial_path, mode='wb') as pklf:
        pickle.dump(pyobject, pklf, protocol=pickle.HIGHEST_PROTOCOL)
    os.replace(partial_path, target_path)

def load_last_checkpoint(checkpoint_dir):
    """Load the ``perplexity_<n>.pkl`` checkpoint with the largest ``<n>``.

    Args:
        checkpoint_dir: Directory holding the checkpoint files. It is created
            (including parents) when it does not exist.

    Returns:
        The unpickled content of the newest checkpoint, or an empty list when
        the directory holds no ``perplexity_<n>.pkl`` file.
    """
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Pair every checkpoint with its numeric index; comparing the numbers (not
    # the names) keeps perplexity_10000 ahead of perplexity_9000.
    indexed_paths = []
    for fp in glob.glob(os.path.join(checkpoint_dir, '*.pkl')):
        matched = re.fullmatch(r'perplexity_(\d+)\.pkl', os.path.basename(fp))
        if matched:
            indexed_paths.append((int(matched.group(1)), fp))

    if not indexed_paths:
        return []

    # Open the file that was actually found in `checkpoint_dir` instead of
    # rebuilding a path from a hard-coded directory.
    _, last_checkpoint_path = max(indexed_paths)

    # NOTE: pickle.load can execute arbitrary code; only load checkpoints
    # written by this notebook.
    with open(last_checkpoint_path, mode='rb') as pklf:
        last_checkpoint_list = pickle.load(pklf)
    return last_checkpoint_list
        

In [65]:
# Start from the newest saved checkpoint when one exists, otherwise from scratch.
checkpoint_dir = os.path.realpath('../data/checkpoint')
perplexity_list = (
    load_last_checkpoint(checkpoint_dir)
    if glob.glob(f'{checkpoint_dir}/*.pkl')
    else []
)

In [66]:
# Sanity check: how many perplexity values are currently held in memory.
len(perplexity_list)

10000

In [None]:
# Resume from the newest checkpoint (if any) so an interrupted run continues
# where it stopped instead of starting over.
checkpoint_dir = os.path.realpath('../data/checkpoint')
if len(glob.glob(f'{checkpoint_dir}/*.pkl')) == 0:
    perplexity_list = []
else:
    perplexity_list = load_last_checkpoint(checkpoint_dir)

# Number of scored rows between two checkpoints.
CHECKPOINT_EVERY = 10000

# Invariant: perplexity_list[i] is the perplexity of df['text'].iloc[i], so the
# first row still to be scored is exactly len(perplexity_list).
# NOTE: earlier revisions of this cell skipped row 0 (`idx <= len(...)`), so
# checkpoints they wrote are shifted by one row and should be regenerated.
start_idx = len(perplexity_list)

# Slice off the finished rows rather than iterating and skipping them;
# `initial`/`total` keep the progress bar showing overall progress.
for idx, txt in enumerate(
    tqdm(df['text'].iloc[start_idx:], initial=start_idx, total=len(df)),
    start=start_idx,
):
    perplexity = calculate_perplexity(txt, model, tokenizer)
    perplexity_list.append(perplexity)

    # Checkpoints are named after the number of rows they contain.
    n_done = idx + 1
    if n_done % CHECKPOINT_EVERY == 0:
        print(f'Creating checkpoint_{n_done}')
        filename = f'perplexity_{n_done}'
        save_checkpoint(filename, perplexity_list)
        print(f"{filename} created")

# Persist the tail that does not fill a whole checkpoint interval, so the
# final rows are not lost once the loop completes.
if len(perplexity_list) > start_idx and len(perplexity_list) % CHECKPOINT_EVERY != 0:
    filename = f'perplexity_{len(perplexity_list)}'
    save_checkpoint(filename, perplexity_list)
    print(f"{filename} created")
    

  2%|▏         | 10002/487235 [00:00<00:04, 98940.65it/s]

10001
10002
10003
10004
10005
10006
10007
10008
10009
10010
10011
10012
10013
10014
10015
10016
10017
10018
10019
10020
10021
10022
10023
10024
10025
10026
10027
10028
10029
10030
10031
10032
10033
10034
10035
10036
10037
10038
10039
10040
10041
10042
10043
10044
10045
10046
10047
10048
10049
10050
10051
10052
10053
10054
10055
10056
10057
10058
10059
10060
10061
10062
10063
10064
10065
10066
10067
10068
10069
10070
10071
10072
10073
10074
10075
10076
10077
10078
10079
10080
10081
10082
10083
10084
10085
10086
10087
10088
10089
10090
10091
10092
10093
10094
10095
10096
10097
10098
10099
10100
10101
10102
10103
10104
10105
10106
10107
10108
10109
10110
10111
10112
10113
10114
10115
10116
10117
10118
10119
10120
10121
10122
10123
10124
10125
10126
10127
10128
10129
10130
10131
10132
10133
10134
10135
10136
10137
10138
10139
10140
10141
10142
10143
10144
10145
10146
10147
10148
10149
10150
10151
10152
10153
10154
10155
10156
10157
10158
10159
10160
10161
10162
10163
10164
10165
10166
1016

  2%|▏         | 10171/487235 [00:11<09:10, 866.63it/s]  


KeyboardInterrupt: 

In [None]:
# Show the frame; the rendered output is abbreviated to the leading and trailing rows.
df

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0
...,...,...
487230,Tie Face on Mars is really just a big misunder...,0.0
487231,The whole purpose of democracy is to create a ...,0.0
487232,I firmly believe that governments worldwide sh...,1.0
487233,I DFN't agree with this decision because a LFT...,0.0
