In [1]:
# Install required libraries
!pip install --upgrade pypdf2
!pip install transformers==4.23.0

Collecting pypdf2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1
Collecting transformers==4.23.0
  Downloading transformers-4.23.0-py3-none-any.whl (5.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.3/5.3 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.23.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19

In [2]:
import PyPDF2 as pdf
import numpy as np
import torch
import pandas as pd
from google.colab import files
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Step 2: Function to read the text of a PDF file
def upload_and_read_pdf(path=None):
    """Extract and concatenate the text of every page of a PDF file.

    Despite its name, this function uploads nothing: it only opens a file
    that is already on disk and reads it with ``pdf.PdfReader``.

    Args:
        path: Path of the PDF file to read. When omitted, the function falls
            back to the module-level name ``files``; this keeps the original
            zero-argument call style working, where the caller's loop variable
            is named ``files``. New code should pass ``path`` explicitly.

    Returns:
        str: The text of all pages concatenated in page order. A page whose
        ``extract_text()`` result is falsy (``None`` or ``""``) adds nothing.

    Raises:
        TypeError: If neither ``path`` nor the global ``files`` is something
            ``open`` can use as a path (for example when ``files`` is still
            the ``google.colab.files`` module).
        OSError: Propagated from ``open`` when the file cannot be opened.
    """
    if path is None:
        # Legacy behaviour: the path is taken from the global name `files`.
        path = globals().get("files")

    is_path_like = isinstance(path, (str, bytes, int)) or hasattr(path, "__fspath__")
    if not is_path_like:
        raise TypeError(
            "upload_and_read_pdf() needs a file path; got "
            f"{type(path).__name__!r}. Pass the PDF path explicitly."
        )

    with open(path, "rb") as f:
        reader = pdf.PdfReader(f)
        # Join once instead of growing a string page by page.
        return "".join(page.extract_text() or "" for page in reader.pages)

# Step 3: Calculate Perplexity
_GPT2_CACHE = {}


def _load_gpt2(name='gpt2'):
    """Return a cached ``(tokenizer, model)`` pair for the given checkpoint name."""
    if name not in _GPT2_CACHE:
        _GPT2_CACHE[name] = (
            GPT2Tokenizer.from_pretrained(name),
            GPT2LMHeadModel.from_pretrained(name),
        )
    return _GPT2_CACHE[name]


def calculate_perplexity(text):
    """Compute a sliding-window GPT-2 perplexity for ``text``.

    The token sequence is scored in steps of ``stride`` (512) tokens. Each
    step feeds the model at most ``max_length`` (1024) tokens, but only the
    tokens of the current step are used as targets; the earlier tokens in the
    window have their labels set to -100 and act as context only.

    The per-window value is ``outputs[0] * trg_len`` (the model's loss scaled
    by the number of target tokens). The perplexity is ``exp`` of the sum of
    those values divided by the total token count. This arithmetic is kept
    exactly as before so results stay comparable with previously computed
    values.

    Args:
        text: The text to score.

    Returns:
        float: The perplexity, or ``nan`` when ``text`` yields no tokens.
    """
    # Loading the checkpoint is independent of `text`, so do it only once
    # instead of on every call.
    tokenizer, model = _load_gpt2('gpt2')

    tokens = tokenizer.encode(text, return_tensors='pt')
    if tokens.numel() == 0:
        # Nothing to score; previously this crashed on an unbound `end_loc`.
        return float('nan')

    seq_len = tokens.size(1)
    max_length = 1024
    stride = 512
    nlls = []

    for i in range(0, seq_len, stride):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i  # may be different from stride on last loop
        input_ids = tokens[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # Mask the context tokens so only the last trg_len positions count.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        nlls.append(outputs[0] * trg_len)

    # The last window always ends at seq_len, so this equals the old `end_loc`.
    perplexity = torch.exp(torch.stack(nlls).sum() / seq_len)
    return perplexity.item()

# Step 4: Analyze Burstiness
def calculate_burstiness(text):
    """Return the coefficient of variation of sentence lengths in ``text``.

    Sentences are the fragments obtained by splitting ``text`` on ``'.'``;
    the length of a sentence is its number of whitespace-separated words.
    Burstiness is ``std / mean`` of those lengths, using NumPy's default
    (population) standard deviation.

    Fragments that are empty or contain only whitespace (for example the
    newline after the final period) are ignored, so they do not count as
    zero-word sentences.

    Args:
        text: The text to analyse.

    Returns:
        float: ``std / mean`` of the sentence lengths, or ``0.0`` when the
        text contains no non-blank sentence.
    """
    sentence_lengths = [
        len(sentence.split())
        for sentence in text.split('.')
        if sentence.strip()
    ]
    if not sentence_lengths:
        # No sentences at all: avoid np.mean([]) (NaN plus a RuntimeWarning).
        return 0.0

    mean_length = np.mean(sentence_lengths)
    std_dev = np.std(sentence_lengths)

    # Every kept fragment has at least one word, so mean_length is positive;
    # the guard is retained as a cheap safety net.
    return std_dev / mean_length if mean_length else 0.0

In [4]:
# Step 1: Upload the PDF files through the Colab upload widget
uploaded = files.upload()

# The keys of the returned mapping are the names of the uploaded files;
# keep both the key view and a plain list of them.
filenames = uploaded.keys()
lista_texts = [*filenames]
lista_texts

Saving True_text_1.pdf to True_text_1.pdf
Saving True_text_2.pdf to True_text_2.pdf
Saving True_text_3.pdf to True_text_3.pdf
Saving True_text_4.pdf to True_text_4.pdf
Saving True_text_5.pdf to True_text_5.pdf
Saving True_text_6.pdf to True_text_6.pdf
Saving True_text_7.pdf to True_text_7.pdf
Saving True_text_8.pdf to True_text_8.pdf
Saving True_text_9.pdf to True_text_9.pdf
Saving True_text_10.pdf to True_text_10.pdf


['True_text_1.pdf',
 'True_text_2.pdf',
 'True_text_3.pdf',
 'True_text_4.pdf',
 'True_text_5.pdf',
 'True_text_6.pdf',
 'True_text_7.pdf',
 'True_text_8.pdf',
 'True_text_9.pdf',
 'True_text_10.pdf']

In [6]:
# Main execution: compute the per-document features and collect them in a table
if __name__ == "__main__":
    lista_perplexity = []
    lista_burstiness = []
    lista_token_length = []
    lista_word_counts = []
    lista_character_counts = []
    lista_sentence_count = []
    lista_mean_length = []
    lista_std_sentence_length = []

    # The tokenizer does not depend on the document, so load it once instead
    # of once per file.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # upload_and_read_pdf() is called without arguments and picks the path up
    # from the global name `files`, so the loop variable must be named `files`.
    # That rebinding hides the google.colab `files` module, so remember the
    # current binding and restore it once the loop is done (even on error).
    _previous_files = files
    try:
        for files in lista_texts:
            text = upload_and_read_pdf()

            perplexity = calculate_perplexity(text)
            lista_perplexity.append(perplexity)

            burstiness = calculate_burstiness(text)
            lista_burstiness.append(burstiness)

            token_length = len(tokenizer.encode(text))
            lista_token_length.append(token_length)

            word_counts = len(text.split())
            lista_word_counts.append(word_counts)

            character_counts = len(text)
            lista_character_counts.append(character_counts)

            # Sentence statistics: split on '.', and ignore fragments that are
            # empty or whitespace-only so that the count, mean and standard
            # deviation all describe the same set of sentences.
            sentences = text.split('.')
            sentence_lengths = [len(s.split()) for s in sentences if s.strip()]
            sentence_count = len(sentence_lengths)
            lista_sentence_count.append(sentence_count)

            if sentence_lengths:
                mean_length = np.mean(sentence_lengths)
                std_dev = np.std(sentence_lengths)
            else:
                # No sentences: avoid np.mean([]) (NaN plus a RuntimeWarning).
                mean_length = 0.0
                std_dev = 0.0
            lista_mean_length.append(mean_length)
            lista_std_sentence_length.append(std_dev)
    finally:
        files = _previous_files


# One row per uploaded file. 'AI Metric' is a constant 0 placeholder label.
df_texts = pd.DataFrame({'text': lista_texts, 'Token': lista_token_length, 'word counts':  lista_word_counts,
                         'character counts': lista_character_counts, 'sentence count': lista_sentence_count,
                         'std sentence length': lista_std_sentence_length, 'mean sentence length': lista_mean_length,
                         'Perplexity': lista_perplexity, 'Burstiness': lista_burstiness, 'AI Metric': 0})
df_texts

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1102 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1102 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,text,Token,word counts,character counts,sentence count,std sentence length,mean sentence length,Perplexity,Burstiness,AI Metric
0,True_text_1.pdf,294,196,1390,10,9.112629,19.6,45.224319,0.46493,0
1,True_text_2.pdf,812,535,3700,22,9.744674,24.363636,37.010246,0.399968,0
2,True_text_3.pdf,622,409,2724,17,11.465912,24.058824,33.915508,0.476578,0
3,True_text_4.pdf,563,359,2561,13,16.107994,27.615385,27.732531,0.583298,0
4,True_text_5.pdf,1102,709,5116,26,12.227201,27.269231,30.075304,0.448388,0
5,True_text_6.pdf,744,448,3133,20,10.165997,22.45,30.426464,0.452828,0
6,True_text_7.pdf,653,398,2750,19,14.525744,20.947368,32.598499,0.69344,0
7,True_text_8.pdf,561,367,2631,15,8.293505,24.466667,35.987747,0.338972,0
8,True_text_9.pdf,911,607,4123,25,10.813029,24.28,23.746851,0.445347,0
9,True_text_10.pdf,722,472,3131,22,9.78741,21.454545,29.901558,0.456193,0


In [7]:
# Step 5: Upload the model file
# The main-execution cell reuses `files` as a loop variable, so import the
# Colab module again here to be sure the name refers to it.
from google.colab import files
# Opens the upload widget; this rebinds `uploaded`, replacing the mapping that
# was stored when the PDF files were uploaded.
uploaded = files.upload()

Saving rf_model.pkl to rf_model.pkl


In [8]:
# Load the uploaded classifier from "rf_model.pkl".
import pickle

# For reference, the file was written with the following (kept disabled here):
# with open("rf_model.pkl", "wb") as f:
#     pickle.dump(rf_model, f)

# NOTE(security): pickle.load can execute arbitrary code contained in the
# file. Only load a model file that comes from a trusted source.
with open("rf_model.pkl", "rb") as f:
    best_rf = pickle.load(f)

In [9]:
# Feature matrix: the four columns handed to the model, in this exact order.
feature_columns = ['word counts', 'Token', 'Perplexity', 'character counts']
X = df_texts.loc[:, feature_columns]

# Label column; it holds the constant 0 placeholder set when df_texts was built.
y = df_texts.loc[:, 'AI Metric']

In [10]:
# Step 6: Predict if the text is AI-generated or human-written
Text_predicted = best_rf.predict(X)

# Put the label column next to the model's prediction for each document.
comparison_columns = {'AI_text': y, 'AI Predicted': Text_predicted}
result = pd.DataFrame(comparison_columns)
result

Unnamed: 0,AI_text,AI Predicted
0,0,1
1,0,0
2,0,0
3,0,1
4,0,0
5,0,1
6,0,0
7,0,0
8,0,0
9,0,1
