# Detect AI-generated text using Naive Bayes classification

Import the required libraries.

In [47]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer

In [48]:
# Load the training essays from train_essays.csv into a DataFrame.
# The bare name on the last line makes the notebook display the frame.
training_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_essays.csv')
training_data

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,0
1,005db917,0,Transportation is a large necessity in most co...,0
2,008f63e3,0,"""America's love affair with it's vehicles seem...",0
3,00940276,0,How often do you ride in a car? Do you drive a...,0
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,0
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,0
1374,ff669174,0,Limiting car usage has many advantages. Such a...,0
1375,ffa247e0,0,There's a new trend that has been developing f...,0
1376,ffc237e9,0,As we all know cars are a big part of our soci...,0


In [49]:
# Load the essay prompts from train_prompts.csv and preview the first rows.
# NOTE(review): prompt_data is not referenced by any function visible in
# this notebook excerpt - confirm whether it is still needed.
prompt_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/train_prompts.csv')
prompt_data.head()

Unnamed: 0,prompt_id,prompt_name,instructions,source_text
0,0,Car-free cities,Write an explanatory essay to inform fellow ci...,"# In German Suburb, Life Goes On Without Cars ..."
1,1,Does the electoral college work?,Write a letter to your state senator in which ...,# What Is the Electoral College? by the Office...


In [50]:
# Load the essays to be scored from test_essays.csv into a DataFrame.
# The bare name on the last line makes the notebook display the frame.
testing_data = pd.read_csv('/kaggle/input/llm-detect-ai-generated-text/test_essays.csv')
testing_data

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [57]:
### HYPER PARAMETERS ###
# Words whose per-essay occurrence counts are used as the classifier's
# features (word_probability looks up exactly these tokens, case-sensitively).
vocabulary = ['in','is','and','me','I','has','you','he','she','will','was','should','him','her','themselves','or','but','not','shall','would','it','between']
# Fallback count threshold used by get_probabilities for a word that does
# not occur at all in the essay being scored.
min_freq_threshold = 4

In [61]:
def word_probability(essay):
    """Count how often each vocabulary word occurs in ``essay``.

    The essay is tokenized with ``nltk.word_tokenize`` and the ten most
    common tokens are printed as a diagnostic.

    Args:
        essay: The essay text to analyse.

    Returns:
        A dict mapping every word in ``vocabulary`` to a one-element list
        holding the number of times that exact token appears in the essay
        (0 when it does not appear). Despite the function's name, the
        values are raw counts, not probabilities.
    """
    tokens = nltk.word_tokenize(essay)
    token_counts = nltk.FreqDist(tokens)
    print(token_counts.most_common(10))
    # Each value is a one-element list holding the count for that word.
    return {word: [token_counts[word]] for word in vocabulary}

In [53]:
def convert_essays_to_frequency(df):
    """Build a per-essay table of vocabulary word counts.

    Args:
        df: DataFrame with ``id`` and ``text`` columns. When it also has a
            ``generated`` column it is treated as training data and the
            label is carried over as an ``int``.

    Returns:
        A DataFrame with one row per essay (fresh 0..n-1 index) whose
        columns are the vocabulary words, followed by ``id`` and, for
        training data, ``generated``. An empty input yields an empty frame
        with those same columns.
    """
    training = 'generated' in df.columns

    out_columns = list(vocabulary) + ['id']
    if training:
        out_columns.append('generated')

    # Collect plain records and build the frame once: concatenating inside
    # the loop copies the whole table on every iteration and, because the
    # seed frame is empty/object-typed, degrades the count columns' dtype.
    records = []
    for _, row in df.iterrows():
        counts = word_probability(row['text'])
        # word_probability wraps each count in a one-element list.
        record = {word: values[0] for word, values in counts.items()}
        record['id'] = row['id']
        if training:
            record['generated'] = int(row['generated'])
        records.append(record)

    return pd.DataFrame(records, columns=out_columns)

In [54]:
def get_probabilities(df,columns,row_freq):
    """Score one essay against the training word-count table.

    For every feature column the essay's own count is used as a threshold
    (``min_freq_threshold`` when that count is 0). The function counts how
    many training essays, and how many generated training essays, reach
    that threshold, turns the counts into per-word ratios and combines
    them as ``(prod P(word | generated) * P(generated)) / prod P(word)``.

    Args:
        df: Training table with one column per entry of ``columns`` plus a
            ``generated`` column (1 marks a generated essay).
        columns: Names of the feature columns to use.
        row_freq: Word counts of the essay being scored, indexable by the
            names in ``columns``.

    Returns:
        The combined score as a float.
        NOTE(review): the denominator is a product of per-word marginals
        rather than a true normalising constant, so the value is not
        guaranteed to lie in [0, 1] - confirm downstream expectations.

    Raises:
        ValueError: If ``df`` has no rows (the prior would be undefined).
    """
    total_docs = len(df.index)
    if total_docs == 0:
        raise ValueError("get_probabilities requires a non-empty training frame")

    is_generated = df['generated'] == 1
    total_gen = int(is_generated.sum())
    P_llm = total_gen / total_docs

    probs = {}
    probs_llm = {}
    for col in columns:
        # The threshold depends only on the essay being scored, so compute
        # it once per column instead of once per training row.
        threshold = min_freq_threshold if row_freq[col] == 0 else row_freq[col]
        reaches = df[col] >= threshold
        probs[col] = int(reaches.sum())
        probs_llm[col] = int((reaches & is_generated).sum())

    ### Apply softening for values with 0 minimum threshold ###
    # A zero count is bumped to 1 and the matching denominator grows by one
    # for each bumped word, so no ratio below can be zero.
    for key in columns:
        if probs[key] == 0:
            probs[key] = 1
            total_docs += 1
    for key in columns:
        if probs_llm[key] == 0:
            probs_llm[key] = 1
            total_gen += 1

    P_V = 1
    P_V_L = 1
    for key in probs:
        P_V *= probs[key] / total_docs
        P_V_L *= probs_llm[key] / total_gen
    print(P_V_L,P_llm,P_V)
    return (P_V_L * P_llm) / P_V

In [55]:
def NBC():
    """Score every test essay with the Naive Bayes style classifier.

    Converts ``training_data`` and ``testing_data`` into word-count
    tables, scores each test row against the training table with
    ``get_probabilities`` and returns the result.

    Returns:
        A DataFrame with columns ``id`` and ``probability``, one row per
        test essay.
    """
    freq_train_dt = convert_essays_to_frequency(training_data)
    freq_test_dt = convert_essays_to_frequency(testing_data)

    # Feature columns are everything except the identifier and the label.
    columns = [c for c in freq_test_dt.columns if c not in ('id', 'generated')]

    # astype returns a new frame; reassign so the conversion takes effect.
    float_cols = {c: float for c in columns}
    freq_train_dt = freq_train_dt.astype(float_cols)
    freq_test_dt = freq_test_dt.astype(float_cols)

    # Rows yielded by iterrows are copies, so writing into them never
    # reaches the frame. Collect the scores and assign the column once.
    scores = [
        get_probabilities(freq_train_dt, columns, row)
        for _, row in freq_test_dt.iterrows()
    ]
    freq_test_dt['probability'] = pd.Series(
        scores, index=freq_test_dt.index, dtype=float
    )
    return freq_test_dt[['id','probability']]

In [62]:
# Run the classifier and write its id/probability table to submission.csv,
# omitting the DataFrame index column.
NBC().to_csv('submission.csv',index=False)

[(',', 40), ('the', 32), ('.', 23), ('of', 23), ('to', 21), ('a', 15), ('cars', 13), ('have', 12), ('and', 11), ('in', 8)]
[('.', 26), (',', 24), ('a', 18), ('of', 18), ('in', 16), ('the', 13), ('and', 12), ('to', 10), ('cars', 8), ('are', 6)]
[('.', 48), (',', 29), ('to', 28), ('the', 22), ('and', 20), ('is', 19), ('of', 19), ('a', 19), ('car', 18), ('in', 14)]
[(',', 38), ('.', 37), ('the', 24), ('and', 21), ('to', 20), ('a', 18), ('in', 14), ('you', 12), ('of', 12), ('car', 10)]
[('.', 47), (',', 35), ('to', 31), ('a', 25), ('the', 25), ('in', 21), ('is', 20), ('stress', 19), ('and', 17), ('that', 15)]
[('the', 41), (',', 22), ('.', 21), ('it', 15), ('vote', 14), ('and', 14), ('for', 13), ('they', 12), ('that', 11), ('of', 11)]
[('the', 44), ('.', 23), ('to', 17), (',', 16), ('of', 12), ('The', 12), ('electoral', 11), ('college', 11), ('is', 10), ('for', 10)]
[('.', 21), ('the', 20), ('of', 18), (',', 17), ('to', 16), ('in', 11), ('it', 11), ('as', 11), ('cars', 10), ('a', 9)]
[(','