In [15]:
import pandas as pd
import complex
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

pd.set_option('display.max_rows', 5000000)

# Training corpus. Earlier iterations also read 'train_v2_drct_02.csv' and
# 'megaset.csv' into `dataset`, but both results were overwritten before being
# used, so only this load ever had an effect.
dataset = pd.read_csv('metadataset.csv')

# dataset columns: 'text' (string), 'generated' (int; 0 for human, 1 for AI), 'complexity' (float)
# dataset is sorted in ascending order according to the complexity

human = dataset[dataset['generated'] == 0]
ai = dataset[dataset['generated'] == 1]
print("AI essay count: ", len(ai))
print("Human essay count: ", len(human))

# verbs_dic2: the verb list (only referenced by a disabled verbs-only filter).
# verbs_dic: despite its name it is loaded from the general word list (judging by
# the file name); a previous load from 'top_english_verbs_lower_100000.txt' was
# overwritten by this one before use and has been dropped.
verbs_dic2 = complex.read_file_to_dic('vocabulary/top_english_verbs_lower_10000.txt')
verbs_dic = complex.read_file_to_dic('vocabulary/top_english_words_lower_1000000.txt')


# Silence pandas' chained-assignment warning for the rest of the notebook
pd.options.mode.chained_assignment = None  # default='warn'

# Number of complexity quantile groups; used for both the binning and the split below
N_GROUPS = 7

# Divide the whole dataset into N_GROUPS equally populated groups based on complexity
dataset['group'], total_bins = pd.qcut(dataset['complexity'], N_GROUPS, labels=False, retbins=True)

# Create a list of DataFrames for the essays, one DataFrame per group for the entire dataset
divided_dataset = [dataset.loc[dataset['group'] == i, ['text', 'generated']] for i in range(N_GROUPS)]

# (lower, upper) complexity bounds of each group, taken from consecutive bin edges
bounds = list(zip(total_bins[:-1], total_bins[1:]))

print("Bounds for the entire dataset: ", bounds)

# Number of essays in each group for the entire dataset
print("Essay counts in each group for the entire dataset: ", [len(df) for df in divided_dataset])


AI essay count:  21387
Human essay count:  30928
Bounds for the entire dataset:  [(0.0, 7089.0), (7089.0, 8595.5), (8595.5, 9620.0), (9620.0, 10309.714285714283), (10309.714285714283, 11432.0), (11432.0, 12624.0), (12624.0, 28779.5)]
Essay counts in each group for the entire dataset:  [7672, 7277, 7964, 6981, 7739, 7209, 7473]


In [16]:
def vocabulary_percentage(vocabulary_list):
    """
    Compute the relative frequency of every distinct word in a word list.

    :param vocabulary_list: A list of words (repetitions allowed).
    :return: A dictionary mapping each distinct word to its share of the list,
             expressed as a fraction (occurrences / total number of words), in
             order of first appearance. An empty list yields an empty dictionary.
    """
    total_words = len(vocabulary_list)

    # Tally how many times each word occurs
    tally = {}
    for token in vocabulary_list:
        tally[token] = tally.get(token, 0) + 1

    # Turn the raw counts into shares of the whole list
    shares = {}
    for token, occurrences in tally.items():
        shares[token] = occurrences / total_words
    return shares

def convert_df_to_dics(df):
    """
    Build word-prevalence dictionaries for the human and the AI essays of a DataFrame.

    :param df: DataFrame with a 'text' column and a 'generated' column
               (rows with generated == 1 are treated as AI, generated == 0 as human).
    :return: A tuple (human_dic, ai_dic); each is the result of
             vocabulary_percentage over the cleaned words of the matching essays.
    """
    def _collect_words(label):
        # Concatenate the cleaned words of every essay carrying this label
        collected = []
        for essay in df[df['generated'] == label]['text']:
            collected += complex.clean_text(essay)
        return collected

    ai_tokens = _collect_words(1)
    human_tokens = _collect_words(0)

    # A verbs-only variant (keeping just the keys present in verbs_dic2) was
    # tried at this point and is currently disabled.
    return vocabulary_percentage(human_tokens), vocabulary_percentage(ai_tokens)


def get_prevalence_table(df, human_only_factor=-25):
    """
    Build a table with a signed prevalence factor for each word used in human essays.

    The factor compares how often a word appears in AI essays versus human essays:
      * ai_prevalence / human_prevalence (>= 1) when the word is at least as
        frequent in AI essays as in human essays;
      * -(human_prevalence / ai_prevalence) (< -1) when it is more frequent in human essays;
      * human_only_factor when the word never appears in AI essays.

    Words that appear only in AI essays are not included in the table.
    NOTE(review): this matches the previous behaviour; confirm that omitting
    AI-only words is intended.

    :param df: DataFrame with 'text' and 'generated' columns, as accepted by convert_df_to_dics.
    :param human_only_factor: Factor assigned to words absent from AI essays (default -25).
    :return: DataFrame with columns 'word' and 'prevalence', sorted by 'prevalence'
             in descending order. It is empty (but still has both columns) when
             no human word is available.
    """
    human_dic, ai_dic = convert_df_to_dics(df)

    prevalence_data = []
    # Only words seen in human essays are ever scored, so walking human_dic visits
    # exactly the rows of the table and keeps their order deterministic.
    for word, human_prevalence in human_dic.items():
        ai_prevalence = ai_dic.get(word, 0)
        if ai_prevalence == 0:
            prevalence_factor = human_only_factor
        elif human_prevalence > ai_prevalence:
            prevalence_factor = -human_prevalence / ai_prevalence
        else:
            prevalence_factor = ai_prevalence / human_prevalence
        prevalence_data.append({'word': word, 'prevalence': prevalence_factor})

    # Naming the columns explicitly keeps sort_values working on an empty result
    prevalence_table = pd.DataFrame(prevalence_data, columns=['word', 'prevalence'])
    return prevalence_table.sort_values(by='prevalence', ascending=False)


# One prevalence table per complexity group, in the same order as divided_dataset
prevalence_tables = list(map(get_prevalence_table, divided_dataset))

# Word -> prevalence factor lookup for each group, built from the tables above
prevalence_dics = [
    {word: factor for word, factor in zip(table['word'], table['prevalence'])}
    for table in prevalence_tables
]



In [17]:
def get_prevalence_dic_for_essay(complexity_score):
    """
    Given an essay's complexity score, find and return the appropriate prevalence dictionary.

    Scores outside the range covered by the module-level `bounds` are clamped to
    the nearest group: below the lowest bound -> first group, above the highest
    bound (or not comparable, e.g. NaN) -> last group.

    :param complexity_score: The complexity score of the essay.
    :return: The prevalence dictionary for the complexity group the essay falls into.
    """
    # Below the lowest bin edge: use the lowest-complexity group rather than
    # falling through to the highest one.
    if bounds and complexity_score < bounds[0][0]:
        return prevalence_dics[0]

    # First matching interval wins, so a score sitting exactly on a shared edge
    # is assigned to the lower of the two groups.
    for i, (lower_bound, upper_bound) in enumerate(bounds):
        if lower_bound <= complexity_score <= upper_bound:
            return prevalence_dics[i]

    # Above the highest bin edge: fall back to the highest-complexity group
    return prevalence_dics[-1]


def predict_ai(essay, print_prevalence=False):
    """
    Predict whether an essay is AI-generated or human-written.

    The essay's complexity selects a prevalence dictionary; every cleaned word
    then contributes its prevalence factor, shifted by one towards zero
    (factor - 1 when the factor is positive, factor + 1 otherwise).

    NOTE(review): a word missing from the dictionary has factor 0, takes the
    "otherwise" path and therefore contributes +1 - confirm this is intended.

    :param essay: The essay text to classify.
    :param print_prevalence: When True, print the accumulated score before returning.
    :return: 1 if the accumulated score is greater than zero (AI-generated), else 0.
    """
    complexity = complex.calculate_complexity(essay, verbs_dic)
    prevalence_dict = get_prevalence_dic_for_essay(complexity)

    prevalence_score = 0
    for token in complex.clean_text(essay):
        factor = prevalence_dict.get(token, 0)
        # Shift by one so a ratio of exactly 1 carries no weight
        prevalence_score += (factor - 1) if factor > 0 else (factor + 1)

    if print_prevalence:
        print(prevalence_score)
    return int(prevalence_score > 0)



In [18]:
# Check whether 'customs' is a key of the loaded word dictionary
print('customs' in verbs_dic)

# Top 20 rows of the lowest-complexity group's table (sorted by descending prevalence)
lowest_complexity_table = prevalence_tables[0]
lowest_complexity_table.head(20)


True


Unnamed: 0,word,prevalence
33950,cooperation,258.113566
6830,embrace,210.033588
2901,well-rounded,210.033588
1651,cooperate,206.237801
25052,critically,194.850437
8725,lifelong,172.075711
15501,preferences,167.014661
30687,teamwork,167.014661
9881,travelling,160.688348
20405,customs,156.89256


In [19]:
# Evaluation set: a reproducible random sample of 'final_test.csv'.
# Earlier runs pointed `test_data` at `dataset`, 'human_expert_essays.csv',
# 'train_v2_drct_02.csv' and 'metadataset.csv'; each of those assignments was
# overwritten before use, so only 'final_test.csv' was ever evaluated.
# The file must provide 'text' and 'generated' columns.
EVAL_SAMPLE_SIZE = 10000
EVAL_SEED = 42  # fixed so the reported metrics can be reproduced
test_data = pd.read_csv('final_test.csv')
# Cap the sample at the file size: DataFrame.sample raises when n exceeds the row count
test_data = test_data.sample(min(EVAL_SAMPLE_SIZE, len(test_data)), random_state=EVAL_SEED)

# Classify every essay (1 = AI-generated, 0 = human-written)
test_data['Verdict'] = test_data['text'].apply(predict_ai)

# Determine AI prediction based on the verdict
test_data['Prediction'] = test_data['Verdict']

generated_column = 'generated'
predicted = test_data['Prediction']
actual = test_data[generated_column]

# Calculate overall accuracy and the confusion-matrix counts
accuracy = (predicted == actual).mean()
fp = int(((predicted == 1) & (actual == 0)).sum())
fn = int(((predicted == 0) & (actual == 1)).sum())
tp = int(((predicted == 1) & (actual == 1)).sum())
tn = int(((predicted == 0) & (actual == 0)).sum())

# Report everything as a percentage rounded to `digits` decimals
digits = 4
accuracy = round(100 * accuracy, digits)

# Rates fall back to 0 when the corresponding true class is absent from the sample
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0
fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
fpr = round(100 * fpr, digits)
fnr = round(100 * fnr, digits)
                                   

def mean(listx):
    """
    Return the arithmetic mean of an indexable sequence of numbers.

    Elements are read by position (listx[0] .. listx[len - 1]) and added in that
    order. An empty sequence raises ZeroDivisionError.
    """
    count = len(listx)
    running = 0
    position = 0
    while position < count:
        running = running + listx[position]
        position += 1
    return running / count
    

# Report the evaluation metrics, one per line (values are already percentages)
print(
    f"Accuracy: {accuracy}%",
    f"False Positive Rate (FPR): {fpr}%",
    f"False Negative Rate (FNR): {fnr}%",
    sep="\n",
)
# Disabled statistics; they reference `prev`, which is not defined in the visible cells.
# print(f"Average prevalence = {mean(prev)}%")
# print(f"Prevalence std = {np.std(prev)}%")

Accuracy: 86.29%
False Positive Rate (FPR): 13.1648%
False Negative Rate (FNR): 14.7001%


In [1]:
# Classify the essay stored in input.txt, printing its prevalence score,
# the verdict (1 = AI-generated, 0 = human-written) and the essay itself.
# predict_ai comes from an earlier cell, which has to be executed first.
with open('input.txt') as file:
    text = file.read()
verdict = predict_ai(text, print_prevalence=True)
print(verdict)
print(text)

NameError: name 'predict_ai' is not defined

In [14]:
import textstat
import readability
import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
# One-time resource downloads needed by word_tokenize / stopwords:
# nltk.download('punkt')
# nltk.download('stopwords')

with open('input.txt', 'r') as file:
    text = file.read()

# Flesch Reading Ease according to textstat
score = textstat.flesch_reading_ease(text)
print(f"Flesch Reading Ease: {score}")


# Full set of measures from the `readability` package.
# NOTE(review): getmeasures is documented as expecting tokenized text with one
# sentence per line; the raw file contents are passed here, which may be why its
# FleschReadingEase differs from the textstat value - confirm.
results = readability.getmeasures(text, lang='en')
print(results['readability grades']['FleschReadingEase'])
print(results)


# Lexical diversity: distinct non-stopword tokens / all non-stopword tokens
words = word_tokenize(text)
# Load the stopword list once and use a set, instead of re-reading it for every token
english_stopwords = set(stopwords.words('english'))
words_without_stopwords = [word for word in words if word.lower() not in english_stopwords]
# An empty text (or one made only of stopwords) has no tokens to compare; report 0.0
lexical_diversity = (
    len(set(words_without_stopwords)) / len(words_without_stopwords)
    if words_without_stopwords
    else 0.0
)
print(f"Lexical Diversity: {lexical_diversity}")





Flesch Reading Ease: 86.74
43.721818181818215
OrderedDict([('readability grades', OrderedDict([('Kincaid', 20.824545454545454), ('ARI', 23.946590909090908), ('Coleman-Liau', 5.97795265909091), ('FleschReadingEase', 43.721818181818215), ('GunningFogIndex', 25.090909090909093), ('LIX', 66.36363636363636), ('SMOGIndex', 14.291589790636214), ('RIX', 6.25), ('DaleChallIndex', 8.589454545454545)])), ('sentence info', OrderedDict([('characters_per_word', 3.7954545454545454), ('syll_per_word', 1.268181818181818), ('words_per_sentence', 55.0), ('sentences_per_paragraph', 1.0), ('type_token_ratio', 0.4), ('characters', 835), ('syllables', 279), ('words', 220), ('wordtypes', 88), ('sentences', 4), ('paragraphs', 4), ('long_words', 25), ('complex_words', 17), ('complex_words_dc', 31)])), ('word usage', OrderedDict([('tobeverb', 9), ('auxverb', 8), ('conjunction', 4), ('pronoun', 38), ('preposition', 24), ('nominalization', 0)])), ('sentence beginnings', OrderedDict([('pronoun', 2), ('interrogative