In [None]:
import pandas as pd
import numpy as np
import math
import complex
import re
import matplotlib.pyplot as plt
import seaborn as sns


# Per-part-of-speech lookup tables loaded through the project-local `complex`
# helper module (note: importing it rebinds the builtin name `complex` in this
# notebook's namespace).
# presumably each table maps a lower-cased word to a numeric score — the values
# are added to 0.01 in calculate_complexity; confirm against read_file_to_dic.
words1 = complex.read_file_to_dic('vocabulary/top_english_adjs_lower_100000.txt')  # adjectives
words2 = complex.read_file_to_dic('vocabulary/top_english_words_lower_100000.txt')  # general words
words3 = complex.read_file_to_dic('vocabulary/top_english_verbs_lower_100000.txt')  # verbs
words4 = complex.read_file_to_dic('vocabulary/top_english_nouns_lower_100000.txt')  # nouns

def calculate_complexity(text: str) -> float:
    """Return the mean lexicon score of the adjectives and verbs in ``text``.

    Each token produced by ``complex.clean_text`` is looked up first in the
    adjective table (``words1``) and, only if absent there, in the verb table
    (``words3``). A matched token contributes its table value plus 0.01;
    unmatched tokens are ignored.

    Returns:
        The arithmetic mean of the contributions, or 0.0 when no token
        matched either table.
    """
    scores = []
    for token in complex.clean_text(text):
        # Adjective table takes precedence over the verb table.
        for lexicon in (words1, words3):
            entry = lexicon.get(token)
            if entry is not None:
                scores.append(entry + 0.01)
                break
    if not scores:
        return 0.0
    return np.mean(scores)
    
def is_verb(word: str) -> bool:
    """Report whether ``word`` has a (non-None) entry in the verb table ``words3``."""
    return words3.get(word) is not None
    
def is_adj(word: str) -> bool:
    """Report whether ``word`` has a (non-None) entry in the adjective table ``words1``."""
    return words1.get(word) is not None


In [122]:
import pandas as pd
import numpy as np
from nltk import bigrams
import nltk
from nltk.tokenize import word_tokenize
import nltk
from nltk.tokenize import sent_tokenize
import re
from nltk import trigrams
from textstat import flesch_reading_ease
# Fetch the NLTK 'punkt' data used by the tokenizers imported above.
nltk.download('punkt')

# Each prevalence CSV is read into a DataFrame (`*_table`) and then flattened
# into a plain dict (`*_dict`) keyed by the n-gram column, with the
# 'Prevalence_Factor' column as the value. The lookup helpers below should use
# the dicts; the DataFrames are kept only as the raw source.

# Two-word (bigram) prevalence table
two_word_prevalence_table = pd.read_csv('two_word_prevalence_table.csv')
two_word_prevalence_dict = two_word_prevalence_table.set_index('Bigram')['Prevalence_Factor'].to_dict()

# Single-word prevalence table
prevalence_table = pd.read_csv('prevalence_table.csv')
prevalence_dict = prevalence_table.set_index('Word')['Prevalence_Factor'].to_dict()

# Three-word (trigram) prevalence table
three_word_prevalence_table = pd.read_csv('three_word_prevalence_table.csv')
three_word_prevalence_dict = three_word_prevalence_table.set_index('Trigram')['Prevalence_Factor'].to_dict()

# Four-word prevalence table
four_word_prevalence_table = pd.read_csv('four_word_prevalence_table.csv')
four_word_prevalence_dict = four_word_prevalence_table.set_index('Fourgram')['Prevalence_Factor'].to_dict()

# Five-word prevalence table
five_word_prevalence_table = pd.read_csv('five_word_prevalence_table.csv')
five_word_prevalence_dict = five_word_prevalence_table.set_index('Fivegram')['Prevalence_Factor'].to_dict()


def get_text_fivegram_prevalence(text):
    """Sum the prevalence factors of every five-word window in ``text``.

    The text is normalised with ``clean_text`` and tokenised with
    ``word_tokenize``. Each run of five consecutive tokens is joined with
    single spaces and looked up in ``five_word_prevalence_dict``; windows
    missing from the table contribute 0.
    """
    tokens = word_tokenize(clean_text(text))
    width = 5
    score = 0
    # The range is empty when the text has fewer than `width` tokens.
    for start in range(len(tokens) - width + 1):
        key = ' '.join(tokens[start:start + width])
        score += five_word_prevalence_dict.get(key, 0)
    return score

def get_prevalence_array(text):
    """Return a fixed-length (1000) list of per-word prevalence scores.

    The text is cleaned and tokenised, each token is scored with
    ``get_prevalence_word``, and the resulting list is truncated to the first
    1000 scores or right-padded with zeros up to 1000 entries.
    """
    size = 1000
    scores = []
    for token in word_tokenize(clean_text(text)):
        scores.append(get_prevalence_word(token))
    # A negative repeat count yields an empty list, so long texts get no padding.
    padding = [0] * (size - len(scores))
    return scores[:size] + padding

def get_text_fourgram_prevalence(text):
    """Sum the prevalence factors of every four-word window in ``text``.

    The text is normalised with ``clean_text`` and tokenised with
    ``word_tokenize``. Each run of four consecutive tokens is joined with
    single spaces and looked up in ``four_word_prevalence_dict``; windows
    missing from the table contribute 0.
    """
    tokens = word_tokenize(clean_text(text))
    width = 4
    score = 0
    # The range is empty when the text has fewer than `width` tokens.
    for start in range(len(tokens) - width + 1):
        key = ' '.join(tokens[start:start + width])
        score += four_word_prevalence_dict.get(key, 0)
    return score

def get_text_trigram_prevalence(text):
    """Sum the prevalence factors of every three-word window in ``text``.

    The text is normalised with ``clean_text`` and tokenised with
    ``word_tokenize``. Each run of three consecutive tokens is joined with
    single spaces and looked up in ``three_word_prevalence_dict``; windows
    missing from the table contribute 0.
    """
    tokens = word_tokenize(clean_text(text))
    width = 3
    score = 0
    # The range is empty when the text has fewer than `width` tokens.
    for start in range(len(tokens) - width + 1):
        key = ' '.join(tokens[start:start + width])
        score += three_word_prevalence_dict.get(key, 0)
    return score


# Readability (Flesch reading ease) ranges observed per group:
#   {1: (-628.88, 51.78), 2: (51.79, 62.17), 3: (62.18, 69.82), 4: (69.86, 77.67), 5: (77.71, 105.45)}
# Readability itself is computed with textstat.flesch_reading_ease(text).
# One weighted-prevalence table is loaded per readability group (1-5) from
# 'weighted_prevalence{group}.csv' and stored as a word -> prevalence dict.
weighted_prevalence_dicts = {}
for i in range(1, 6):
    table = pd.read_csv(f'weighted_prevalence{i}.csv')
    weighted_prevalence_dicts[i] = table.set_index('word')['prevalence'].to_dict()


def get_text_weighted_prevalence(text):
    """Return the mean readability-weighted prevalence of the words in ``text``.

    The Flesch reading-ease score of the raw text selects one of five
    readability groups; every word of the cleaned text is then looked up in
    that group's table in ``weighted_prevalence_dicts`` (missing words score 0).

    Returns:
        The mean of the per-word scores, or 0 when the cleaned text contains
        no words.
    """
    readability = calculate_readability(text)
    # Inclusive upper bounds of groups 1-4; anything above the last bound is group 5.
    if readability <= 51.78:
        group = 1
    elif readability <= 62.17:
        group = 2
    elif readability <= 69.82:
        group = 3
    elif readability <= 77.67:
        group = 4
    else:
        group = 5
    # clean_text returns one string, so it has to be split into words here;
    # iterating over the string itself would look up single characters.
    words = clean_text(text).split()
    group_table = weighted_prevalence_dicts[group]
    prevalence_scores = [group_table.get(word, 0) for word in words]

    if prevalence_scores:
        return np.mean(prevalence_scores)
    return 0

def calculate_readability(text):
    """Return the textstat ``flesch_reading_ease`` score for ``text``."""
    return flesch_reading_ease(text)

def get_text_bigram_prevalence(text):
    """Sum the prevalence factors of all adjacent word pairs in ``text``.

    The text is cleaned, tokenised with ``word_tokenize`` and paired up with
    ``nltk.bigrams``; each pair is scored through ``get_bigram_prevalence``.
    """
    tokens = word_tokenize(clean_text(text))
    score = 0
    for pair in bigrams(tokens):
        score += get_bigram_prevalence(' '.join(pair))
    return score


def get_text_average_prevalence(text):
    """Return the mean per-word prevalence factor of ``text``.

    The text is cleaned and tokenised; every token is scored with
    ``get_prevalence_word`` (0 for unknown words) and the scores are averaged
    over all tokens.

    Returns:
        The average score, or 0 when the cleaned text has no tokens (e.g. an
        empty or punctuation-only string), instead of raising
        ``ZeroDivisionError``.
    """
    words = word_tokenize(clean_text(text))
    if not words:
        return 0
    total_prevalence = sum(get_prevalence_word(word) for word in words)
    return total_prevalence / len(words)

def get_text_median_prevalence(text):
    """Return the median prevalence factor of the known words in ``text``.

    The text is cleaned and split on whitespace; only words with a non-None
    entry in ``prevalence_dict`` take part in the median.

    Returns:
        The median of the matched values, or 0 when no word matched. This
        mirrors ``get_text_prevalence_std`` and avoids the NaN (and
        RuntimeWarning) that ``np.median`` produces for an empty list.
    """
    words = clean_text(text).split()
    prevalences = [
        prevalence_dict[word]
        for word in words
        if prevalence_dict.get(word) is not None
    ]
    if not prevalences:
        return 0
    return np.median(prevalences)

def get_text_percentage_of_words_in_prevalence_table(text):
    """Return the fraction of tokens in ``text`` that appear in ``prevalence_dict``.

    Despite the name, the result is a ratio in [0, 1], not a value scaled
    to 100.

    Returns:
        matched tokens / total tokens, or 0 when the cleaned text has no
        tokens (instead of raising ``ZeroDivisionError``).
    """
    words = word_tokenize(clean_text(text))
    if not words:
        return 0
    words_in_table = sum(word in prevalence_dict for word in words)
    return words_in_table / len(words)

def get_text_bigram_prevalence_std(text):
    """Return the standard deviation of the prevalence factors of known bigrams.

    The text is cleaned and split on whitespace; each adjacent word pair is
    joined with a space and looked up in ``two_word_prevalence_dict``. Only
    bigrams present in the table take part in the statistic.

    Returns:
        ``np.std`` of the matched values, or 0 when no bigram matched.
    """
    words = clean_text(text).split()
    prevalences = []
    for bigram in bigrams(words):
        # Look up in the dict, not the DataFrame: DataFrame.get() searches
        # column labels, so bigram strings would never be found there.
        value = two_word_prevalence_dict.get(' '.join(bigram))
        if value is not None:
            prevalences.append(value)
    if not prevalences:
        return 0
    return np.std(prevalences)


def get_text_bigram_prevalence_mean(text):
    """Return the mean prevalence factor over all bigrams in ``text``.

    Every adjacent token pair is scored with ``get_bigram_prevalence``
    (unknown bigrams score 0), so unmatched bigrams pull the mean down.

    Returns:
        The mean score, or 0 when the text has fewer than two tokens. This
        matches the other bigram statistics and avoids the NaN that
        ``np.mean`` produces for an empty list.
    """
    words = word_tokenize(clean_text(text))
    prevalences = [get_bigram_prevalence(' '.join(bigram)) for bigram in bigrams(words)]
    if not prevalences:
        return 0
    return np.mean(prevalences)

def get_text_bigram_prevalence_median(text):
    """Return the median of the prevalence factors of known bigrams.

    The text is cleaned and split on whitespace; each adjacent word pair is
    joined with a space and looked up in ``two_word_prevalence_dict``. Only
    bigrams present in the table take part in the statistic.

    Returns:
        ``np.median`` of the matched values, or 0 when no bigram matched.
    """
    words = clean_text(text).split()
    prevalences = []
    for bigram in bigrams(words):
        # Look up in the dict, not the DataFrame: DataFrame.get() searches
        # column labels, so bigram strings would never be found there.
        value = two_word_prevalence_dict.get(' '.join(bigram))
        if value is not None:
            prevalences.append(value)
    if not prevalences:
        return 0
    return np.median(prevalences)

def clean_text(text):
    """Lower-case ``text`` and strip everything except a-z, 0-9 and whitespace.

    Punctuation is removed, and so is any non-ASCII letter (accented
    characters are dropped, not transliterated). Returns a single string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered)

def get_bigram_prevalence(bigram):
    """Look up the prevalence factor of a bigram string (0 if unknown).

    The input is split on whitespace and re-joined with single spaces before
    the lookup, so stray or repeated whitespace does not affect the key.
    """
    normalized_key = ' '.join(bigram.split())
    return two_word_prevalence_dict.get(normalized_key, 0)

def get_text_median_complexity(text):
    """Return the median of the per-token complexity scores of ``text``.

    Each token from ``complex.clean_text`` is scored on its own with
    ``calculate_complexity``; tokens that match no table score 0.0 and still
    take part in the median.
    """
    per_token = []
    for token in complex.clean_text(str(text)):
        per_token.append(calculate_complexity(token))
    return np.median(per_token)

def get_prevalence_word(word):
    """Return the prevalence factor of ``word`` (case-insensitive), or 0 if unknown.

    The argument is coerced to ``str`` and lower-cased before the lookup in
    ``prevalence_dict``.
    """
    key = str(word).lower()
    return prevalence_dict.get(key, 0)

def get_prevalence_text(text):
    """Return the summed prevalence factor of every whitespace-separated word.

    Each raw word is passed through ``complex.clean_word`` and then scored
    with ``get_prevalence_word``; unknown words add 0.
    """
    raw_words = str(text).split()
    return sum(get_prevalence_word(complex.clean_word(raw)) for raw in raw_words)

def get_text_prevalence_std(text):
    """Return the standard deviation of the prevalence factors of known words.

    The text is split on whitespace and each raw token is looked up in
    ``prevalence_dict``; tokens without a (non-None) entry are skipped.
    Returns 0 when nothing matched.
    """
    # NOTE(review): tokens are looked up exactly as written (no lower-casing or
    # punctuation stripping), unlike get_text_median_prevalence — confirm this
    # difference is intended.
    matched = [
        prevalence_dict[token]
        for token in str(text).split()
        if prevalence_dict.get(token) is not None
    ]
    return np.std(matched) if matched else 0

def get_text_number_percentage(text):
    """Return the fraction of tokens in ``text`` that are numeric.

    A token counts as numeric when it consists only of digits after commas
    and periods are removed. Despite the name, the result is a ratio in
    [0, 1]; 0 is returned when there are no tokens.
    """
    tokens = complex.clean_text(text)
    token_count = len(tokens)
    numeric_count = sum(
        token.replace(',', '').replace('.', '').isdigit() for token in tokens
    )
    if token_count > 0:
        return numeric_count / token_count
    return 0

def get_text_variance(text):
    """Return the standard deviation of the per-token complexity scores.

    Each token from ``complex.clean_text`` is scored with
    ``calculate_complexity`` and multiplied by the number of whitespace-
    separated parts in that token. Note that the statistic returned is
    ``np.std`` (standard deviation), not the variance the name suggests.
    """
    weighted_scores = [
        calculate_complexity(token) * len(token.split())
        for token in complex.clean_text(str(text))
    ]
    return np.std(weighted_scores)

def get_text_sentence_std(text):
    """Return the standard deviation of sentence lengths, measured in words.

    Sentences come from ``split_into_sentences``; each sentence's length is
    its whitespace-separated word count.
    """
    word_counts = []
    for sentence in split_into_sentences(str(text)):
        word_counts.append(len(sentence.split()))
    return np.std(word_counts)

def get_paragraph_std(text):
    """Return the standard deviation of paragraph lengths, measured in words.

    Paragraphs come from ``split_into_paragraphs``; each paragraph's length is
    its whitespace-separated word count.
    """
    word_counts = []
    for paragraph in split_into_paragraphs(str(text)):
        word_counts.append(len(paragraph.split()))
    return np.std(word_counts)

def get_text_length(text):
    """Return the number of whitespace-separated words in ``text``.

    Non-string input is coerced with ``str`` first.
    """
    return len(str(text).split())

def split_into_sentences(text):
    """Split ``text`` (coerced to ``str``) into sentences with NLTK's ``sent_tokenize``."""
    return sent_tokenize(str(text))

def get_average_paragraph_length(text):
    """Return the mean paragraph length of ``text``, measured in words.

    Paragraphs come from ``split_into_paragraphs``; each paragraph's length is
    its whitespace-separated word count.
    """
    word_counts = []
    for paragraph in split_into_paragraphs(str(text)):
        word_counts.append(len(paragraph.split()))
    return np.mean(word_counts)

def split_into_paragraphs(text):
    """
    Break the input text into a list of non-empty, stripped paragraphs.

    Any run of one or more newlines separates paragraphs. The control
    character '\x02' is also treated as a paragraph separator. Input is
    coerced to ``str``; whitespace-only pieces are discarded.
    """
    # Fold the '\x02' separator into newlines so a single split handles both.
    unified = str(text).replace('\x02', '\n')
    paragraphs = []
    for piece in unified.split('\n'):
        trimmed = piece.strip()
        if trimmed:
            paragraphs.append(trimmed)
    return paragraphs

# NOTE(review): this repeats, with an identical body, the get_text_length
# definition that appears earlier in this cell; this later definition is the
# one bound after the cell runs.
def get_text_length(text):
    """Return the number of whitespace-separated words in ``text`` (coerced to ``str``)."""
    text = str(text)
    return len(text.split())

def percentage_of_capitalized_words(text):
    """Return the percentage (0-100) of title-cased words that do not start a sentence.

    The text is split into sentences after '.', '!' or '?' followed by one or
    more spaces. The first word of every sentence is ignored; the remaining
    words are counted and tested with ``str.istitle``. Returns 0 when there
    are no non-initial words at all.
    """
    title_cased = 0
    considered = 0
    for sentence in re.split(r'(?<=[.!?]) +', text):
        # Drop the sentence-initial word; slicing an empty list stays empty.
        tail = sentence.split()[1:]
        considered += len(tail)
        title_cased += len([token for token in tail if token.istitle()])

    # Guard against division by zero for empty or single-word input.
    if considered == 0:
        return 0
    return (title_cased / considered) * 100


[nltk_data] Downloading package punkt to /Users/agus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import textstat
import pandas as pd


# Dataset selector: only the LAST assignment to `path` takes effect; the
# earlier line is kept as a manual toggle between datasets.
path = "megaset4.csv"
path = "test_essays.csv"
dataset = pd.read_csv(path)

# dataset['text'] = dataset['text'].astype(str)


# dataset['fivegram_prevalence'] = dataset['text'].apply(get_text_fivegram_prevalence)
# print('fivegram_prevalence done')
# dataset['percentage_capitalized'] = dataset['text'].apply(percentage_of_capitalized_words)
# print('percentage_capitalized done')

# dataset['fourgram_prevalence'] = dataset['text'].apply(get_text_fourgram_prevalence)
# print('fourgram_prevalence done')

# dataset['average_prevalence'] = dataset['text'].apply(get_text_average_prevalence)
# print('average_prevalence done')

# dataset['median_prevalence'] = dataset['text'].apply(get_text_median_prevalence)
# print('median_prevalence done')

# dataset['prevalence_std'] = dataset['text'].apply(get_text_prevalence_std)
# print('prevalence_std done')

# dataset['bigram_prevalence_std'] = dataset['text'].apply(get_text_bigram_prevalence_std)
# print('bigram_prevalence_std done')
# dataset['bigram_prevalence_mean'] = dataset['text'].apply(get_text_bigram_prevalence_mean)
# print('bigram_prevalence_mean done')
# dataset['bigram_prevalence_median'] = dataset['text'].apply(get_text_bigram_prevalence_median)
# print('bigram_prevalence_median done')
# dataset['weighted_prevalence'] = dataset['text'].apply(get_text_weighted_prevalence)
# print('weighted_prevalence done')

# dataset['median_complexity'] = dataset['text'].apply(get_text_median_complexity)
# print('median_complexity done')

# dataset['paragraph_std'] = dataset['text'].apply(get_paragraph_std)
# print('paragraph_std done')

# dataset['average_paragraph_length'] = dataset['text'].apply(get_average_paragraph_length)
# print('average_paragraph_length done')
# dataset['complexity'] = dataset['text'].apply(calculate_complexity)
# print('complexity done')
# dataset['number_percentage'] = dataset['text'].apply(get_text_number_percentage)
# print('number_percentage done')
# dataset['readability'] = dataset['text'].apply(textstat.flesch_reading_ease)
# print('readability done')

# Compute the currently enabled feature columns from the 'text' column, one
# row at a time, printing a marker after each column finishes.
dataset['prevalence'] = dataset['text'].apply(get_prevalence_text)
print('prevalence done')
dataset['bigram_prevalence'] = dataset['text'].apply(get_text_bigram_prevalence)
print('bigram_prevalence done')
# dataset['trigram_prevalence'] = dataset['text'].apply(get_text_trigram_prevalence)
# print('trigram_prevalence done')
dataset['length'] = dataset['text'].apply(get_text_length)
print('length done')
dataset['variance'] = dataset['text'].apply(get_text_variance)
print('variance done')
# dataset['sentence_std'] = dataset['text'].apply(get_text_sentence_std)
# print('sentence_std done')
# dataset['percentage_of_words_in_prevalence_table'] = dataset['text'].apply(get_text_percentage_of_words_in_prevalence_table)
# print('percentage_of_words_in_prevalence_table done')

# Write back to the same file that was read above, overwriting it in place
# with the added/updated feature columns.
dataset.to_csv(path, index=False)






In [None]:
# Dataset selector: only the LAST assignment to `path` takes effect; the
# earlier lines are kept as manual toggles between datasets.
path = "human_expert_essays.csv"
path = "human.csv"
path = "Essays/ai_generated_essays_llm_detect_kaggle.csv"
path = "undetectable.csv"
path = "final_test2.csv"
dataset = pd.read_csv(path)

# dataset.info()

# Optional: force the 'text' column from object dtype to str before
# computing features.
# dataset['text'] = dataset['text'].astype(str)


# Only the five-gram feature is enabled in this cell.
dataset['fivegram_prevalence'] = dataset['text'].apply(get_text_fivegram_prevalence)
print('fivegram_prevalence done')
# dataset['percentage_capitalized'] = dataset['text'].apply(percentage_of_capitalized_words)
# print('percentage_capitalized done')
# dataset['fourgram_prevalence'] = dataset['text'].apply(get_text_fourgram_prevalence)
# print('fourgram_prevalence done')
# dataset['average_prevalence'] = dataset['text'].apply(get_text_average_prevalence)
# print('average_prevalence done')
# dataset['median_prevalence'] = dataset['text'].apply(get_text_median_prevalence)
# print('median_prevalence done')
# dataset['prevalence_std'] = dataset['text'].apply(get_text_prevalence_std)
# print('prevalence_std done')
# dataset['bigram_prevalence_std'] = dataset['text'].apply(get_text_bigram_prevalence_std)
# print('bigram_prevalence_std done')
# dataset['bigram_prevalence_mean'] = dataset['text'].apply(get_text_bigram_prevalence_mean)
# print('bigram_prevalence_mean done')
# dataset['bigram_prevalence_median'] = dataset['text'].apply(get_text_bigram_prevalence_median)
# print('bigram_prevalence_median done')
# dataset['weighted_prevalence'] = dataset['text'].apply(get_text_weighted_prevalence)
# print('weighted_prevalence done')
# dataset['median_complexity'] = dataset['text'].apply(get_text_median_complexity)
# print('median_complexity done')
# dataset['paragraph_std'] = dataset['text'].apply(get_paragraph_std)
# print('paragraph_std done')
# dataset['average_paragraph_length'] = dataset['text'].apply(get_average_paragraph_length)
# print('average_paragraph_length done')
# dataset['prevalence'] = dataset['text'].apply(get_prevalence_text)
# dataset['complexity'] = dataset['text'].apply(calculate_complexity)
# print('complexity done')
# dataset['number_percentage'] = dataset['text'].apply(get_text_number_percentage)
# print('number_percentage done')
# dataset['readability'] = dataset['text'].apply(textstat.flesch_reading_ease)
# print('readability done')
# dataset['bigram_prevalence'] = dataset['text'].apply(get_text_bigram_prevalence)
# print('bigram_prevalence done')
# dataset['trigram_prevalence'] = dataset['text'].apply(get_text_trigram_prevalence)
# print('trigram_prevalence done')
# dataset['length'] = dataset['text'].apply(get_text_length)
# print('length done')
# dataset['variance'] = dataset['text'].apply(get_text_variance)
# print('variance done')
# dataset['sentence_std'] = dataset['text'].apply(get_text_sentence_std)
# print('sentence_std done')
# dataset['percentage_of_words_in_prevalence_table'] = dataset['text'].apply(get_text_percentage_of_words_in_prevalence_table)
# print('percentage_of_words_in_prevalence_table done')

# Write back to the same file that was read at the top of this cell,
# overwriting it in place with the added feature column(s).
dataset.to_csv(path, index=False)

# Trained and tested on the same dataset (random hold-out split)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Assumes `dataset` is already loaded and contains every column named in
# `x_features` plus the 'generated' label column.
# NOTE(review): `x_features` is not assigned in this cell; the only assignments
# visible in this notebook are in a later cell, so on a fresh top-to-bottom run
# this raises NameError unless it is defined elsewhere — confirm.

# Prepare the features and target

X = dataset[x_features]  # Features
y = dataset['generated']  # Target variable

# Split the dataset into training and testing sets (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Initialize KNN classifier
# Number of neighbors can be tuned; currently 51
knn = KNeighborsClassifier(n_neighbors=51)

# Train the classifier
knn.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn.predict(X_test)

# Evaluate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall accuracy: {accuracy * 100:.2f}%")

# Calculate the confusion matrix to get false positive and false negative rates
# (the 4-way unpack assumes a binary 0/1 label, i.e. a 2x2 matrix — TODO confirm)
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Unlike the later cells, these ratios are not guarded against a zero denominator.
false_positive_rate = fp / (fp + tn)
false_negative_rate = fn / (fn + tp)

print(f"False 0 rate (False Positive Rate): {false_positive_rate * 100:.2f}%")
print(f"False 1 rate (False Negative Rate): {false_negative_rate * 100:.2f}%")


# Non-standardized KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Load the training dataset
path_train = "megaset4.csv"
train_data = pd.read_csv(path_train)

# Load the testing dataset
path_test = "final_test2.csv"
test_data = pd.read_csv(path_test)


# Prepare the features and target for the training data
# NOTE(review): `x_features` is not assigned in this cell; it must already
# exist in the kernel namespace — confirm where it is defined.
X_train = train_data[x_features]  # Features
y_train = train_data['generated']  # Target variable

# Prepare the features and target for the testing data
X_test = test_data[x_features]  # Features
y_test = test_data['generated']  # Target variable


# Initialize KNN classifier (k = 51 neighbors); features are used unscaled here
knn = KNeighborsClassifier(n_neighbors=51)

# Train the classifier on the training data
knn.fit(X_train, y_train)
 
# X_test = pd.read_csv('undetectable.csv')

# Make predictions on the testing data
y_pred = knn.predict(X_test)

# Evaluate the accuracy on the testing data
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall accuracy on the test set: {accuracy * 100:.2f}%")

# Calculate the confusion matrix to get false positive and false negative rates
# (the 4-way unpack assumes a binary 0/1 label, i.e. a 2x2 matrix — TODO confirm)
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate rates, falling back to 0 when a class has no samples
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print the rates
print(f"False Positive Rate: {false_positive_rate * 100:.2f}%")
print(f"False Negative Rate: {false_negative_rate * 100:.2f}%")



## Standardized KNN

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
path_train = "megaset4.csv"
train_data = pd.read_csv(path_train)

# Load the testing dataset.
# Only the LAST assignment to `path_test` takes effect; the earlier lines are
# kept as manual toggles between candidate test sets.
path_test = "human.csv"
path_test = "human_expert_essays.csv"
path_test = "Essays/ai_generated_essays_llm_detect_kaggle.csv"
path_test = "final_test2.csv"

test_data = pd.read_csv(path_test)
# Evaluate on at most 10,000 rows. Capping at the file's row count avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist,
# and the fixed seed makes the reported metrics reproducible between runs.
test_data = test_data.sample(n=min(10000, len(test_data)), random_state=42)

# drop na rows (applied after sampling, so fewer rows than sampled may remain)
test_data = test_data.dropna()


# Initialize the StandardScaler
scaler = StandardScaler()

# Prepare the features and target for the training data
# NOTE(review): `x_features` is not assigned in this cell; it must already
# exist in the kernel namespace — confirm where it is defined.
X_train = train_data[x_features]  # Features
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform the training data
y_train = train_data['generated']  # Target variable

# Prepare the features and target for the testing data
X_test = test_data[x_features]  # Features
X_test_scaled = scaler.transform(X_test)  # Transform the testing data using the same scaler
y_test = test_data['generated']  # Target variable

# Initialize KNN classifier (k = 51 neighbors)
knn = KNeighborsClassifier(n_neighbors=51)

# Train the classifier on the scaled training data
knn.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred = knn.predict(X_test_scaled)

# Evaluate the accuracy on the testing data
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall accuracy on the test set: {accuracy * 100:.4f}%")

# Calculate the confusion matrix to get false positive and false negative rates
# (the 4-way unpack assumes a binary 0/1 label, i.e. a 2x2 matrix — TODO confirm)
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate rates, falling back to 0 when a class has no samples
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print the rates; the "average error rate" is the unweighted mean of FPR and FNR
print(f"False Positive Rate: {false_positive_rate * 100:.4f}%")
print(f"False Negative Rate: {false_negative_rate * 100:.4f}%")
print(f"Average error rate: {(false_positive_rate*100+ false_negative_rate*100) / 2 :.4f}%")



import numpy as np

# Get per-class probabilities from the fitted KNN model instead of hard labels
y_proba = knn.predict_proba(X_test_scaled)

# Choose a custom decision threshold on the probability in column 1
# (presumably the 'generated' == 1 class — confirm against knn.classes_)
threshold = 0.5  # Lowering the threshold may increase the FPR and decrease the FNR

# Apply custom threshold to get new predictions
y_pred_custom = (y_proba[:, 1] >= threshold).astype(int)

# Recalculate the metrics with the new predictions
accuracy_custom = accuracy_score(y_test, y_pred_custom)
conf_matrix_custom = confusion_matrix(y_test, y_pred_custom)
tn_custom, fp_custom, fn_custom, tp_custom = conf_matrix_custom.ravel()

# Recalculate rates, falling back to 0 when a class has no samples
false_positive_rate_custom = fp_custom / (fp_custom + tn_custom) if (fp_custom + tn_custom) > 0 else 0
false_negative_rate_custom = fn_custom / (fn_custom + tp_custom) if (fn_custom + tp_custom) > 0 else 0

# Print the new rates
print(f"Custom Threshold: {threshold}")
print(f"Overall accuracy on the test set with custom threshold: {accuracy_custom * 100:.4f}%")
print(f"False Positive Rate with custom threshold: {false_positive_rate_custom * 100:.4f}%")
print(f"False Negative Rate with custom threshold: {false_negative_rate_custom * 100:.4f}%")
print(f"Average error rate with custom threshold: {(false_positive_rate_custom*100 + false_negative_rate_custom*100) / 2 :.4f}%")



# Prevalence-factor classifier

In [114]:
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

# Test dataset selector: the second read replaces the first, so the metrics
# below are computed on 'test_essays.csv'.
test_data = pd.read_csv('final_test2.csv')
test_data = pd.read_csv('test_essays.csv')
# test_data = pd.read_csv('undetectable.csv')

# Rule-based prediction: 1 when the sum of the enabled prevalence columns is
# greater than 0, else 0. The commented-out variant also adds the trigram,
# four-gram and five-gram columns.
# test_data['prediction'] = (test_data['prevalence'] + test_data['bigram_prevalence'] + test_data['trigram_prevalence'] + 
#                            test_data['fourgram_prevalence'] + test_data['fivegram_prevalence']> 0).astype(int)
test_data['prediction'] = (test_data['prevalence'] + test_data['bigram_prevalence']> 0).astype(int)


# Calculate and print the overall accuracy
accuracy = accuracy_score(test_data['generated'], test_data['prediction'])
print(f"Overall accuracy: {accuracy * 100:.4f}%")

# Calculate the confusion matrix to get false positive and false negative rates
# (the 4-way unpack assumes a binary 0/1 label, i.e. a 2x2 matrix — TODO confirm)
conf_matrix = confusion_matrix(test_data['generated'], test_data['prediction'])
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate rates, falling back to 0 when a class has no samples
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print the rates; the "average error rate" is the unweighted mean of FPR and FNR
print(f"False Positive Rate: {false_positive_rate * 100:.4f}%")
print(f"False Negative Rate: {false_negative_rate * 100:.4f}%")
print(f"Average error rate: {(false_positive_rate*100+ false_negative_rate*100) / 2 :.4f}%")


Overall accuracy: 57.3556%
False Positive Rate: 6.8120%
False Negative Rate: 52.6677%
Average error rate: 29.7398%


# Standardized RBF Support Vector Machines

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset
path_train = "megaset4.csv"
train_data = pd.read_csv(path_train)

# Load the testing dataset and evaluate on at most 10,000 rows. Capping at the
# file's row count avoids the ValueError that DataFrame.sample raises when
# asked for more rows than exist, and the fixed seed makes the reported
# metrics reproducible between runs.
path_test = "final_test2.csv"
test_data = pd.read_csv(path_test)
test_data = test_data.sample(n=min(10000, len(test_data)), random_state=42)


# Initialize the StandardScaler
scaler = StandardScaler()

# Prepare the features and target for the training data
# NOTE(review): `x_features` is not assigned in this cell; it must already
# exist in the kernel namespace — confirm where it is defined.
X_train = train_data[x_features]  # Features
X_train_scaled = scaler.fit_transform(X_train)  # Fit and transform the training data
y_train = train_data['generated']  # Target variable

# Prepare the features and target for the testing data
X_test = test_data[x_features]  # Features
X_test_scaled = scaler.transform(X_test)  # Transform the testing data using the same scaler
y_test = test_data['generated']  # Target variable

# Initialize Gaussian (RBF-kernel) SVM classifier

# Variant without probability estimates:
# svm_clf = SVC(kernel='rbf', C=1, gamma='scale')  # 'scale' is the default value for gamma

# probability=True additionally enables predict_proba on the fitted model
svm_clf = SVC(kernel='rbf', C=1, gamma='scale', probability=True)

# Train the classifier on the scaled training data
svm_clf.fit(X_train_scaled, y_train)

# Make predictions on the scaled testing data
y_pred = svm_clf.predict(X_test_scaled)

# Evaluate the accuracy on the testing data
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall accuracy on the test set: {accuracy * 100:.2f}%")

# Calculate the confusion matrix to get false positive and false negative rates
# (the 4-way unpack assumes a binary 0/1 label, i.e. a 2x2 matrix — TODO confirm)
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

# Calculate rates, falling back to 0 when a class has no samples
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print the rates; the "average error rate" is the unweighted mean of FPR and FNR
print(f"False Positive Rate: {false_positive_rate * 100:.2f}%")
print(f"False Negative Rate: {false_negative_rate * 100:.2f}%")
print(f"Average error rate: {(false_positive_rate*100+ false_negative_rate*100) / 2 :.2f}%")



# Neural Network

Best:

x_features = ['complexity', 'prevalence', 'bigram_prevalence', 'trigram_prevalence', 
              'sentence_std', 'readability', 'average_prevalence', 'median_prevalence', 
              'prevalence_std', 'variance', 'number_percentage', 'bigram_prevalence_std', 
              'bigram_prevalence_mean', 'bigram_prevalence_median', 'paragraph_std', 'length']

mlp_clf = MLPClassifier(hidden_layer_sizes=(12,12), activation='relu', solver='adam', max_iter=400, random_state=12) 

1.26



              

In [139]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Load the training dataset.
# "train_essays.csv" was used in earlier runs; "megaset4.csv" is the file in use now.
path_train = "megaset4.csv"
train_data = pd.read_csv(path_train)

# Keep only rows whose get_text_length(text) lies in [0, 5000], both bounds inclusive.
# The length is computed once per row and range-checked in one vectorised step.
train_lengths = train_data['text'].apply(get_text_length)
train_data = train_data[train_lengths.between(0, 5000)]

# Optional experiments, currently disabled:
# train_data_dropped = train_data.drop(train_data[train_data['generated'] == 1].sample(n=1).index)
# train_data = pd.concat([train_data[train_data['generated'] == 0].sample(n=train_data['generated'].value_counts().min(), random_state=42), train_data[train_data['generated'] == 1].sample(n=train_data['generated'].value_counts().min(), random_state=42)])

# Load the testing dataset.
# Files tried in earlier runs: "Essays/ai_generated_essays_llm_detect_kaggle.csv",
# "undetectable.csv", "test_essays.csv"; "final_test2.csv" is the file in use now.
path_test = "final_test2.csv"
test_data = pd.read_csv(path_test)

# Same inclusive length filter as for training, with a wider upper bound of 10000.
test_lengths = test_data['text'].apply(get_text_length)
test_data = test_data[test_lengths.between(0, 10000)]

# Scaler shared by the rest of the notebook: fitted on the training features,
# then reused unchanged for the test set and for single-text predictions.
scaler = StandardScaler()
 
# Feature sets tried so far. Each entry lists column names of the train/test CSVs.
# Keeping them in one registry replaces the former chain of assignments in which
# every list except the last was silently overwritten.
FEATURE_SETS = {
    'baseline': [
        'complexity', 'prevalence', 'bigram_prevalence', 'trigram_prevalence',
        'sentence_std', 'readability', 'average_prevalence', 'median_prevalence',
        'prevalence_std', 'variance', 'number_percentage', 'bigram_prevalence_std',
        'bigram_prevalence_mean', 'bigram_prevalence_median',
    ],
    'no_readability': [
        'complexity', 'prevalence', 'bigram_prevalence', 'trigram_prevalence',
        'sentence_std', 'average_prevalence', 'median_prevalence',
        'prevalence_std', 'variance', 'number_percentage', 'paragraph_std', 'length',
    ],
    'with_median_complexity': [
        'complexity', 'prevalence', 'bigram_prevalence', 'trigram_prevalence',
        'sentence_std', 'readability', 'average_prevalence', 'median_prevalence',
        'prevalence_std', 'variance', 'number_percentage', 'paragraph_std', 'median_complexity',
    ],
    # 'baseline' plus paragraph_std and length
    'extended': [
        'complexity', 'prevalence', 'bigram_prevalence', 'trigram_prevalence',
        'sentence_std', 'readability', 'average_prevalence', 'median_prevalence',
        'prevalence_std', 'variance', 'number_percentage', 'bigram_prevalence_std',
        'bigram_prevalence_mean', 'bigram_prevalence_median', 'paragraph_std', 'length',
    ],
    'everything': [
        'complexity', 'readability', 'variance', 'sentence_std', 'prevalence',
        'average_prevalence', 'median_prevalence', 'bigram_prevalence',
        'paragraph_std', 'length', 'number_percentage', 'prevalence_std',
        'bigram_prevalence_std', 'bigram_prevalence_mean', 'bigram_prevalence_median',
        'percentage_of_words_in_prevalence_table', 'median_complexity', 'trigram_prevalence',
    ],
    'compact': [
        'prevalence', 'bigram_prevalence',
        'prevalence_std', 'complexity', 'length', 'trigram_prevalence',
    ],
    'ngram': [
        'prevalence', 'bigram_prevalence', 'trigram_prevalence', 'fourgram_prevalence',
        'variance', 'length', 'median_prevalence', 'complexity',
    ],
}

# The one set actually used downstream; change only this name to switch experiments.
ACTIVE_FEATURE_SET = 'ngram'

# Copy the list so later in-place edits of x_features cannot alter the registry.
x_features = list(FEATURE_SETS[ACTIVE_FEATURE_SET])




# Prepare the features and target for the training data.
# The scaler is fitted here, on the training features only.
X_train = train_data[x_features]
X_train_scaled = scaler.fit_transform(X_train)
y_train = train_data['generated']

# Prepare the features and target for the testing data (transform only, no refit)
X_test = test_data[x_features]
X_test_scaled = scaler.transform(X_test)
y_test = test_data['generated']

# Two hidden layers of 25 units each; random_state fixes the weight initialisation
# and verbose=True prints the loss of every iteration.
mlp_clf = MLPClassifier(hidden_layer_sizes=(25,25), activation='relu',
                        solver='adam', max_iter=4000, random_state=12, verbose=True, tol=0.0001)

# Train the classifier on the scaled training data
mlp_clf.fit(X_train_scaled, y_train)

# Decision threshold on the probability of the positive class.
# 0.5 matches what mlp_clf.predict would do; change it to trade false positives
# against false negatives.
threshold = 0.5

# Column 1 of predict_proba is the probability of the second class in mlp_clf.classes_
# (the positive class, assuming the labels are 0 and 1).
y_pred_proba_test = mlp_clf.predict_proba(X_test_scaled)[:, 1]
y_pred_test = (y_pred_proba_test >= threshold).astype(int)

# Same for the training data, used below as an overfitting check
y_pred_proba_train = mlp_clf.predict_proba(X_train_scaled)[:, 1]
y_pred_train = (y_pred_proba_train >= threshold).astype(int)

# Evaluate the accuracy on the testing and training data
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

# Output the accuracies
print(f"Accuracy on the training set: {accuracy_train * 100:.2f}%")
print(f"Accuracy on the test set: {accuracy_test * 100:.2f}%")

# If the accuracy on the training set is significantly higher than on the test set,
# it may indicate overfitting.

# Confusion matrix for the test data, flattened to (tn, fp, fn, tp).
# labels=[0, 1] keeps the matrix 2x2 even when one class is missing from both
# y_test and y_pred_test; otherwise the four-way unpacking raises ValueError.
conf_matrix_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
tn, fp, fn, tp = conf_matrix_test.ravel()

# Error rates; a rate is reported as 0 when its class is absent from the test data
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

# Print the rates
print(f"False Positive Rate on test data: {false_positive_rate * 100:.4f}%")
print(f"False Negative Rate on test data: {false_negative_rate * 100:.4f}%")
print(f"Average error rate on test data: {(false_positive_rate*100 + false_negative_rate*100) / 2 :.4f}%")

# Score = sum of the inverse error rates (higher is better).
# A rate of exactly 0 contributes infinity instead of raising ZeroDivisionError.
inverse_fnr = 1 / false_negative_rate if false_negative_rate > 0 else float('inf')
inverse_fpr = 1 / false_positive_rate if false_positive_rate > 0 else float('inf')
score = inverse_fnr + inverse_fpr
print(f"Score: {score:.2f}")

# Print the confusion matrix for the test data
print("Confusion Matrix for the test data:")
print(conf_matrix_test)


Iteration 1, loss = 0.13524513
Iteration 2, loss = 0.04444581
Iteration 3, loss = 0.03902259
Iteration 4, loss = 0.03741978
Iteration 5, loss = 0.03657398
Iteration 6, loss = 0.03595962
Iteration 7, loss = 0.03557124
Iteration 8, loss = 0.03511527
Iteration 9, loss = 0.03502677
Iteration 10, loss = 0.03446367
Iteration 11, loss = 0.03442558
Iteration 12, loss = 0.03423272
Iteration 13, loss = 0.03392674
Iteration 14, loss = 0.03386512
Iteration 15, loss = 0.03358167
Iteration 16, loss = 0.03353267
Iteration 17, loss = 0.03330155
Iteration 18, loss = 0.03312898
Iteration 19, loss = 0.03293316
Iteration 20, loss = 0.03296112
Iteration 21, loss = 0.03278027
Iteration 22, loss = 0.03270985
Iteration 23, loss = 0.03261398
Iteration 24, loss = 0.03246170
Iteration 25, loss = 0.03220793
Iteration 26, loss = 0.03217191
Iteration 27, loss = 0.03200358
Iteration 28, loss = 0.03188274
Iteration 29, loss = 0.03167471
Iteration 30, loss = 0.03149820
Iteration 31, loss = 0.03155829
Iteration 32, los

In [135]:


# Maps every supported feature name to the extractor that computes it from raw text.
# The lambdas defer the name lookup to call time, so an extractor only has to be
# defined when its feature is actually listed in x_features.
FEATURE_EXTRACTORS = {
    'complexity': lambda t: calculate_complexity(t),
    'prevalence': lambda t: get_prevalence_text(t),
    'bigram_prevalence': lambda t: get_text_bigram_prevalence(t),
    'trigram_prevalence': lambda t: get_text_trigram_prevalence(t),
    'sentence_std': lambda t: get_text_sentence_std(t),
    # flesch_reading_ease is imported by name at the top of the notebook
    # (`from textstat import flesch_reading_ease`); the bare module name `textstat`
    # is not bound by that import.
    'readability': lambda t: flesch_reading_ease(t),
    'average_prevalence': lambda t: get_text_average_prevalence(t),
    'median_prevalence': lambda t: get_text_median_prevalence(t),
    'prevalence_std': lambda t: get_text_prevalence_std(t),
    'variance': lambda t: get_text_variance(t),
    'number_percentage': lambda t: get_text_number_percentage(t),
    'bigram_prevalence_std': lambda t: get_text_bigram_prevalence_std(t),
    'bigram_prevalence_mean': lambda t: get_text_bigram_prevalence_mean(t),
    'bigram_prevalence_median': lambda t: get_text_bigram_prevalence_median(t),
    'paragraph_std': lambda t: get_paragraph_std(t),
    'length': lambda t: get_text_length(t),
    'median_complexity': lambda t: get_text_median_complexity(t),
    'percentage_of_words_in_prevalence_table': lambda t: get_text_percentage_of_words_in_prevalence_table(t),
    'average_paragraph_length': lambda t: get_average_paragraph_length(t),
    'fourgram_prevalence': lambda t: get_text_fourgram_prevalence(t),
    'percentage_capitalized': lambda t: percentage_of_capitalized_words(t),
    'fivegram_prevalence': lambda t: get_text_fivegram_prevalence(t),
}


def extract_features(text, feature_names):
    """Compute the requested features for one text.

    Args:
        text: the raw text to analyse.
        feature_names: feature names, in the order the model expects them.

    Returns:
        A list with one value per entry of ``feature_names``, in the same order.

    Raises:
        KeyError: if a name has no registered extractor. Failing here is clearer
            than returning a shorter vector that only breaks later in the scaler.
    """
    unknown = [name for name in feature_names if name not in FEATURE_EXTRACTORS]
    if unknown:
        raise KeyError(f"No extractor registered for feature(s): {unknown}")
    return [FEATURE_EXTRACTORS[name](text) for name in feature_names]


testdata = pd.read_csv('undetectable.csv')['text'].to_list()

# Source switch: 1 -> read Wikipedia/human{i}.txt for i = 1..20,
#                0 -> take testdata[i] for i = 0..3.
lower = 1
for i in range(lower, 21):

    if lower == 1:
        # Alternative corpora used in earlier runs:
        # with open(f'human/human{i}.txt', 'r') as file:
        #     text = file.read()
        # with open(f'gpt/gpt3.5-{i}.txt', 'r') as file:
        #     text = file.read()
        # with open(f'humanWikipedia/article{i}.txt', 'r') as file:
        #     text = file.read()
        with open(f'Wikipedia/human{i}.txt', 'r') as file:
            text = file.read()

    if lower == 0:
        text = testdata[i]
        if i > 3:
            break

    features = extract_features(text, x_features)

    # IMPORTANT: scale with the same scaler that was fitted on the training data.
    # Passing a one-row DataFrame with the feature names (instead of a bare list) lets
    # scikit-learn check them against the names seen at fit time, so a stale x_features
    # is reported instead of being scaled with the wrong statistics.
    features_scaled = scaler.transform(pd.DataFrame([features], columns=x_features))

    # Unscaled features
    print("Index: ", i)
    print(f"Features: {features}")

    # Make a prediction with the scaled features
    verdict = mlp_clf.predict(features_scaled)
    proba = mlp_clf.predict_proba(features_scaled)

    print(f"Scaled Features: {features_scaled}")
    print(f"Verdict: {verdict[0]}")
    print(f"Probability of being AI: {proba[0][1]*100}%")
    #print(text)

# average_length = pd.read_csv('megaset4.csv')['length'].mean()
# print(average_length)

# with open(f'input.txt', 'r') as file:
#         text = file.read()
# features = []
# for feature in x_features:
#         if (feature == 'complexity'):
#             features.append(calculate_complexity(text))
#         if (feature == 'prevalence'):
#             features.append(get_prevalence_text(text))
#         if (feature == 'bigram_prevalence'):
#             features.append(get_text_bigram_prevalence(text))
#         if (feature == 'trigram_prevalence'):
#             features.append(get_text_trigram_prevalence(text))
#         if (feature == 'sentence_std'):
#             features.append(get_text_sentence_std(text))
#         if (feature == 'readability'):
#             features.append(textstat.flesch_reading_ease(text))
#         if (feature == 'average_prevalence'):
#             features.append(get_text_average_prevalence(text))
#         if (feature == 'median_prevalence'):
#             features.append(get_text_median_prevalence(text))
#         if (feature == 'prevalence_std'):
#             features.append(get_text_prevalence_std(text))
#         if (feature == 'variance'):
#             features.append(get_text_variance(text))
#         if (feature == 'number_percentage'):
#             features.append(get_text_number_percentage(text))
#         if (feature == 'bigram_prevalence_std'):
#             features.append(get_text_bigram_prevalence_std(text))
#         if (feature == 'bigram_prevalence_mean'):
#             features.append(get_text_bigram_prevalence_mean(text))
#         if (feature == 'bigram_prevalence_median'):
#             features.append(get_text_bigram_prevalence_median(text))
#         if (feature == 'paragraph_std'):
#             features.append(get_paragraph_std(text))
#         if (feature == 'length'):
#             features.append(get_text_length(text))
#         if (feature == 'median_complexity'):
#             features.append(get_text_median_complexity(text))
#         if (feature == 'percentage_of_words_in_prevalence_table'):
#             features.append(get_text_percentage_of_words_in_prevalence_table(text))
#         if (feature == 'average_paragraph_length'):
#             features.append(get_average_paragraph_length(text))
# features_scaled = scaler.transform([features])  # Note: scaler.transform expects a 2D array
# verdict = mlp_clf.predict(features_scaled)
# proba = mlp_clf.predict_proba(features_scaled)
# print(f"Features: {features}")
# print(f"Scaled Features: {features_scaled}")
# print(f"Verdict: {verdict[0]}")
# print(f"Probability of being AI: {proba[0][1] * 100:.2f}%")
# print (text)
# print("prev: ",get_text_trigram_prevalence(text))
# prev = three_word_prevalence_dict.get(' '.join(('and', 'personal', 'growth')), 0)
# print(prev)
# trigram dic




Index:  1
Features: [-12808.140878794833, -4053.6793170306305, -484.9661406942898, 625.1712575531878, 22248.590292647506, 6408, -1.016281438202946]
Scaled Features: [[-5.46844835 -1.14440575 -0.51604874 -0.28329656  1.18392423 13.18368489
  -0.70669057]]
Verdict: 0
Probability of being AI: 1.6605755220559422e-11%
Index:  2
Features: [-4691.779076077079, 2884.082882912085, 2360.771229856527, 1061.134394741611, 21414.527789215183, 6856, 1.0216708560153416]
Scaled Features: [[-1.9822257   0.35188094  0.04774961 -0.15356701  0.79451609 14.17413678
   0.87446287]]
Verdict: 0
Probability of being AI: 1.544028223173057e-16%
Index:  3
Features: [-1542.7132900745994, -1931.2383014200793, 300.3099655288885, 257.61573943797185, 20527.8136592938, 3451, -1.1282367307056391]
Scaled Features: [[-0.62960679 -0.68665293 -0.3604696  -0.39267006  0.38052589  6.64626024
  -0.79355153]]
Verdict: 0
Probability of being AI: 1.1365966012386766e-08%




Index:  4
Features: [-3663.9186547272866, -2821.3925225591684, -662.0823412919108, 21.024077482313828, 19244.224448780817, 1785, -1.2172971957531755]
Scaled Features: [[-1.54072859 -0.87863499 -0.55113906 -0.46307264 -0.21875785  2.96301726
  -0.86264945]]
Verdict: 0
Probability of being AI: 0.001290113902777759%
Index:  5
Features: [-13943.613036347511, -3245.3636739501617, -1328.8871879376788, -648.060341919317, 20866.554375052463, 2762, -1.7102337843039426]
Scaled Features: [[-5.95616795 -0.97007404 -0.68324664 -0.66217209  0.53867757  5.12299829
  -1.24509627]]
Verdict: 0
Probability of being AI: 2.510239843376181e-13%
Index:  6
Features: [-5688.815270794946, -2199.394987049079, -877.7091003486378, -173.51923747916678, 20185.20629411792, 1502, -1.732188690249852]
Scaled Features: [[-2.41048287 -0.74448703 -0.5938591  -0.52096289  0.22056893  2.33735234
  -1.26213008]]
Verdict: 0
Probability of being AI: 0.00015699489460628045%
Index:  7
Features: [-532.2089078506299, -901.678187143



## Predict and save misclassifications


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import os
import shutil
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Function to clean directories
def prepare_directory(path):
    """Leave `path` as an existing, empty directory.

    Anything already present at `path` is removed recursively first, then the
    directory (including missing parents) is created again.
    """
    stale = os.path.exists(path)
    if stale:
        # Drop leftovers from a previous run before recreating the folder
        shutil.rmtree(path)
    os.makedirs(path)

# Function to save texts
def save_texts(texts, indices, directory):
    """Write the selected texts to `directory` as text_1.txt, text_2.txt, ...

    Args:
        texts: pandas Series of strings.
        indices: index *labels* of `texts` (e.g. the `.index` of a filtered
            DataFrame). Lookup is label-based, so it stays correct after rows
            were filtered out and labels no longer equal row positions; for an
            untouched default RangeIndex labels and positions coincide.
        directory: existing directory that receives the files.

    Files are numbered by position in `indices`, starting at 1, and written as UTF-8.
    """
    for i, index in enumerate(indices, start=1):
        target = os.path.join(directory, f"text_{i}.txt")
        with open(target, 'w', encoding='utf-8') as file:
            # .loc, not .iloc: `index` is a label, not a row position
            file.write(texts.loc[index])

# This cell neither reloads data nor retrains. It reuses train_data, test_data, scaler,
# x_features, X_train_scaled and mlp_clf as they exist in the kernel, so the cell that
# loads the data and fits the model must have been run first.
# To evaluate a different architecture instead, rebuild the model here, e.g.:
# mlp_clf = MLPClassifier(hidden_layer_sizes=(150), activation='relu', solver='adam', max_iter=400, random_state=12)
# mlp_clf.fit(X_train_scaled, y_train)

# Training target (the scaled training features X_train_scaled are reused as-is)
y_train = train_data['generated']

# Prepare the features and target for the testing data
X_test = test_data[x_features]
X_test_scaled = scaler.transform(X_test)
y_test = test_data['generated']

# Make predictions
y_pred_test = mlp_clf.predict(X_test_scaled)
y_pred_train = mlp_clf.predict(X_train_scaled)

# Evaluate accuracy
accuracy_test = accuracy_score(y_test, y_pred_test)
accuracy_train = accuracy_score(y_train, y_pred_train)

# Output accuracies
print(f"Accuracy on the training set: {accuracy_train * 100:.2f}%")
print(f"Accuracy on the test set: {accuracy_test * 100:.2f}%")

# Confusion matrix, flattened to (tn, fp, fn, tp).
# labels=[0, 1] keeps the matrix 2x2 even when one class is missing from both
# y_test and y_pred_test; otherwise the four-way unpacking raises ValueError.
conf_matrix_test = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
tn, fp, fn, tp = conf_matrix_test.ravel()

# Error rates; a rate is reported as 0 when its class is absent from the test data
false_positive_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
false_negative_rate = fn / (fn + tp) if (fn + tp) > 0 else 0
print(f"False Positive Rate on test data: {false_positive_rate * 100:.2f}%")
print(f"False Negative Rate on test data: {false_negative_rate * 100:.2f}%")
print(f"Average error rate on test data: {(false_positive_rate*100 + false_negative_rate*100) / 2 :.2f}%")
print("Confusion Matrix for the test data:")
print(conf_matrix_test)

# Start from empty output folders so files from an earlier run cannot linger
for out_dir in ('false_positives', 'false_negatives'):
    prepare_directory(out_dir)

# False positive: labelled 0 but predicted 1; false negative: labelled 1 but predicted 0.
# The resulting *_indices hold index labels of test_data.
is_false_positive = (y_test == 0) & (y_pred_test == 1)
is_false_negative = (y_test == 1) & (y_pred_test == 0)
fp_indices = test_data[is_false_positive].index
fn_indices = test_data[is_false_negative].index

# Write each misclassified text into the folder of its error type
essay_texts = test_data['text']
save_texts(essay_texts, fp_indices, 'false_positives')
save_texts(essay_texts, fn_indices, 'false_negatives')


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
  
# Load the dataset
path = "final_test2.csv"  # replace with your actual path
dataset = pd.read_csv(path)

# Column shown on the x-axis; 'prevalence' is always on the y-axis
column2 = 'fivegram_prevalence'

plt.figure(figsize=(10, 6))

# One scatter layer per class, human first and AI second, so the AI points are
# drawn on top and the legend keeps the same order.
for class_value, colour, legend_text in (
    (0, 'blue', 'Human (generated=0)'),
    (1, 'red', 'AI (generated=1)'),
):
    subset = dataset[dataset['generated'] == class_value]
    plt.scatter(subset[column2], subset['prevalence'],
                c=colour, label=legend_text, alpha=0.6)

plt.title(f'{column2} vs Prevalence by Origin')
plt.xlabel(column2)
plt.ylabel('Prevalence')
plt.legend()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
  
# Load the dataset.
# "final_test2.csv" was plotted in earlier runs; this cell now shows the training set.
path = "megaset4.csv"  # replace with your actual path
dataset = pd.read_csv(path)

# Column shown on the x-axis; 'prevalence' is always on the y-axis
column2 = 'bigram_prevalence'

plt.figure(figsize=(10, 6))

# One scatter layer per class, human first and AI second, so the AI points are
# drawn on top and the legend keeps the same order.
for class_value, colour, legend_text in (
    (0, 'blue', 'Human (generated=0)'),
    (1, 'red', 'AI (generated=1)'),
):
    subset = dataset[dataset['generated'] == class_value]
    plt.scatter(subset[column2], subset['prevalence'],
                c=colour, label=legend_text, alpha=0.6)

plt.title(f'{column2} vs Prevalence by Origin')
plt.xlabel(column2)
plt.ylabel('Prevalence')
plt.legend()
plt.show()

In [None]:
%matplotlib widget

In [None]:
%matplotlib widget
import matplotlib.pyplot as plt
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

# Load the dataset
path = "final_test2.csv"  # replace with your actual path
dataset_all = pd.read_csv(path)

# Plot a small random subset so the 3D view stays readable.
# min(...) avoids the ValueError that sample() raises when the file has fewer rows
# than requested; random_state makes the figure identical on every run.
SAMPLE_SIZE = 50
dataset = dataset_all.sample(n=min(SAMPLE_SIZE, len(dataset_all)), random_state=42)

# Creating a 3D plot
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')

# x-axis and z-axis columns; 'prevalence' is on the y-axis
column2 = 'bigram_prevalence'
column3 = 'trigram_prevalence'

# One scatter layer per class, human first and AI second (same legend order as the 2D plots)
for class_value, colour, legend_text in (
    (0, 'blue', 'Human (generated=0)'),
    (1, 'red', 'AI (generated=1)'),
):
    subset = dataset[dataset['generated'] == class_value]
    ax.scatter(subset[column2], subset['prevalence'], subset[column3],
               c=colour, label=legend_text, alpha=0.6)

ax.set_title(f'{column2}, {column3} vs Prevalence by Origin')
ax.set_xlabel(column2)
ax.set_ylabel('Prevalence')
ax.set_zlabel(column3)
ax.legend()

plt.show()

