In [8]:
import pandas as pd
import numpy as np
import re
import complex

# Define the provided functions here

def clean_word(word):
    """
    Lowercases a token and strips punctuation that is not enclosed by word characters.

    Parameters:
    - word: The token to clean. Non-string values are converted with str() first.

    Returns:
    - The cleaned token; an empty string when the token held nothing but punctuation.
    """
    # Coerce first so a non-string value (e.g. a number) does not raise on .lower()
    word = str(word).lower()
    # Drop any of ' " ? ! ; : , . that is not preceded by a word character or not
    # followed by one; punctuation enclosed by word characters (e.g. "don't") is kept.
    cleaned_word = re.sub(r'(?<!\w)[\'\"?!;:,.]|(?<=\s)[\'\"?!;:,.]|[\'\"?!;:,.](?!\w)', '', word)
    return cleaned_word

def clean_text(text):
    """
    Splits a text on whitespace and cleans each token with clean_word.

    Parameters:
    - text: The text to tokenize. Non-string values are converted with str() first.

    Returns:
    - A list of the cleaned tokens in their original order, omitting tokens that
      clean to an empty string.
    """
    # Coerce first so a non-string value does not raise on .split()
    tokens = str(text).split()
    # Clean every token exactly once, then discard the ones that ended up empty
    cleaned_words = (clean_word(token) for token in tokens)
    return [word for word in cleaned_words if word]

def calculate_complexity(text: str, dic: dict) -> float:
    """
    Scores a text as the median of the dictionary values of its known words.

    Parameters:
    - text: The text to score. None and float NaN are treated as an empty text;
      any other non-string value is converted with str().
    - dic: Mapping from cleaned, lower-cased word to a numeric value.

    Returns:
    - The median value over every word occurrence found in dic (words mapped to
      None are skipped), or 0.0 when no word matches.
    """
    # A missing value must not be stringified: str(nan) would be looked up as the word "nan"
    if text is None or (isinstance(text, float) and text != text):
        return 0.0

    values = []
    for word in clean_text(str(text)):
        value = dic.get(word)  # single lookup instead of get() followed by indexing
        if value is not None:
            values.append(value)

    if not values:
        return 0.0  # no word of the text is in the dictionary
    return float(np.median(values))


# Build the word lookup with the project-local `complex` module. The result is passed as
# the `dic` argument of calculate_complexity, and a later cell reads the global `words`
# without redefining it, so this cell has to run first.
# NOTE(review): the file format and value semantics of read_file_to_dic are not visible here — confirm.
words = complex.read_file_to_dic('vocabulary/top_english_verbs_lower_100000.txt')

# Load the datasets
df1 = pd.read_csv('Essays/ai_generated_essays_llm_detect_kaggle.csv')


# Every transformation below is commented out, so as written this cell only reads the
# CSV and writes it back to the same path.

# Standardize column names
# df1.rename(columns={'label': 'generated'}, inplace=True)

# # add 'complexity' column
# df1['complexity'] = df1['text'].apply(lambda x: calculate_complexity(x, words))

# # add generated column (make them all equal to 0)
# # df1['generated'] = 0

# df1 = df1[['text', 'generated', 'complexity']]

# # Sort the dataset by 'complexity' in ascending order
# df1 = df1.sort_values(by='complexity', ascending=True)

# Save the dataset, overwriting the file it was loaded from
df1.to_csv('Essays/ai_generated_essays_llm_detect_kaggle.csv', index=False)


In [11]:
import pandas as pd

# Convert a Parquet dataset to CSV.

# Specify the path to your Parquet file
parquet_file_path = 'valid_essays.parquet'

# Specify the path where you want to save the CSV file
# NOTE(review): the input is named "valid" but the output "test" — confirm this renaming is intended.
csv_file_path = 'test_essays.csv'

# Read the Parquet file
df = pd.read_parquet(parquet_file_path)

# Save the dataframe to a CSV file (index=False leaves the row index out of the file)
df.to_csv(csv_file_path, index=False)

print(f'CSV file has been saved to {csv_file_path}')


CSV file has been saved to test_essays.csv


In [11]:
import pandas as pd
import numpy as np
import re
import complex

# Define the provided functions here

def clean_word(word):
    """
    Normalizes a single token: string conversion, lower-casing, punctuation stripping.

    Parameters:
    - word: The token to normalize; converted with str() before processing.

    Returns:
    - The normalized token, which is empty when the token held nothing but punctuation.
    """
    lowered = str(word).lower()
    # Removes ' " ? ! ; : , . wherever the character lacks a word character on its left
    # or on its right; punctuation with word characters on both sides stays in place.
    return re.sub(r'(?<!\w)[\'\"?!;:,.]|(?<=\s)[\'\"?!;:,.]|[\'\"?!;:,.](?!\w)', '', lowered)

def clean_text(text):
    """
    Splits a text on whitespace and cleans each token with clean_word.

    Parameters:
    - text: The text to tokenize; converted with str() before splitting.

    Returns:
    - A list of the cleaned tokens in their original order, omitting tokens that
      clean to an empty string.
    """
    tokens = str(text).split()
    # Clean every token exactly once, then discard the ones that ended up empty
    cleaned_words = (clean_word(token) for token in tokens)
    return [word for word in cleaned_words if word]

def calculate_complexity(text: str, dic: dict) -> float:
    """
    Scores a text as the median of the dictionary values of its known words.

    Parameters:
    - text: The text to score. None and float NaN are treated as an empty text;
      any other non-string value is converted with str().
    - dic: Mapping from cleaned, lower-cased word to a numeric value.

    Returns:
    - The median value over every word occurrence found in dic (words mapped to
      None are skipped), or 0.0 when no word matches.
    """
    # A missing value must not be stringified: str(nan) would be looked up as the word "nan"
    if text is None or (isinstance(text, float) and text != text):
        return 0.0

    values = []
    for word in clean_text(str(text)):
        value = dic.get(word)  # single lookup instead of get() followed by indexing
        if value is not None:
            values.append(value)

    if not values:
        return 0.0  # no word of the text is in the dictionary
    return float(np.median(values))



# Merge two essay datasets, score every essay with calculate_complexity and sort by score.
# This cell does not define `words`; it relies on the global assigned by the earlier cell
# that calls complex.read_file_to_dic.

# Load the datasets
df1 = pd.read_csv('megaset.csv')

# print count of human essays before merging

# print non numeric values in 'generated' column

df2 = pd.read_csv('Essays/ai_generated_essays_llm_detect_kaggle.csv')


# Class counts of the first dataset only (df2 is not included yet).
# NOTE(review): the labels imply generated == 0 means human and 1 means AI — confirm.
print("Before human", len(df1[df1['generated'] == 0]))
print("Before ai", len(df1[df1['generated'] == 1]))

percentage = 0.5  # Fraction of rows to drop (0.5 = 50%); only used by the commented-out down-sampling below, which targets 'generated' == 0

# # Determine how many rows to drop
# rows_to_drop = int(len(df1[df1['generated'] == 0]) * percentage)

# # Randomly choose indices of rows to drop
# indices_to_drop = np.random.choice(df1[df1['generated'] == 0].index, rows_to_drop, replace=False)

# # Drop those rows
# df1 = df1.drop(indices_to_drop)



# Combine the datasets (ignore_index=True renumbers the rows from 0)
combined_df = pd.concat([df1, df2], ignore_index=True)

# Calculate complexity for each essay
combined_df['complexity'] = combined_df['text'].apply(lambda x: calculate_complexity(x, words))

# Keep only the columns of interest.
# NOTE(review): 'variance' and 'length' are not created in this cell, so they must already
# exist in at least one of the input CSVs or this selection raises KeyError — confirm.
combined_df = combined_df[['text', 'generated', 'complexity', 'variance', 'length']]

# Sort the combined dataset by 'complexity' in ascending order
combined_df = combined_df.sort_values(by='complexity', ascending=True)

# Print the class counts after merging
print("After: human ", len(combined_df[combined_df['generated'] == 0]))
print("After: ai ", len(combined_df[combined_df['generated'] == 1]))



# Save the combined dataset (currently disabled, so the merged frame only lives in memory)
# combined_df.to_csv('merged.csv', index=False)

Before human 23060
Before ai 10265
After:  23060
After:  26011


In [6]:
import pandas as pd
# Load the corpus, print its column summary and write it out under a new file name.
df = pd.read_csv('persuade_corpus_1.0.csv')
# The column normalization below is commented out, so the frame is written unchanged.
# df.rename(columns={'full_text': 'text'}, inplace=True)
# df['generated'] = 0
# df = df[['text', 'generated']]
df.info()
df.to_csv('persuade_corpus.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 285383 entries, 0 to 285382
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   text       285383 non-null  object
 1   generated  285383 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 4.4+ MB


In [14]:
import pandas as pd

def calculate_text_overlap_percentage(path_to_set1, path_to_set2, text_column='text'):
    """
    Calculates the percentage of distinct texts in set1 that are also in set2.

    Parameters:
    - path_to_set1: The file path to the first dataset (CSV).
    - path_to_set2: The file path to the second dataset (CSV).
    - text_column: The name of the column containing the texts. Defaults to 'text'.

    Returns:
    - The percentage (0 to 100) of distinct texts in set1 that are also in set2,
      or 0.0 when set1 contains no texts at all.
    """
    # Load the datasets
    set1 = pd.read_csv(path_to_set1)
    set2 = pd.read_csv(path_to_set2)

    # Work on distinct texts so that a text repeated in set1 is counted once
    texts_set1 = set(set1[text_column])
    if not texts_set1:
        # Nothing to compare; dividing by len(texts_set1) would raise ZeroDivisionError
        return 0.0
    texts_set2 = set(set2[text_column])

    # Determine the number of texts in set1 that are also in set2
    overlap_count = len(texts_set1.intersection(texts_set2))

    # Calculate the percentage
    return (overlap_count / len(texts_set1)) * 100


import pandas as pd

def remove_common_texts(path_to_set1, path_to_set2, path_to_modified_set2=None, text_column='text'):
    """
    Removes from the second dataset every row whose text also appears in the first dataset.

    Parameters:
    - path_to_set1: The file path to the first dataset (CSV).
    - path_to_set2: The file path to the second dataset (CSV).
    - path_to_modified_set2: Optional. The file path where the modified second dataset should be saved.
    - text_column: The name of the column containing the texts. Defaults to 'text'.

    Returns:
    - The modified second dataset as a DataFrame, whether or not it was also saved.
    """
    # Load the datasets
    set1 = pd.read_csv(path_to_set1)
    set2 = pd.read_csv(path_to_set2)

    # Keep only the rows of set2 whose text does not occur anywhere in set1
    modified_set2 = set2[~set2[text_column].isin(set1[text_column])]

    # Optionally, save the modified second dataset to a file
    if path_to_modified_set2:
        modified_set2.to_csv(path_to_modified_set2, index=False)

    # Always hand the frame back so the result does not depend on whether it was saved
    return modified_set2
    

def add_unique_texts_from_set1_to_set2(path_to_set1, path_to_set2, path_to_modified_set2, text_column='text'):
    """
    Appends to the second dataset the rows of the first dataset whose text it does not contain yet.

    Parameters:
    - path_to_set1: The file path to the first dataset (CSV).
    - path_to_set2: The file path to the second dataset (CSV).
    - path_to_modified_set2: The file path where the modified second dataset should be saved.
    - text_column: The name of the column containing the texts. Defaults to 'text'.

    Returns:
    - None. The combined dataset is written to path_to_modified_set2.
    """
    # Load the datasets
    set1 = pd.read_csv(path_to_set1)
    set2 = pd.read_csv(path_to_set2)

    # Rows of set1 whose text is absent from set2. Texts repeated inside set1 itself
    # are all carried over; only the comparison against set2 is applied here.
    unique_texts_to_set1 = set1[~set1[text_column].isin(set2[text_column])]

    # Put the existing set2 rows first, followed by the new rows, with a fresh 0..n-1 index
    modified_set2 = pd.concat([set2, unique_texts_to_set1], ignore_index=True)

    # Save the modified second dataset to the file
    modified_set2.to_csv(path_to_modified_set2, index=False)

    
def calculate_duplicate_percentage(df, text_column='text'):
    """
    Calculates the percentage of rows whose text repeats an earlier row of the DataFrame.

    Parameters:
    - df: The DataFrame to analyze.
    - text_column: The name of the column containing the texts. Defaults to 'text'.

    Returns:
    - The percentage (0 to 100) of rows that are repeats, or 0.0 for an empty DataFrame.
    """
    total_texts = len(df)
    if total_texts == 0:
        # No rows means no duplicates; dividing by total_texts would raise ZeroDivisionError
        return 0.0

    # duplicated() flags every occurrence after the first, so the flag count equals
    # the number of rows minus the number of distinct texts
    duplicate_texts = int(df[text_column].duplicated().sum())
    return (duplicate_texts / total_texts) * 100



def remove_duplicates(df, text_column='text'):
    """
    Drops every row whose text already appeared in an earlier row.

    Parameters:
    - df: The DataFrame to process.
    - text_column: The name of the column containing the texts. Defaults to 'text'.

    Returns:
    - A new DataFrame holding only the first occurrence of each text.
    """
    # Flag each row that repeats a text seen further up in the frame
    is_repeat = df.duplicated(subset=[text_column], keep='first')

    # Select the rows that are not repeats
    return df[~is_repeat]


# Example usage
# Assuming 'dataset' is your DataFrame and it has a column named 'text'
#
# What the live (uncommented) statements of this cell do:
#   1. report the overlap of path_to_set1 with `destination`,
#   2. report the duplicate percentage of path_to_set1 before and after remove_duplicates,
#   3. overwrite path_to_set1 with the de-duplicated frame.
# Everything else is a commented-out earlier experiment.


# Example usage

# Or, if you want to work with the modified DataFrame directly:
# modified_set2 = remove_common_texts(path_to_set1, path_to_set2)
# print(modified_set2)


# Example usage
# path_to_set2 = 'train_v2_drct_02.csv'
path_to_set1 = "train_essays.csv"
# path_to_set1 = 'train_v2_drct_02.csv'



# path_to_set1 = 'metadataset.csv'
# # path_to_set1 = "megaset2.csv"
# df1 = pd.read_csv(path_to_set1)


# # calculate ai count
# print("Before ai", len(df1[df1['generated'] == 1]))
# print("Before human", len(df1[df1['generated'] == 0]))







# # path_to_set2 = "human.csv"
# path_to_set2 is only referenced by commented-out code in this cell; the live overlap
# check below uses `destination`, which holds the same file name.
path_to_set2 = 'megaset4.csv'
# df2 = pd.read_csv(path_to_set2)

# # # calculate ai count
# print("Before ai", len(df2[df2['generated'] == 1]))
# print("Before human", len(df2[df2['generated'] == 0]))




# # Report how much of set1 already appears in set2 (this computes a number; it saves nothing)
# percentage = calculate_text_overlap_percentage(path_to_set1, path_to_set2)
# print(f"Percentage of texts in set1 that are in set2: {percentage:.2f}%")



destination = 'megaset4.csv'
# df = pd.read_csv(path_to_set2)

# add_unique_texts_from_set1_to_set2(path_to_set1, path_to_set2, destination)



# percentage_duplicates = calculate_duplicate_percentage(df, 'text')
# print(f"Percentage of duplicate texts: {percentage_duplicates:.2f}%")



# Share of the distinct texts of path_to_set1 that also occur in `destination`
percentage = calculate_text_overlap_percentage(path_to_set1, destination)
print(f"Percentage of texts in set1 that are in set2: {percentage:.2f}%")
# df = pd.read_csv(destination)
# df = remove_duplicates(df)
# df.to_csv(path_to_set1, index=False)

# Duplicate percentage of set1 before de-duplication
df = pd.read_csv(path_to_set1)
print(f"Percentage of duplicate texts: {calculate_duplicate_percentage(df, 'text'):.2f}%")
#df = remove_duplicates(df)

# De-duplicate and report again
df = remove_duplicates(df)
print(f"Percentage of duplicate texts: {calculate_duplicate_percentage(df, 'text'):.2f}%")

# Overwrites the input file with the de-duplicated frame, so re-running this cell
# starts from the already de-duplicated data
df.to_csv(path_to_set1, index=False)


# path_to_set1 = 'train_v2_drct_02.csv'
# path_to_set2 = 'final_test.csv'


Percentage of texts in set1 that are in set2: 19.73%
Percentage of duplicate texts: 4.51%
Percentage of duplicate texts: 0.00%
