In [9]:
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

stemmer = PorterStemmer()

def preprocess_text(text):
    text = re.sub(r'[\^\\w\\s]', '', text.lower())
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return words, stemmed_words

def get_text_stats(texts):
    sentence_counts, sentence_lengths, text_lengths = [], [], []
    all_words, all_stemmed_words = [], []
    for text in texts:
        sentences = text.split('.')
        sentence_counts.append(len(sentences))
        for sentence in sentences:
            sentence_lengths.append(len(sentence.split()))
        text_words, text_stemmed_words = preprocess_text(text)
        text_lengths.append(len(text_words))
        all_words.extend(text_words)
        all_stemmed_words.extend(text_stemmed_words)
    word_count = len(all_words)
    unique_word_count = len(set(all_words))
    stemmed_word_count = len(Counter(all_stemmed_words))
    stemmed_unique_word_count = len(set(all_stemmed_words))
    ttr = unique_word_count / word_count  # estimating lexical diversity
    return {
        'sentence_count': round(np.mean(sentence_counts), 2),
        'sentence_length': round(np.mean(sentence_lengths), 2),
        'text_length': round(np.mean(text_lengths), 2),
        'word_count': word_count,
        'unique_word_count': unique_word_count,
        'stemmed_word_count': stemmed_word_count,
        'stemmed_unique_word_count': stemmed_unique_word_count,
        'ttr': ttr
    }

data = pd.read_csv('all_data.csv')
human_texts = data[data.labels == 0]['text'].values
chatgpt_texts = data[data.labels == 1]['text'].values

human_stats = get_text_stats(human_texts)
chatgpt_stats = get_text_stats(chatgpt_texts)

print(f"Human unique word count: {human_stats['unique_word_count']}, ChatGPT unique word count: {chatgpt_stats['unique_word_count']}")
print(f"Human word count: {human_stats['word_count']}, ChatGPT word count: {chatgpt_stats['word_count']}")
print(f"Human stemmed unique word count: {human_stats['stemmed_unique_word_count']}, ChatGPT stemmed unique word count: {chatgpt_stats['stemmed_unique_word_count']}")
print(f"Human stemmed word count: {human_stats['stemmed_word_count']}, ChatGPT stemmed word count: {chatgpt_stats['stemmed_word_count']}")
print(f"Human sentence count: {human_stats['sentence_count']}, ChatGPT sentence count: {chatgpt_stats['sentence_count']}")
print(f"Human sentence length: {human_stats['sentence_length']}, ChatGPT sentence length: {chatgpt_stats['sentence_length']}")
print(f"Human text length: {human_stats['text_length']}, ChatGPT text length: {chatgpt_stats['text_length']}")
print(f"Human TTR: {human_stats['ttr']}, ChatGPT TTR: {chatgpt_stats['ttr']}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Human unique word count: 11600, ChatGPT unique word count: 13478
Human word count: 245066, ChatGPT word count: 993511
Human stemmed unique word count: 10863, ChatGPT stemmed unique word count: 12582
Human stemmed word count: 10863, ChatGPT stemmed word count: 12582
Human sentence count: 12.73, ChatGPT sentence count: 12.52
Human sentence length: 8.78, ChatGPT sentence length: 9.02
Human text length: 111.39, ChatGPT text length: 112.9
Human TTR: 0.047334187524993264, ChatGPT TTR: 0.013566029968465372
