In [1]:
pip install nltk




In [2]:
pip install scikit-learn




In [3]:
import pandas as pd

# Read CSV file into DataFrame
file_path = 'Dataset.csv'
df = pd.read_csv(file_path)

# combine the text columns
df_combined = df.melt(id_vars=['Domain'], value_vars=['Human-generated text', 'ChatGPT-generated text', 'Mixed text'], value_name='All Text')


In [4]:
# @title Remove Citations & numbers
import re

def clean_text(texts):
    cleaned_text = [re.sub(r'\(.*?\)|\[\d+(-\d+)?\]|\s\d+.*?\s|\s\d+', '', text) for text in texts]
    return cleaned_text

df_combined['Clean Text'] = clean_text(df_combined['All Text'])
print(df_combined['Clean Text'])

0       WordNet reflects the relationship between word...
1       Surprisingly perhaps, it was not untilthat the...
2       Astronomy is without doubt the empirical scien...
3       Galaxy clusters represent the largest class of...
4       More puzzling are diffuse extended radio sourc...
                              ...                        
2995    As a measure of how divided the country can be...
2996    In contrast to refugees, who have crossed an i...
2997    The use of epidemiology in documenting the mor...
2998    Haiti, situated in the Northern hemisphere, st...
2999    The recognition of Posttraumatic Stress Disord...
Name: Clean Text, Length: 3000, dtype: object


In [8]:
# @title Min & Max Paragraphs Length

# Define function to calculate word count statistics
def word_stats(text_column):
    max_lengths = []
    min_lengths = []

    for text in text_column:
        paragraphs = text.split('\n')
        # Remove any empty or whitespace-only paragraphs
        paragraphs = [p for p in paragraphs if p.strip()]
        word_counts = [len(paragraph.split()) for paragraph in paragraphs]

        # Determine the max and min word count for the text and add to the lists
        max_lengths.append(max(word_counts, default=0))
        min_lengths.append(min(word_counts, default=0))
    return max(max_lengths), min(min_lengths)

# Print overall max and min paragraph lengths
max_length, min_length = word_stats(df_combined['Clean Text'])
print("Overall:")
print(f"Max Paragraph Length: {max_length} words")
print(f"Min Paragraph Length: {min_length} words")

# Print max and min paragraph lengths per domain
for domain, group in df_combined.groupby('Domain'):
    max_length, min_length = word_stats(group['Clean Text'])
    print("\nDomain:", domain)
    print(f"Max Paragraph Length: {max_length} words")
    print(f"Min Paragraph Length: {min_length} words")


Overall:
Max Paragraph Length: 490 words
Min Paragraph Length: 1 words

Domain: Astrophysics and Astronomy
Max Paragraph Length: 287 words
Min Paragraph Length: 1 words

Domain: Climate Science and Environmental Studies
Max Paragraph Length: 490 words
Min Paragraph Length: 1 words

Domain: Computer Science and Artificial Intelligence
Max Paragraph Length: 430 words
Min Paragraph Length: 1 words

Domain: Genetics and Genomics
Max Paragraph Length: 419 words
Min Paragraph Length: 1 words

Domain: Materials Science and Engineering
Max Paragraph Length: 295 words
Min Paragraph Length: 13 words

Domain: Mathematics and Statistics
Max Paragraph Length: 239 words
Min Paragraph Length: 1 words

Domain: Medical Research and Healthcare
Max Paragraph Length: 320 words
Min Paragraph Length: 1 words

Domain: Natural Language Processing
Max Paragraph Length: 402 words
Min Paragraph Length: 1 words

Domain: Neuroscience and Psychology
Max Paragraph Length: 343 words
Min Paragraph Length: 1 words

Dom

In [6]:
# @title Calculate AVG Paragraphs len.
from scipy.stats import zscore

def paragraph_lengths(text_column):
    lengths = []
    for text in text_column:
        paragraphs = text.split('\n')
        paragraphs = [p.strip() for p in paragraphs if p.strip()]  # remove blank lines
        lengths.extend([len(p.split()) for p in paragraphs])
    return lengths

# Get the paragraph lengths for all the data
all_lengths = paragraph_lengths(df_combined['Clean Text'])
overall_avg_length = sum(all_lengths) / len(all_lengths) if all_lengths else 0

# Get the paragraph lengths for each domain
domain_avg_lengths = []
for domain, group in df_combined.groupby('Domain'):
    lengths = paragraph_lengths(group['Clean Text'])
    avg_length = sum(lengths) / len(lengths) if lengths else 0
    domain_avg_lengths.append(avg_length)


# Normalize using Z-score
all_avg_lengths = [overall_avg_length] + domain_avg_lengths
z_scores = zscore(all_avg_lengths)

print(f"Overall:")
print(f"Avg Paragraph Length: {overall_avg_length} words (Z-score {z_scores[0]})\n")

for domain, avg_length, z_score in zip(df_combined['Domain'].unique(), domain_avg_lengths, z_scores[1:]):
    print(f"Domain: {domain}")
    print(f"Avg Paragraph Length: {avg_length} words (Z-score {z_score})\n")



Overall:
Avg Paragraph Length: 34.87716992876409 words (Z-score -0.2961140621262857)

Domain: Medical Research and Healthcare
Avg Paragraph Length: 29.504332755632582 words (Z-score -0.5005848004156827)

Domain: Astrophysics and Astronomy
Avg Paragraph Length: 23.90556274256145 words (Z-score -0.7136537249328877)

Domain: Climate Science and Environmental Studies
Avg Paragraph Length: 30.55864069622876 words (Z-score -0.46046165578064274)

Domain: Genetics and Genomics
Avg Paragraph Length: 37.08015564202335 words (Z-score -0.21227639857606573)

Domain: Neuroscience and Psychology
Avg Paragraph Length: 101.66929133858268 words (Z-score 2.2457523552810312)

Domain: Natural Language Processing
Avg Paragraph Length: 23.447837150127228 words (Z-score -0.7310731044530804)

Domain: Mathematics and Statistics
Avg Paragraph Length: 21.28584729981378 words (Z-score -0.8133506138253066)

Domain: Computer Science and Artificial Intelligence
Avg Paragraph Length: 36.34679089026915 words (Z-score -

In [7]:
# @title Count uniqe words
from collections import Counter
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

def unique_words(text_column):
    return Counter(word.lower() for text in text_column for word in text.split() if word.lower() not in stopwords.words('english'))

# Get unique words for the entire DataFrame
unique_word_count = len(unique_words(df_combined['Clean Text']))

print("\nOverall:")
print(f"Number of Unique Words (excluding stopwords): {unique_word_count}")

# Print number of unique words per domain, excluding stopwords
for domain, group in df_combined.groupby('Domain'):
    unique_word_count = len(unique_words(group['Clean Text']))
    print("\nDomain:", domain)
    print(f"Number of Unique Words (excluding stopwords): {unique_word_count}")



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



Overall:
Number of Unique Words (excluding stopwords): 29151

Domain: Astrophysics and Astronomy
Number of Unique Words (excluding stopwords): 6790

Domain: Climate Science and Environmental Studies
Number of Unique Words (excluding stopwords): 7106

Domain: Computer Science and Artificial Intelligence
Number of Unique Words (excluding stopwords): 8266

Domain: Genetics and Genomics
Number of Unique Words (excluding stopwords): 5190

Domain: Materials Science and Engineering
Number of Unique Words (excluding stopwords): 3021

Domain: Mathematics and Statistics
Number of Unique Words (excluding stopwords): 1516

Domain: Medical Research and Healthcare
Number of Unique Words (excluding stopwords): 5540

Domain: Natural Language Processing
Number of Unique Words (excluding stopwords): 7126

Domain: Neuroscience and Psychology
Number of Unique Words (excluding stopwords): 5120

Domain: Social Sciences and Humanities
Number of Unique Words (excluding stopwords): 8908
