In [1]:
import requests
import os
import pandas as pd
import time
import random
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

class HumanTextCollector:
    """A class to collect human-written texts from various sources."""
    
    def __init__(self, output_dir="data/human_texts"):
        """Initialize the collector with an output directory."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
    
    def clean_text(self, text):
        """Clean and normalize text."""
        # Remove extra whitespace and normalize
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove URLs
        text = re.sub(r'https?://\S+', '', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s.,?!;:\'\"-]', '', text)
        return text
    
    def save_text(self, text, source, filename=None):
        """Save text to a file."""
        if not filename:
            # Create a filename based on source and random number
            filename = f"{source.replace(' ', '_').lower()}_{random.randint(1000, 9999)}.txt"
        
        file_path = os.path.join(self.output_dir, filename)
        
        # Save the text
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        return file_path
    
    def fetch_gutenberg_text(self, book_id, min_length=500, max_length=2000):
        """Fetch text from Project Gutenberg."""
        try:
            url = f"https://www.gutenberg.org/files/{book_id}/{book_id}-0.txt"
            response = requests.get(url)
            
            if response.status_code != 200:
                # Try alternative URL format
                url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
                response = requests.get(url)
                
            if response.status_code == 200:
                # Get the full text
                full_text = response.text
                
                # Remove Project Gutenberg header and footer
                start_marker = "*** START OF"
                end_marker = "*** END OF"
                
                if start_marker in full_text and end_marker in full_text:
                    content = full_text.split(start_marker)[1].split(end_marker)[0]
                else:
                    content = full_text
                
                # Split into paragraphs
                paragraphs = [p for p in content.split('\n\n') if len(p.strip()) > min_length]
                
                if paragraphs:
                    # Select a random paragraph of appropriate length
                    selected_texts = []
                    for _ in range(3):  # Get 3 samples from this book
                        paragraph = random.choice(paragraphs)
                        # Truncate if too long
                        if len(paragraph) > max_length:
                            paragraph = paragraph[:max_length]
                        selected_texts.append(self.clean_text(paragraph))
                    
                    return selected_texts
            
            return []
            
        except Exception as e:
            print(f"Error fetching Project Gutenberg text {book_id}: {e}")
            return []

    def collect_gutenberg_texts(self, book_ids=None, count=15):
        """Collect texts from Project Gutenberg books."""
        if not book_ids:
            # Popular classic books on Project Gutenberg
            book_ids = [
                1342,   # Pride and Prejudice
                11,     # Alice's Adventures in Wonderland
                84,     # Frankenstein
                1400,   # Great Expectations
                2701,   # Moby Dick
                1952,   # The Yellow Wallpaper
                74,     # The Adventures of Tom Sawyer
                98,     # A Tale of Two Cities
                345,    # Dracula
                1232,   # The Prince
                2600,   # War and Peace
                16328,  # Beowulf
                76,     # Adventures of Huckleberry Finn
                2814,   # Dubliners
                1661    # The Adventures of Sherlock Holmes
            ]
        
        saved_files = []
        
        print(f"Collecting text samples from Project Gutenberg...")
        for book_id in tqdm(book_ids):
            texts = self.fetch_gutenberg_text(book_id)
            
            for i, text in enumerate(texts):
                if text and len(saved_files) < count:
                    filename = f"gutenberg_{book_id}_{i+1}.txt"
                    file_path = self.save_text(text, f"gutenberg_{book_id}", filename)
                    saved_files.append(file_path)
                    
                if len(saved_files) >= count:
                    break
                    
        print(f"Collected {len(saved_files)} text samples from Project Gutenberg")
        return saved_files
    
    def fetch_wikipedia_text(self, title, min_length=500, max_length=2000):
        """Fetch text from Wikipedia article."""
        try:
            # Make API request to get the article content
            url = "https://en.wikipedia.org/w/api.php"
            params = {
                "action": "query",
                "format": "json",
                "titles": title,
                "prop": "extracts",
                "explaintext": True
            }
            
            response = requests.get(url, params=params)
            data = response.json()
            
            # Extract the page content
            pages = data["query"]["pages"]
            page_id = next(iter(pages))
            
            if "extract" in pages[page_id]:
                extract = pages[page_id]["extract"]
                
                # Split into paragraphs
                paragraphs = [p for p in extract.split('\n') if len(p.strip()) > min_length]
                
                if paragraphs:
                    # Select a suitable paragraph
                    paragraph = random.choice(paragraphs)
                    # Truncate if too long
                    if len(paragraph) > max_length:
                        paragraph = paragraph[:max_length]
                    
                    return self.clean_text(paragraph)
            
            return None
            
        except Exception as e:
            print(f"Error fetching Wikipedia article {title}: {e}")
            return None

    def collect_wikipedia_texts(self, topics=None, count=15):
        """Collect texts from Wikipedia articles."""
        if not topics:
            # Various topics from different domains
            topics = [
                "Artificial_intelligence",
                "Climate_change",
                "Quantum_mechanics",
                "Renaissance",
                "World_War_II",
                "Solar_System",
                "Evolution",
                "Democracy",
                "Industrial_Revolution",
                "Mathematics",
                "Psychology",
                "Internet",
                "Economics",
                "Agriculture",
                "Film",
                "Literature",
                "Biology",
                "Philosophy",
                "Music_theory",
                "Genetics"
            ]
        
        saved_files = []
        
        print(f"Collecting text samples from Wikipedia...")
        for topic in tqdm(topics):
            text = self.fetch_wikipedia_text(topic)
            
            if text:
                file_path = self.save_text(text, f"wikipedia_{topic}")
                saved_files.append(file_path)
                
            if len(saved_files) >= count:
                break
                
            # Be kind to Wikipedia API
            time.sleep(1)
                    
        print(f"Collected {len(saved_files)} text samples from Wikipedia")
        return saved_files
    
    def collect_all_texts(self, gutenberg_count=10, wikipedia_count=10):
        """Collect texts from all sources."""
        all_files = []
        
        # Collect from Project Gutenberg
        gutenberg_files = self.collect_gutenberg_texts(count=gutenberg_count)
        all_files.extend(gutenberg_files)
        
        # Collect from Wikipedia
        wikipedia_files = self.collect_wikipedia_texts(count=wikipedia_count)
        all_files.extend(wikipedia_files)
        
        print(f"\nTotal collected: {len(all_files)} human-written text samples")
        return all_files


def main():
    # Set up the collector
    collector = HumanTextCollector()
    
    # Collect human-written texts from various sources
    # Adjust the counts as needed
    collected_files = collector.collect_all_texts(
        gutenberg_count=15,
        wikipedia_count=10
    )
    
    print(f"\nHuman text collection complete!")
    print(f"Text files are stored in: {collector.output_dir}")
    
    # Display sample of collected texts
    if collected_files:
        print("\nSample of collected texts:")
        for file_path in random.sample(collected_files, min(3, len(collected_files))):
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                print(f"\n{os.path.basename(file_path)}:")
                print(f"{content[:150]}...")


if __name__ == "__main__":
    main()

Collecting text samples from Project Gutenberg...


100%|███████████████████████████████████████████| 15/15 [00:13<00:00,  1.12it/s]


Collected 15 text samples from Project Gutenberg
Collecting text samples from Wikipedia...


 45%|███████████████████▊                        | 9/20 [00:10<00:12,  1.16s/it]

Collected 10 text samples from Wikipedia

Total collected: 25 human-written text samples

Human text collection complete!
Text files are stored in: data/human_texts

Sample of collected texts:

wikipedia_artificial_intelligence_5432.txt:
On June 28, 2015, Google Photos's new image labeling feature mistakenly identified Jacky Alcine and a friend as "gorillas" because they were black. Th...

wikipedia_solar_system_5898.txt:
As of the 2020s, a few astronomers have hypothesized that Planet Nine a planet beyond Neptune might exist, based on statistical variance in the orbit ...

wikipedia_industrial_revolution_1806.txt:
In 1824 Joseph Aspdin, a British bricklayer turned builder, patented a chemical process for making portland cement, an important advance in the buildi...





In [2]:
import os
import requests
import json
import time
import pandas as pd
import random
from tqdm import tqdm

# API Configuration
API_KEY = "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"
API_URL = "https://api-inference.huggingface.co/models/"

# Available free LLMs to use
FREE_LLMS = {
    "mistral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "falcon": "tiiuae/falcon-7b-instruct",
    "bloomz": "bigscience/bloomz-p3",
    "gpt2": "gpt2-xl",
    "flan-t5": "google/flan-t5-xl"
}

class DataCollector:
    def __init__(self, api_key, models_dict):
        self.api_key = api_key
        self.models = models_dict
        self.headers = {"Authorization": f"Bearer {api_key}"}
        
    def query_model(self, model_name, prompt, max_retries=5, backoff_factor=2):
        """Query the specified model with error handling and backoff strategy"""
        model_id = self.models.get(model_name)
        if not model_id:
            raise ValueError(f"Model {model_name} not found in available models")
        
        api_url = f"{API_URL}{model_id}"
        
        # Different payload format based on model type
        if "mistral" in model_name or "Mixtral" in model_id:
            payload = {
                "inputs": f"<s>[INST] {prompt} [/INST]",
                "parameters": {"max_new_tokens": 256, "temperature": 0.7, "return_full_text": False}
            }
        elif "falcon" in model_name:
            payload = {
                "inputs": f"User: {prompt}\nAssistant:",
                "parameters": {"max_new_tokens": 256, "temperature": 0.7, "return_full_text": False}
            }
        elif "bloomz" in model_id:
            payload = {
                "inputs": prompt,
                "parameters": {"max_new_tokens": 256, "temperature": 0.7, "return_full_text": False}
            }
        elif "flan-t5" in model_id:
            payload = {
                "inputs": prompt,
                "parameters": {"max_new_tokens": 256, "temperature": 0.7, "return_full_text": False}
            }
        else:
            # Default format for other models
            payload = {
                "inputs": prompt,
                "parameters": {"max_new_tokens": 256, "temperature": 0.7, "return_full_text": False}
            }
        
        for retry in range(max_retries):
            try:
                response = requests.post(api_url, headers=self.headers, json=payload)
                
                if response.status_code == 200:
                    # Successfully got a response
                    if isinstance(response.json(), list):
                        generated_text = response.json()[0].get("generated_text", "")
                    else:
                        generated_text = response.json().get("generated_text", "")
                    
                    return generated_text.strip()
                
                elif response.status_code == 503:
                    # Model is loading
                    response_data = response.json() if response.content else {"estimated_time": backoff_factor * (2 ** retry)}
                    wait_time = response_data.get("estimated_time", backoff_factor * (2 ** retry))
                    print(f"Model {model_name} is loading. Waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                    continue
                    
                else:
                    # Other error
                    print(f"Error {response.status_code}: {response.text}")
                    # Try with a different error handling approach
                    if retry < max_retries - 1:
                        wait_time = backoff_factor * (2 ** retry)
                        print(f"Retrying in {wait_time} seconds...")
                        time.sleep(wait_time)
                    continue
            
            except Exception as e:
                print(f"Exception occurred: {e}")
                if retry < max_retries - 1:
                    wait_time = backoff_factor * (2 ** retry)
                    print(f"Retrying in {wait_time} seconds...")
                    time.sleep(wait_time)
        
        raise Exception(f"Failed to get response from {model_name} after {max_retries} retries")
    
    def generate_text(self, model_name, prompt_template, topic):
        """Generate text on a specific topic using the specified model"""
        full_prompt = prompt_template.format(topic=topic)
        return self.query_model(model_name, full_prompt)
    
    def regenerate_text(self, model_name, original_text):
        """Regenerate a given text using the specified model"""
        prompt = f"Please rewrite the following text in your own words while preserving the key information and tone:\n\n{original_text}\n\nRewritten text:"
        return self.query_model(model_name, prompt)


class DatasetBuilder:
    def __init__(self, collector):
        self.collector = collector
        self.human_texts = []
        self.ai_texts = []
        self.dataset = pd.DataFrame()
        
    def load_human_texts_from_files(self, directory_path):
        """Load human-written texts from text files in the specified directory"""
        if not os.path.exists(directory_path):
            os.makedirs(directory_path)
            print(f"Created directory {directory_path}. Please add human text files before proceeding.")
            return
            
        text_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
        if not text_files:
            print(f"No text files found in {directory_path}. Please add some human text files.")
            return
            
        for filename in text_files:
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    content = file.read().strip()
                    if len(content.split()) >= 30:  # Ensure the text has at least 30 words
                        self.human_texts.append({
                            'text': content,
                            'source': filename,
                            'label': 'human'
                        })
            except Exception as e:
                print(f"Error reading {filename}: {e}")
                
        print(f"Loaded {len(self.human_texts)} human texts")
    
    def generate_ai_texts(self, topics, models=None, samples_per_topic=2):
        """Generate AI texts on various topics using available models"""
        if not models:
            models = list(self.collector.models.keys())
            
        prompt_template = "Write a comprehensive and informative passage about {topic}. Be detailed and thorough."
        
        for model in models:
            for topic in tqdm(topics, desc=f"Generating texts with {model}"):
                for _ in range(samples_per_topic):
                    try:
                        generated_text = self.collector.generate_text(model, prompt_template, topic)
                        if generated_text and len(generated_text.split()) >= 30:
                            self.ai_texts.append({
                                'text': generated_text,
                                'source': model,
                                'label': 'ai',
                                'topic': topic
                            })
                    except Exception as e:
                        print(f"Error generating text with {model} on topic '{topic}': {e}")
        
        print(f"Generated {len(self.ai_texts)} AI texts")
    
    def build_dataset(self):
        """Combine human and AI texts into a balanced dataset"""
        all_texts = self.human_texts + self.ai_texts
        random.shuffle(all_texts)
        self.dataset = pd.DataFrame(all_texts)
        return self.dataset
    
    def process_regenerations(self, models=None, sample_size=None):
        """Add regenerations of texts using specified models"""
        if not models:
            models = list(self.collector.models.keys())[:2]  # Use first 2 models by default
        
        if sample_size and sample_size < len(self.dataset):
            texts_to_process = self.dataset.sample(sample_size)
        else:
            texts_to_process = self.dataset
        
        for model in models:
            regen_column = f"regeneration_{model}"
            texts_to_process[regen_column] = None
            
            for idx, row in tqdm(texts_to_process.iterrows(), total=len(texts_to_process), desc=f"Regenerating with {model}"):
                try:
                    regeneration = self.collector.regenerate_text(model, row['text'])
                    self.dataset.loc[idx, regen_column] = regeneration
                except Exception as e:
                    print(f"Error regenerating text at index {idx} with {model}: {e}")
                    self.dataset.loc[idx, regen_column] = "ERROR"
        
        return self.dataset
    
    def save_dataset(self, output_file="ai_text_detection_dataset.csv"):
        """Save the dataset to a CSV file"""
        self.dataset.to_csv(output_file, index=False)
        print(f"Dataset saved to {output_file}")
        return output_file


# Sample topics for AI text generation
TOPICS = [
    "climate change",
    "artificial intelligence ethics",
    "space exploration",
    "renewable energy",
    "quantum computing",
    "global economics",
    "modern literature",
    "cryptocurrency",
    "sustainable agriculture",
    "medical advances",
    "world history",
    "digital privacy",
    "education reform",
    "cultural diversity",
    "wildlife conservation"
]


def main():
    # Create the data collection directory structure
    os.makedirs("data/human_texts", exist_ok=True)
    
    # Initialize the collector and dataset builder
    collector = DataCollector(API_KEY, FREE_LLMS)
    builder = DatasetBuilder(collector)
    
    # Step 1: Load human-written texts
    print("Step 1: Loading human-written texts...")
    builder.load_human_texts_from_files("data/human_texts")
    
    # If no human texts found, provide instructions and create sample texts
    if not builder.human_texts:
        print("\nNo human texts found. Creating sample texts for testing...")
        
        # Create a few sample human texts directly
        sample_texts = [
            "The quick brown fox jumps over the lazy dog. This is a simple test sentence that contains all the letters in the English alphabet. It's commonly used for font testing and other purposes where you need a standard sample text.",
            "In the heart of the dense forest, a small stream wound its way between moss-covered rocks and fallen logs. Sunlight filtered through the canopy above, creating dappled patterns on the forest floor. Birds called to one another from the branches overhead.",
            "The history of computing spans decades, from early mechanical calculators to today's sophisticated electronic devices. The development of transistors in the mid-20th century revolutionized the field, allowing for smaller and more efficient computers."
        ]
        
        for i, text in enumerate(sample_texts):
            filepath = os.path.join("data/human_texts", f"sample_text_{i+1}.txt")
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(text)
            
            builder.human_texts.append({
                'text': text,
                'source': f"sample_{i+1}",
                'label': 'human'
            })
        
        print(f"Created {len(sample_texts)} sample human texts for testing purposes.")
    
    # Step 2: Generate AI texts
    print("\nStep 2: Generating AI texts using free LLMs...")
    # Use a subset of models to save API calls
    used_models = ["mistral", "mistral"]  # Using models that are more likely to work without special access
    
    # Simpler topics to reduce complexity
    simple_topics = [
        "the benefits of exercise",
        "healthy eating habits",
        "importance of education",
        "history of computers",
        "climate and weather"
    ]
    
    builder.generate_ai_texts(simple_topics, models=used_models, samples_per_topic=1)
    
    # Step 3: Build the combined dataset
    print("\nStep 3: Building the dataset...")
    dataset = builder.build_dataset()
    print(f"Dataset created with {len(dataset)} samples ({sum(dataset['label'] == 'human')} human, {sum(dataset['label'] == 'ai')} AI)")
    
    # Step 4: Generate regenerations
    print("\nStep 4: Generating text regenerations for similarity analysis...")
    # Process just a sample to save API calls
    sample_size = min(len(dataset), 10)  # Process up to 10 samples for regeneration
    final_dataset = builder.process_regenerations(models=used_models[:1], sample_size=sample_size)
    
    # Step 5: Save the dataset
    print("\nStep 5: Saving the final dataset...")
    output_file = builder.save_dataset()
    
    print(f"\nData collection complete! Dataset saved to {output_file}")
    print(f"Final dataset contains {len(final_dataset)} samples with original texts and regenerations")
    
    # Display dataset sample
    print("\nSample of the dataset:")
    print(dataset.head(2))


if __name__ == "__main__":
    main()

Step 1: Loading human-written texts...
Loaded 105 human texts

Step 2: Generating AI texts using free LLMs...


Generating texts with mistral: 100%|██████████████| 5/5 [00:00<00:00,  5.77it/s]
Generating texts with mistral: 100%|██████████████| 5/5 [00:00<00:00,  6.63it/s]


Generated 10 AI texts

Step 3: Building the dataset...
Dataset created with 115 samples (105 human, 10 AI)

Step 4: Generating text regenerations for similarity analysis...


Regenerating with mistral: 100%|████████████████| 10/10 [00:24<00:00,  2.48s/it]


Step 5: Saving the final dataset...
Dataset saved to ai_text_detection_dataset.csv

Data collection complete! Dataset saved to ai_text_detection_dataset.csv
Final dataset contains 115 samples with original texts and regenerations

Sample of the dataset:
                                                text  \
0  Although all of Germany's stated demands had b...   
1  The Renaissance began in times of religious tu...   

                            source  label topic regeneration_mistral  
0  wikipedia_world_war_ii_6311.txt  human   NaN                  NaN  
1   wikipedia_renaissance_2117.txt  human   NaN                  NaN  



