<a href="https://colab.research.google.com/github/Downforcedemon/AI/blob/main/Daneel_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Step 1: Setup & Install Required Libraries**
- Install PyMuPDF for PDF extraction
- Install spaCy for Named Entity Recognition (NER)
- Load the English NLP model


In [2]:
!pip install pymupdf spacy
import fitz  # PyMuPDF
import spacy
import os
import glob
import json
import re
import random
nlp = spacy.load('en_core_web_sm')

Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m81.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.2


## **Step 2: Extract Text from All Books**
- Loop through all PDFs in the dataset
- Extract text and save as `.txt` files


In [3]:
# Extract the text of every PDF in /content and write it next to the PDF as a
# lower-cased, underscore-separated .txt file.
# glob returns paths in arbitrary order; sort for a stable processing order.
pdf_files = sorted(glob.glob('/content/*.pdf'))
for pdf_path in pdf_files:
    print(f'Processing: {pdf_path}')
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Build the text with a single join instead of repeated concatenation.
        # Each page is still followed by a blank line, as before.
        extracted_text = ''.join(page.get_text() + '\n\n' for page in doc)
    # splitext removes only the trailing extension, so a '.pdf' that happens to
    # occur inside the file name is preserved.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]
    book_name = pdf_stem.replace(' ', '_').lower()
    output_text_file = f'/content/{book_name}.txt'
    with open(output_text_file, 'w', encoding='utf-8') as file:
        file.write(extracted_text)
    print(f'Extracted text saved to: {output_text_file}\n')

Processing: /content/6_The_Currents_of_Space.pdf
Extracted text saved to: /content/6_the_currents_of_space.txt

Processing: /content/2_The_Naked_Sun.pdf
Extracted text saved to: /content/2_the_naked_sun.txt

Processing: /content/3. The_Robots_of_Dawn.pdf
Extracted text saved to: /content/3._the_robots_of_dawn.txt

Processing: /content/1_The_Caves_Steel.pdf
Extracted text saved to: /content/1_the_caves_steel.txt

Processing: /content/5_The_Stars_Like_Dust.pdf
Extracted text saved to: /content/5_the_stars_like_dust.txt

Processing: /content/4_Robtos_And_Empire.pdf
Extracted text saved to: /content/4_robtos_and_empire.txt

Processing: /content/7_Pebble_in_th_sky.pdf
Extracted text saved to: /content/7_pebble_in_th_sky.txt



## **Step 3: Preprocess Extracted Text**
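- Collapse extra whitespace and replace unsupported symbols with spaces
- Normalize text to lowercase
- Sentence splitting is not done here; it happens in Step 4, where spaCy segments the cleaned text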


In [4]:
# Typographic (curly) quotes mapped to their ASCII equivalents so that
# apostrophes and quotation marks survive the character filter below.
_TYPOGRAPHIC_QUOTES = str.maketrans({
    "\u2018": "'", "\u2019": "'",
    "\u201c": '"', "\u201d": '"',
})


def preprocess_text(text):
    """Normalise raw extracted text to lower-case, single-spaced text.

    Steps, in order:
      1. Map curly single/double quotes to ASCII ' and ".
      2. Lower-case the text.
      3. Replace every character that is not an ASCII letter, a digit or one
         of . , ! ? ; : ' " ( ) backslash - with a space. Newlines and tabs
         are replaced by this step as well.
      4. Collapse every run of whitespace to one space and strip both ends.

    Whitespace is collapsed last because step 3 introduces new spaces.

    Args:
        text: Raw text, e.g. the contents of an extracted book.

    Returns:
        The cleaned string; an empty string if nothing survives.
    """
    text = text.translate(_TYPOGRAPHIC_QUOTES)
    text = text.lower()

    # Anything outside the allowed set becomes a space rather than being
    # deleted, so words separated only by a symbol are not glued together.
    text = re.sub(r"[^a-zA-Z0-9.,!?;:'\"()\\-]", " ", text)

    # split() with no argument splits on any whitespace run and drops empties.
    return " ".join(text.split())


## **Step 4: Named Entity Recognition (NER) & Extract Conversations**
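- Split each cleaned book into sentences with the spaCy pipeline
- Identify sentences that mention Daneel or a related keyword (substring match)
- Extract the surrounding context for each mention (from 2 sentences before through 4 sentences after)
- Save the structured data in JSON format, one file per book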

In [5]:
daneel_keywords = [
    'daneel', 'r. daneel olivaw', 'daneel olivaw', 'ol daneel',
    'positronic brain', 'humanoid robot', 'spacers robot',
    'earth’s first robot', 'the immortal guardian', 'fastolfe’s creation',
    'gaia’s protector', 'robot detective', 'robotic partner', 'friend of Elijah Baley'
]

# The keywords are searched for in text that has already been through
# preprocess_text, so they must be normalised the same way. Compared raw,
# 'friend of Elijah Baley' (capital letters) and the curly-apostrophe phrases
# could never match. Both the curly and the straight apostrophe spelling are
# registered so either form in the source text is found.
normalized_keywords = []
for raw_keyword in daneel_keywords:
    for variant in (raw_keyword, raw_keyword.replace('\u2019', "'")):
        cleaned_keyword = preprocess_text(variant)
        if cleaned_keyword and cleaned_keyword not in normalized_keywords:
            normalized_keywords.append(cleaned_keyword)


def extract_conversations(sentences, keywords, before=2, after=5):
    """Collect a window of sentences around every keyword mention.

    Sentences are scanned in order. When a sentence contains any keyword
    (substring match on the lower-cased sentence), the sentences from
    ``i - before`` up to, but not including, ``i + after`` are joined into one
    entry. Scanning then resumes at the end of that window, so mentions inside
    an emitted window do not start a window of their own.

    Args:
        sentences: List of sentence strings.
        keywords: Iterable of lower-case keyword strings.
        before: Number of sentences of leading context.
        after: Exclusive end offset from the matching sentence; the default of
            5 yields the match plus the 4 sentences after it.

    Returns:
        List of dicts with keys 'context' and 'dialogue'.
    """
    keywords = list(keywords)
    conversations = []
    i = 0
    while i < len(sentences):
        sentence = sentences[i].lower()
        if any(keyword in sentence for keyword in keywords):
            start = max(i - before, 0)
            end = min(i + after, len(sentences))
            conversations.append({
                'context': f'Conversation mentioning Daneel (sentence {i})',
                'dialogue': ' '.join(sentences[start:end])
            })
            # Always advance, even if a caller passes after <= 0.
            i = max(end, i + 1)
        else:
            i += 1
    return conversations


# glob returns paths in arbitrary order; sort for a stable processing order.
text_files = sorted(glob.glob('/content/*.txt'))
for text_file_path in text_files:
    print(f'Processing NER for: {text_file_path}')
    with open(text_file_path, 'r', encoding='utf-8') as file:
        extracted_text = file.read()
    cleaned_text = preprocess_text(extracted_text)
    # The spaCy pipeline is used here for its sentence boundaries.
    doc = nlp(cleaned_text)
    sentences = [sent.text.strip() for sent in doc.sents]
    extracted_conversations = extract_conversations(sentences, normalized_keywords)
    # splitext removes only the trailing '.txt', not a '.txt' inside the name.
    book_name = os.path.splitext(os.path.basename(text_file_path))[0]
    json_output_path = f'/content/daneel_conversations_{book_name}.json'
    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(extracted_conversations, json_file, indent=4)
    print(f'Extracted conversations saved to {json_output_path}\n')

Processing NER for: /content/1_the_caves_steel.txt
Extracted conversations saved to /content/daneel_conversations_1_the_caves_steel.json

Processing NER for: /content/2_the_naked_sun.txt
Extracted conversations saved to /content/daneel_conversations_2_the_naked_sun.json

Processing NER for: /content/5_the_stars_like_dust.txt
Extracted conversations saved to /content/daneel_conversations_5_the_stars_like_dust.json

Processing NER for: /content/3._the_robots_of_dawn.txt
Extracted conversations saved to /content/daneel_conversations_3._the_robots_of_dawn.json

Processing NER for: /content/7_pebble_in_th_sky.txt
Extracted conversations saved to /content/daneel_conversations_7_pebble_in_th_sky.json

Processing NER for: /content/6_the_currents_of_space.txt
Extracted conversations saved to /content/daneel_conversations_6_the_currents_of_space.json

Processing NER for: /content/4_robtos_and_empire.txt
Extracted conversations saved to /content/daneel_conversations_4_robtos_and_empire.json



## **Step 5: Preparing Data for AI Training**
- The extracted JSON files will now be formatted into a dialogue dataset
- Next, we will create an AI training-friendly structure

In [7]:
# Step 1: Load all extracted JSON files
# glob returns paths in arbitrary order; sorting keeps the order of the
# combined conversation list the same from run to run.
json_files = sorted(glob.glob("/content/daneel_conversations_*.json"))
all_dialogues = []

# Each file holds a JSON list of conversation dicts; concatenate them all.
for json_path in json_files:
    with open(json_path, "r", encoding="utf-8") as f:
        all_dialogues.extend(json.load(f))

print(f"Loaded {len(all_dialogues)} conversations from {len(json_files)} books.")

# Step 2: Convert raw text into structured dialogue format
formatted_dataset = []

# Function to generate synthetic questions
def generate_synthetic_question(response):
    """Return one generic follow-up question chosen at random.

    The ``response`` argument is accepted but not consulted: the question is
    drawn from a fixed pool with ``random.choice``, so it consumes the state
    of the module-level ``random`` generator.
    """
    question_pool = [
        "What does this mean?",
        "Can you elaborate?",
        "Why is this important?",
        "How does this relate to robots?",
        "Explain further.",
        "What is your perspective on this?",
    ]
    picked_question = random.choice(question_pool)
    return picked_question

# Process each conversation
# Seed the generator so the randomly inserted synthetic Q&A pairs are identical
# on every run of this cell.
RANDOM_SEED = 42
SYNTHETIC_QA_PROBABILITY = 0.3  # chance of adding a synthetic question per valid pair
random.seed(RANDOM_SEED)

for conversation in all_dialogues:
    # Rough sentence split: only ". " is treated as a boundary, so sentences
    # ending in "?" or "!" stay attached to the next one and abbreviations
    # such as "r. daneel" are cut in two.
    dialogue_text = conversation["dialogue"].split(". ")

    structured_conversation = {"context": conversation["context"], "dialogue": []}

    # Pair every fragment with the fragment that follows it.
    for current_fragment, next_fragment in zip(dialogue_text, dialogue_text[1:]):
        input_text = current_fragment.strip()
        output_text = next_fragment.strip()

        # Skip pairs with an empty side. This also stops a synthetic question
        # from being paired with an empty answer.
        if not (input_text and output_text):
            continue

        structured_conversation["dialogue"].append({"input": input_text, "output": output_text})

        # Occasionally add a generic question that shares the same answer.
        if random.random() < SYNTHETIC_QA_PROBABILITY:
            structured_conversation["dialogue"].append({"input": generate_synthetic_question(output_text), "output": output_text})

    formatted_dataset.append(structured_conversation)

# Step 3: Save dataset in AI training format
json_output_path = "/content/daneel_dialogue_dataset.json"
with open(json_output_path, "w", encoding="utf-8") as json_file:
    json.dump(formatted_dataset, json_file, indent=4)

# Show the output path as the cell result
json_output_path


Loaded 1343 conversations from 7 books.


'/content/daneel_dialogue_dataset.json'