In [1]:
import re

def preprocess_text(input_file: str, output_file: str, start_paragraph: int = 0, num_paragraphs: int = 10) -> str:
    """
    Clean a plain-text book and save a slice of its paragraphs.

    The cleaning steps run in this order:
      1. delete every ``[...]`` span (may cover several lines);
      2. strip quote, bracket, parenthesis and brace characters;
      3. print and drop every line that contains six consecutive spaces
         (anywhere in the line, not only at its start);
      4. discard everything before the first paragraph that looks like a
         sentence and has more than five words;
      5. cut the text at the first long run of blank lines unless the word
         "chapter" occurs within 100 characters of it;
      6. delete ``*** ... ***`` spans, then normalise blank lines, hyphens,
         repeated spaces and underscores.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path the selected paragraphs are written to (UTF-8).
        start_paragraph: Zero-based index of the first cleaned paragraph to keep.
        num_paragraphs: Maximum number of paragraphs to keep from that index.

    Returns:
        ``output_file``, exactly as it was passed in.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        book = source.read()

    # Bracketed annotations such as [Illustration: ...]; DOTALL lets them span lines.
    book = re.sub(r'\[.*?\]', '', book, flags=re.DOTALL)

    # Quotes, brackets, parentheses and braces go; ., ! and ? are not in the class.
    book = re.sub(r"[\"'“”‘’\]\[\(\){}]", '', book)

    # Sort every line into "kept" or "dropped" in a single pass.
    kept_rows, dropped_rows = [], []
    for row in book.splitlines():
        if '      ' in row:
            dropped_rows.append(row)
        else:
            kept_rows.append(row)

    print("Lines being removed due to leading spaces:")
    for row in dropped_rows:
        print(row)

    book = '\n'.join(kept_rows)

    # Locate the first block that starts with a capital letter, ends a line with
    # sentence punctuation and has more than five words; -1 means "none found".
    blocks = book.split('\n\n')
    body_start = next(
        (
            position
            for position, block in enumerate(blocks)
            if re.match(r'^[A-Z][^?!.]*[.?!]$', block.strip(), re.MULTILINE) and len(block.split()) > 5
        ),
        -1,
    )
    if body_start != -1:
        book = '\n\n'.join(blocks[body_start:])

    # A long run of blank lines is treated as the end of the book, except when
    # "chapter" appears close by (then nothing is cut).
    blank_run = re.search(r'(\n\s*\n\s*){3,}', book, flags=re.DOTALL)
    if blank_run:
        cut_at = blank_run.start()
        neighbourhood = book[max(0, cut_at - 100):cut_at + 100].lower()
        if 'chapter' not in neighbourhood:
            book = book[:cut_at]

    # Spans delimited by triple asterisks (e.g. "*** END OF THE PROJECT GUTENBERG").
    book = re.sub(r'\*\*\*.*?\*\*\*', '', book, flags=re.DOTALL)
    book = book.strip()

    # Final normalisation, applied in this exact order.
    for pattern, replacement in (
        (r'\n\s*\n', '\n\n'),  # one blank line between paragraphs
        (r'-', ' '),           # hyphens become spaces
        (r'[ ]+', ' '),        # collapse repeated spaces
        (r'_', ''),            # drop emphasis underscores, keep the words
    ):
        book = re.sub(pattern, replacement, book)

    # Keep only the requested window of paragraphs and persist it.
    window_end = start_paragraph + num_paragraphs
    selection = book.split('\n\n')[start_paragraph:window_end]
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write('\n\n'.join(selection))

    return output_file

# Example usage: download the plain-text file from gutenberg.org, then write at
# most the first 50 cleaned paragraphs (start_paragraph defaults to 0) to
# preprocessed.txt. The call returns the output path.
!wget https://www.gutenberg.org/files/1342/1342-0.txt -O pride_and_prejudice.txt
preprocess_text("pride_and_prejudice.txt", "preprocessed.txt", num_paragraphs=50)


--2025-01-08 12:33:07--  https://www.gutenberg.org/files/1342/1342-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 752575 (735K) [text/plain]
Saving to: ‘pride_and_prejudice.txt’


2025-01-08 12:33:08 (3.76 MB/s) - ‘pride_and_prejudice.txt’ saved [752575/752575]

Lines being removed due to leading spaces:
                            
                            
                                PRIDE.
                                  and
                               PREJUDICE
                                  by
                             Jane Austen,
                           with a Preface by
                           George Saintsbury
                                  and
                           Illustrations by
                             Hugh Thomson
                         
  

'preprocessed.txt'

In [10]:
!pip install groq
import os
from groq import Groq
import re
from google.colab import userdata

class BookProcessor:
    """
    Summarise a book chunk by chunk with a Groq-hosted chat model.

    Each chunk of paragraphs is sent to the model; the reply is appended to
    ``self.output_file`` and echoed to stdout.
    """

    def __init__(self, api_key, book_path, model="mixtral-8x7b-32768"):
        """
        Initialize the processor with Groq API key and book path.

        Args:
            api_key: Key handed to the ``Groq`` client.
            book_path: Path of a UTF-8 text file whose paragraphs are separated
                by blank lines.
            model: Model identifier sent with every chat completion request.
        """
        self.client = Groq(api_key=api_key)
        self.book_path = book_path
        self.model = model
        self.output_file = 'interactions.txt'

    def process_book(self, chunk_size=10, output_path='processed_book.txt'):
        """
        Process the book in chunks of paragraphs.

        Args:
            chunk_size: Number of paragraphs per request; must be at least 1.
            output_path: Currently unused. Kept so callers that pass it keep
                working.

        Returns:
            A list with one entry per chunk, in book order. Each entry is the
            model's reply, or the chunk's paragraphs joined by newlines when
            that chunk could not be processed.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # Validate before touching the output file so a bad call cannot wipe
        # the results of a previous run.
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be at least 1, got {chunk_size}")

        # Open output file in write mode to clear previous contents
        with open(self.output_file, 'w', encoding='utf-8'):
            pass

        with open(self.book_path, 'r', encoding='utf-8') as file:
            # Blank-line separated paragraphs; whitespace-only ones are dropped.
            paragraphs = [p for p in file.read().split('\n\n') if p.strip()]

        processed_content = []
        for i in range(0, len(paragraphs), chunk_size):
            chunk = paragraphs[i:i + chunk_size]
            # Keep each chunk's result; the previous code discarded it and
            # therefore always returned an empty list.
            processed_content.append(
                self._analyze_chunk_with_llm(chunk, chunk_index=i // chunk_size)
            )

        return processed_content

    def _analyze_chunk_with_llm(self, chunk, chunk_index):
        """
        Use Groq LLM to analyze a chunk of paragraphs.

        Args:
            chunk: List of paragraph strings.
            chunk_index: Zero-based position of the chunk; shown one-based in
                the output file and the console messages.

        Returns:
            The model's reply text. If anything in the request/write/print
            sequence raises, the error is printed and the chunk's paragraphs
            joined by newlines are returned instead.
        """
        # Join paragraphs with numbering for context
        numbered_chunk = "\n".join([f"Paragraph {i+1}: {p}" for i, p in enumerate(chunk)])

        prompt = f"""
        Analyze this chunk of paragraphs:

        {numbered_chunk}

        Comprehensive Chunk Analysis Requirements:
        1. Identify ALL full names of characters ACTUALLY PRESENT and interacting in this chunk
        2. Exclude characters who are merely mentioned but do not appear or interact
        3. Resolve characters' aliases and nicknames to their full names
        4. Provide a concise, overarching summary of key interactions
        5. Focus on the chunk's overall narrative context, not paragraph-by-paragraph details

        Output Format:
        Chunk Overview:
        - Characters Present: [Full Names, Resolved Aliases]
        - Key Interactions: [Comprehensive Summary]
        """

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a literary analysis assistant specializing in novels, providing nuanced contextual analysis."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                temperature=0.0
            )

            # Append response to single output file
            response_text = response.choices[0].message.content
            with open(self.output_file, 'a', encoding='utf-8') as f:
                f.write(f"--- Chunk {chunk_index + 1} Response ---\n{response_text}\n\n")

            print(f"--- Processed text: \n{numbered_chunk}\n\n")
            print(f"--- LLM Response ---\n{response_text}\n\n")

            print(f"Appended chunk {chunk_index + 1} response to {self.output_file}\n\n")
            return response_text

        except Exception as e:
            # Deliberately best-effort: one failed chunk must not abort the
            # whole book, so report it and fall back to the raw text.
            print(f"Error processing chunk: {e}")
            return "\n".join(chunk)

def main():
    """Run the chunk analysis over preprocessed.txt, ten paragraphs per request."""
    # The key is looked up under the name GROQ_API_KEY via userdata.get.
    api_key = userdata.get("GROQ_API_KEY")
    BookProcessor(api_key, 'preprocessed.txt').process_book(chunk_size=10)

if __name__ == "__main__":
    main()

--- Processed text: 
Paragraph 1: It is a truth universally acknowledged, that a single man in possession
of a good fortune must be in want of a wife.
Paragraph 2: However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered as the rightful
property of some one or other of their daughters.
Paragraph 3: My dear Mr. Bennet, said his lady to him one day, have you heard that
Netherfield Park is let at last?
Paragraph 4: Mr. Bennet replied that he had not.
Paragraph 5: But it is, returned she; for Mrs. Long has just been here, and she
told me all about it.
Paragraph 6: Mr. Bennet made no answer.
Paragraph 7: Do not you want to know who has taken it? cried his wife, impatiently.
Paragraph 8: You want to tell me, and I have no objection to hearing it.
Paragraph 9: This was invitation enough.
Paragraph 10: Why, my dear, you must know, Mrs. Long says that Netherf

In [12]:
# Install python-dotenv, the package that provides the `dotenv` module.
!pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [41]:
import os
import logging
import re
from groq import Groq
from dotenv import load_dotenv


class BookProcessor:
    """
    Label each paragraph of a book with the characters interacting in it.

    Paragraphs are sent to a Groq-hosted chat model in overlapping windows and
    one "Paragraph N / Interactions" record per paragraph is written to
    ``output_file``.
    """

    def __init__(self, api_key, book_path, output_file='interactions.txt', model="mixtral-8x7b-32768"):
        """
        Create the Groq client and remember where to read and write.

        Args:
            api_key: Key handed to the ``Groq`` client.
            book_path: Path of a UTF-8 text file whose paragraphs are separated
                by blank lines.
            output_file: File the per-paragraph records are written to.
            model: Model identifier sent with every chat completion request.

        Raises:
            ValueError: If ``api_key`` is empty or ``None``.
        """
        if not api_key:
            raise ValueError("API Key is missing. Ensure it is set correctly.")

        self.client = Groq(api_key=api_key)
        self.book_path = book_path
        self.output_file = output_file
        self.model = model
        # One-based numbers of the paragraphs already written to output_file.
        self.processed_paragraphs = set()
        self._setup_logging()

    def _setup_logging(self):
        """Configure root logging at INFO level and create this module's logger."""
        logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
        self.logger = logging.getLogger(__name__)

    def process_book(self, chunk_size=10, overlap=2):
        """
        Process the book in chunks with optional overlap.

        Consecutive windows start ``chunk_size - overlap`` paragraphs apart, so
        each window repeats the last ``overlap`` paragraphs of the previous one
        as context. Every paragraph is written to the output file only once.

        Args:
            chunk_size: Number of paragraphs per request; must be at least 1.
            overlap: Number of paragraphs shared by neighbouring windows; must
                satisfy ``0 <= overlap < chunk_size``.

        Returns:
            None. If the book file does not exist an error is logged and the
            output file is left untouched.

        Raises:
            ValueError: If ``chunk_size`` or ``overlap`` is out of range.
        """
        # A step of zero would make range() fail and a negative one would
        # silently process nothing, so reject both up front.
        if chunk_size < 1 or not 0 <= overlap < chunk_size:
            raise ValueError(
                "chunk_size must be >= 1 and overlap must satisfy 0 <= overlap < chunk_size "
                f"(got chunk_size={chunk_size}, overlap={overlap})"
            )

        if not os.path.exists(self.book_path):
            self.logger.error(f"Book file not found: {self.book_path}")
            return

        paragraphs = self._load_book()
        self.logger.info(f"Loaded {len(paragraphs)} paragraphs.")

        # Start from a clean slate: empty the output file AND forget what an
        # earlier run wrote. Without the reset, a second call would truncate
        # the file and then skip every paragraph as "already processed".
        with open(self.output_file, 'w', encoding='utf-8') as f:
            f.write("")
        self.processed_paragraphs.clear()

        total = len(paragraphs)
        for i in range(0, total, chunk_size - overlap):
            chunk = paragraphs[i:i + chunk_size]
            self._analyze_chunk_with_llm(chunk, start_paragraph=i + 1)
            # Once the final paragraph is saved, every later window would hold
            # only paragraphs that are already written; skip those requests.
            if total in self.processed_paragraphs:
                break

    def _load_book(self):
        """
        Load the book and split it into paragraphs.

        Returns:
            The stripped, non-empty blocks obtained by splitting the file on
            blank lines.
        """
        with open(self.book_path, 'r', encoding='utf-8') as file:
            paragraphs = file.read().split('\n\n')
        return [p.strip() for p in paragraphs if p.strip()]

    def _analyze_chunk_with_llm(self, chunk, start_paragraph):
        """
        Analyze a chunk of paragraphs using the LLM.

        Args:
            chunk: List of paragraph strings.
            start_paragraph: One-based book position of ``chunk[0]``.

        Any exception raised while requesting or saving is logged and
        swallowed so that one failing chunk does not stop the run.
        """
        numbered_chunk = "\n".join([f"Paragraph {i + start_paragraph}: {p}" for i, p in enumerate(chunk)])
        prompt = self._build_prompt(numbered_chunk)

        try:
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are a literary analysis assistant specializing in novels."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1024,
                temperature=0.0
            )
            response_text = response.choices[0].message.content
            self._save_response(chunk, response_text, start_paragraph)
        except Exception as e:
            self.logger.error(f"Error processing chunk starting at paragraph {start_paragraph}: {e}")

    def _build_prompt(self, numbered_chunk):
        """
        Build the prompt to send to the LLM.

        Args:
            numbered_chunk: The chunk's paragraphs, each prefixed with
                ``Paragraph N:``.

        Returns:
            The full user prompt as a string.
        """
        return f"""
        Analyze the following paragraphs:

        {numbered_chunk}

        For each paragraph:
        1. Identify all characters directly interacting, using their full names. Do not include any descriptions or additional details.
        2. If a character's name is not explicitly mentioned in the current paragraph, inspect the entire book context to resolve it to a full name.
        3. If no characters directly interact, state "No direct interactions."

        Provide the output in this exact format:
        Paragraph 1: [Full names of interacting people or "No direct interactions."]
        Paragraph 2: [Full names of interacting people or "No direct interactions."]
        """


    def _save_response(self, chunk, response_text, start_paragraph):
        """
        Save processed interactions for each paragraph in a concise format.

        The k-th answer line of the reply is paired with the k-th paragraph of
        the chunk. Paragraphs already recorded by an earlier, overlapping
        chunk are skipped; paragraphs without an answer line get
        "No direct interactions.".

        Args:
            chunk: List of paragraph strings that were sent to the model.
            response_text: Raw reply text from the model.
            start_paragraph: One-based book position of ``chunk[0]``.
        """
        # Blank lines and preamble text would shift the positional pairing, so
        # keep only the "Paragraph N:" lines when the reply contains any and
        # fall back to all non-empty lines otherwise.
        non_empty = [line.strip() for line in response_text.split("\n") if line.strip()]
        labelled = [line for line in non_empty if re.match(r"Paragraph \d+:", line, flags=re.IGNORECASE)]
        interactions = labelled or non_empty

        with open(self.output_file, 'a', encoding='utf-8') as f:
            for i, paragraph in enumerate(chunk):
                paragraph_number = i + start_paragraph
                if paragraph_number in self.processed_paragraphs:
                    continue

                interaction_line = interactions[i] if i < len(interactions) else "No direct interactions."

                if "no direct interaction" in interaction_line.lower():
                    interaction_text = "No direct interactions."
                else:
                    # Drop the "Paragraph N:" label; the record gets its own.
                    interaction_text = re.sub(r"^Paragraph \d+:\s*", "", interaction_line, flags=re.IGNORECASE)

                f.write(f"Paragraph {paragraph_number}: {paragraph.strip()}\n")
                f.write(f"Interactions: {interaction_text}\n\n")

                self.processed_paragraphs.add(paragraph_number)

        self.logger.info(f"Saved chunk starting at paragraph {start_paragraph} to {self.output_file}")


if __name__ == "__main__":
    # Load a local .env file (if present) into the process environment, then
    # actually read the key from there; previously the loaded value was never used.
    load_dotenv()
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    if not GROQ_API_KEY:
        # Fall back to the Colab secret store. Imported here so this cell does
        # not rely on an import made by an earlier cell.
        try:
            from google.colab import userdata
            GROQ_API_KEY = userdata.get("GROQ_API_KEY")
        except ImportError:
            # Not running on Colab; BookProcessor reports the missing key.
            GROQ_API_KEY = None
    book_path = 'preprocessed.txt'

    processor = BookProcessor(GROQ_API_KEY, book_path)
    processor.process_book(chunk_size=10, overlap=3)


In [42]:
output_file = 'interactions.txt'

# Show the saved interactions in the cell output, or say so if the file is missing.
try:
    f = open(output_file, 'r', encoding='utf-8')
except FileNotFoundError:
    print(f"File '{output_file}' not found. Please ensure the file exists.")
else:
    with f:
        print("\n--- Interactions.txt Contents ---\n")
        print(f.read())



--- Interactions.txt Contents ---

Paragraph 1: It is a truth universally acknowledged, that a single man in possession
of a good fortune must be in want of a wife.
Interactions: No direct interactions.

Paragraph 2: However little known the feelings or views of such a man may be on his
first entering a neighbourhood, this truth is so well fixed in the minds
of the surrounding families, that he is considered as the rightful
property of some one or other of their daughters.
Interactions: No direct interactions.

Paragraph 3: My dear Mr. Bennet, said his lady to him one day, have you heard that
Netherfield Park is let at last?
Interactions: Mrs. Bennet, Mr. Bennet

Paragraph 4: Mr. Bennet replied that he had not.
Interactions: Mrs. Bennet, Mr. Bennet

Paragraph 5: But it is, returned she; for Mrs. Long has just been here, and she
told me all about it.
Interactions: Mrs. Bennet

Paragraph 6: Mr. Bennet made no answer.
Interactions: Mrs. Bennet, Mr. Bennet

Paragraph 7: Do not you want to