In [1]:
!pip install Faker

Collecting Faker
  Downloading Faker-35.2.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-35.2.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-35.2.0


In [2]:
from faker import Faker
import random
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import os


# Generating Text

In [3]:
class TextGenerator:
    """Builds synthetic filler documents out of Faker sentences.

    Sizes handled by this class are counted in whitespace-separated words
    (``str.split``), not in NLTK tokens.
    """

    def __init__(self, seed=None):
        """Create the Faker instance and optionally seed the random sources.

        Args:
            seed: Value handed to ``Faker.seed`` and ``random.seed``. ``None``
                (the default) leaves both unseeded; every other value,
                including ``0``, is applied.
        """
        self.faker = Faker()
        # Compare against None explicitly: a plain truthiness test would
        # silently ignore a seed of 0.
        if seed is not None:
            Faker.seed(seed)
            random.seed(seed)

    def generate_paragraph(self, num_sentences=5, min_words=15, max_words=25):
        """Generate a paragraph made of long sentences.

        Each sentence gets a word count drawn with ``random.randint(min_words,
        max_words)`` and is filled by concatenating Faker sentences until that
        many words are available.

        Args:
            num_sentences: Number of sentences in the paragraph.
            min_words: Inclusive lower bound for the words per sentence.
            max_words: Inclusive upper bound for the words per sentence.

        Returns:
            The sentences joined by single spaces; each ends with a period.
        """
        sentences = []
        for _ in range(num_sentences):
            num_words = random.randint(min_words, max_words)
            words = []
            while len(words) < num_words:
                # Strip only the trailing period. Slicing off the last
                # whitespace token would discard the final word as well,
                # because the period is attached to it.
                words.extend(self.faker.sentence().rstrip('.').split())
            sentences.append(' '.join(words[:num_words]) + '.')
        return ' '.join(sentences)

    def generate_document(self, target_size, min_sentences=5, max_sentences=10,
                          min_words=15, max_words=25):
        """Generate a document of at least ``target_size`` words.

        Paragraphs are appended until the running word count reaches
        ``target_size``, so the result may overshoot by up to one paragraph.

        Args:
            target_size: Minimum number of whitespace-separated words.
            min_sentences: Inclusive lower bound for sentences per paragraph.
            max_sentences: Inclusive upper bound for sentences per paragraph.
            min_words: Inclusive lower bound for words per sentence.
            max_words: Inclusive upper bound for words per sentence.

        Returns:
            The paragraphs joined by blank lines; an empty string when
            ``target_size`` is not positive.
        """
        document = []
        current_size = 0

        while current_size < target_size:
            num_sentences = random.randint(min_sentences, max_sentences)
            paragraph = self.generate_paragraph(
                num_sentences, min_words=min_words, max_words=max_words
            )
            document.append(paragraph)
            current_size += len(paragraph.split())

        return '\n\n'.join(document)

    def generate_test_documents(self, num_docs=2, size_multiplier=1):
        """Generate test documents with different sizes.

        The first document (the style reference) targets
        ``2500 * size_multiplier`` words; every following document (the text to
        summarize) targets ``7000 * size_multiplier`` words.

        Args:
            num_docs: How many documents to produce.
            size_multiplier: Factor applied to both base sizes.

        Returns:
            A list of ``num_docs`` document strings.
        """
        documents = []
        for i in range(num_docs):
            target_size = (2500 if i == 0 else 7000) * size_multiplier
            documents.append(self.generate_document(target_size))
        return documents

In [4]:
def initialize_nltk():
    """Make sure the NLTK data this notebook relies on is installed.

    ``punkt_tab``, ``punkt`` and ``stopwords`` are each looked up locally
    first; a download is attempted only for a resource that is missing, so
    calling this function repeatedly does not re-download anything. Every
    resource is verified independently, so a missing ``punkt_tab`` no longer
    hides a missing ``punkt`` or ``stopwords``.

    Raises:
        LookupError: If ``punkt`` or ``stopwords`` is still unavailable after
            the download attempt. A missing ``punkt_tab`` only prints a
            warning.
    """
    # (download package name, lookup path, whether absence is fatal)
    resources = (
        ('punkt_tab', 'tokenizers/punkt_tab/english', False),
        ('punkt', 'tokenizers/punkt', True),
        ('stopwords', 'corpora/stopwords', True),
    )

    for package, path, required in resources:
        try:
            nltk.data.find(path)
            continue  # already installed, nothing to download
        except LookupError:
            pass

        try:
            nltk.download(package, quiet=True)
        except Exception as e:
            print(f"Warning: Failed to download NLTK resources: {str(e)}")

        # Re-check after the download attempt.
        try:
            nltk.data.find(path)
        except LookupError as e:
            if required:
                print(f"Error: Required NLTK resources not found: {str(e)}")
                raise
            # punkt_tab is the only optional resource in the table above.
            print("Warning: punkt_tab not found, falling back to punkt")

# Analyzing Text

In [5]:
class StyleAnalyzer:
    """Analyzes and matches text style characteristics"""

    def __init__(self):
        """Ensure the NLTK tokenizer data is available before any analysis."""
        initialize_nltk()

    def analyze_style(self, text):
        """Extract style characteristics from text.

        Args:
            text: The text to measure.

        Returns:
            A dict with:
              * ``'avg_sentence_length'`` - mean number of NLTK tokens per sentence,
              * ``'sentence_lengths'`` - token count of every sentence,
              * ``'avg_word_length'`` - mean character length of all tokens.
            Both averages are ``0.0`` when the text has no sentences / tokens.
        """
        sentences = sent_tokenize(text)
        words = word_tokenize(text)
        # Tokenize every sentence once and reuse the counts for both fields.
        sentence_lengths = [len(word_tokenize(s)) for s in sentences]

        return {
            'avg_sentence_length': (
                sum(sentence_lengths) / len(sentence_lengths) if sentence_lengths else 0.0
            ),
            'sentence_lengths': sentence_lengths,
            'avg_word_length': sum(len(w) for w in words) / len(words) if words else 0.0,
        }

    def match_style(self, sentences, ref_style):
        """Adjust sentences to match reference style.

        A sentence with more than 1.5x the reference average token count is cut
        into two halves; one with fewer than 0.5x is appended to the previously
        emitted sentence with " and ". Everything else passes through.

        Args:
            sentences: Iterable of sentence strings.
            ref_style: Dict containing ``'avg_sentence_length'`` (as returned
                by ``analyze_style``).

        Returns:
            A new list of adjusted sentence strings. When the reference average
            is not positive there is nothing to match against and the sentences
            are returned unchanged.
        """
        target_avg_length = ref_style['avg_sentence_length']
        if target_avg_length <= 0:
            return list(sentences)

        terminal_marks = ('.', '!', '?')
        adjusted_sentences = []

        for sentence in sentences:
            words = word_tokenize(sentence)
            current_length = len(words)

            if current_length > target_avg_length * 1.5:
                # Drop a trailing punctuation token before splitting so the
                # second half does not end up with a doubled period.
                body = words[:-1] if words[-1] in terminal_marks else words
                mid = len(body) // 2
                if mid == 0:
                    # Too short to cut into two non-empty halves.
                    adjusted_sentences.append(sentence)
                    continue
                adjusted_sentences.extend([
                    ' '.join(body[:mid]) + '.',
                    ' '.join(body[mid:]) + '.'
                ])
            elif current_length < target_avg_length * 0.5 and adjusted_sentences:
                # Combine short sentences when possible. Strip only terminal
                # punctuation from the previous sentence instead of blindly
                # cutting its last character.
                prev = adjusted_sentences.pop()
                combined = f"{prev.rstrip('.!? ')} and {sentence}"
                adjusted_sentences.append(combined)
            else:
                adjusted_sentences.append(sentence)

        return adjusted_sentences

In [6]:
class Summarizer:
    """Extractive summarizer ranking sentences with PageRank over a similarity graph."""

    def __init__(self, context_window_size=4000):
        """Set up NLTK resources, the stopword list and the style analyzer.

        Args:
            context_window_size: Maximum number of NLTK tokens that
                ``hierarchical_summarize`` puts into one chunk, and the size
                below which it stops iterating.
        """
        self.context_window_size = context_window_size
        initialize_nltk()
        try:
            self.stop_words = set(stopwords.words('english'))
        except Exception as e:
            print(f"Warning: Failed to load stopwords, using empty set: {str(e)}")
            self.stop_words = set()
        self.style_analyzer = StyleAnalyzer()

    def get_token_count(self, text):
        """Return the number of NLTK word tokens in ``text``."""
        return len(word_tokenize(text))

    def sent_tokenize(self, text):
        """Wrapper for NLTK's sent_tokenize"""
        return sent_tokenize(text)

    def sentence_similarity(self, sent1, sent2):
        """Cosine similarity of the two sentences' bag-of-words vectors.

        Tokens are lower-cased and stopwords are removed before counting.

        Returns:
            ``1 - cosine_distance`` of the count vectors, or ``0.0`` when either
            sentence has no non-stopword tokens (a zero vector has no defined
            cosine and would otherwise yield NaN).
        """
        words1 = [word.lower() for word in word_tokenize(sent1) if word.lower() not in self.stop_words]
        words2 = [word.lower() for word in word_tokenize(sent2) if word.lower() not in self.stop_words]

        if not words1 or not words2:
            return 0.0

        # Map each distinct word to a vector position; a dict lookup replaces
        # the repeated linear list.index() scans.
        positions = {word: i for i, word in enumerate(set(words1) | set(words2))}

        vector1 = [0] * len(positions)
        vector2 = [0] * len(positions)

        for word in words1:
            vector1[positions[word]] += 1
        for word in words2:
            vector2[positions[word]] += 1

        return 1 - cosine_distance(vector1, vector2)

    def generate_summary(self, text, style_reference=None, target_length=None):
        """Generate summary with style matching.

        Args:
            text: Text to summarize.
            style_reference: Optional text whose style statistics are used to
                split/merge the selected sentences.
            target_length: Optional token budget. Sentences are taken in rank
                order until the next one would exceed the budget. When falsy,
                the top 30% of sentences (at least one) are taken instead.

        Returns:
            The selected sentences joined by spaces, highest-ranked first.
            ``text`` itself is returned when it has at most one sentence.
        """
        sentences = self.sent_tokenize(text)
        if len(sentences) <= 1:
            return text

        # Generate basic summary. The similarity is symmetric, so compute each
        # pair once and mirror it.
        count = len(sentences)
        similarity_matrix = np.zeros((count, count))
        for idx1 in range(count):
            for idx2 in range(idx1 + 1, count):
                similarity = self.sentence_similarity(sentences[idx1], sentences[idx2])
                similarity_matrix[idx1][idx2] = similarity
                similarity_matrix[idx2][idx1] = similarity

        nx_graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(nx_graph)
        ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

        # Select sentences based on target length
        if target_length:
            current_length = 0
            summary_sentences = []
            for _, sentence in ranked_sentences:
                current_length += len(word_tokenize(sentence))
                if current_length > target_length:
                    break
                summary_sentences.append(sentence)
        else:
            num_sentences = max(1, int(len(sentences) * 0.3))
            summary_sentences = [s for _, s in ranked_sentences[:num_sentences]]

        # Apply style matching if reference is provided
        if style_reference:
            ref_style = self.style_analyzer.analyze_style(style_reference)
            summary_sentences = self.style_analyzer.match_style(summary_sentences, ref_style)

        return ' '.join(summary_sentences)

    def hierarchical_summarize(self, text, style_reference=None, target_length=None):
        """Hierarchical summarization with style matching.

        While the text has more tokens than ``context_window_size`` it is split
        at sentence boundaries into chunks of at most that size, each chunk is
        summarized with ``generate_summary`` and the summaries are joined. The
        loop stops as soon as a round fails to shrink the text, which would
        otherwise repeat forever (for example a single sentence longer than
        the window, or a target so large that every sentence is kept).

        Args:
            text: Text to summarize.
            style_reference: Forwarded to ``generate_summary``.
            target_length: Optional overall token budget, divided evenly
                between the chunks of each round.

        Returns:
            The summarized text. It can still exceed the context window if a
            round made no progress.
        """
        current_text = text
        current_count = self.get_token_count(current_text)

        while current_count > self.context_window_size:
            sentences = self.sent_tokenize(current_text)
            chunks = []
            current_chunk = []
            current_length = 0

            for sentence in sentences:
                sentence_length = self.get_token_count(sentence)
                if current_length + sentence_length > self.context_window_size:
                    if current_chunk:
                        chunks.append(' '.join(current_chunk))
                    current_chunk = [sentence]
                    current_length = sentence_length
                else:
                    current_chunk.append(sentence)
                    current_length += sentence_length

            if current_chunk:
                chunks.append(' '.join(current_chunk))

            # Summarize each chunk with style matching
            chunk_target = target_length // len(chunks) if target_length else None
            summaries = [
                self.generate_summary(
                    chunk,
                    style_reference=style_reference,
                    target_length=chunk_target
                ) for chunk in chunks
            ]
            previous_count = current_count
            current_text = ' '.join(summaries)
            current_count = self.get_token_count(current_text)

            if current_count >= previous_count:
                # No reduction this round: further rounds would not help.
                print("Warning: summarization made no progress; result may exceed the context window")
                break

        return current_text

In [7]:
def save_text(text, filename):
    """Write ``text`` to ``filename`` as UTF-8, replacing any existing content."""
    with open(filename, mode='w', encoding='utf-8') as out_file:
        out_file.write(text)

def calculate_proportional_length(ref_length, doc_length, context_window_size):
    """Calculate the target summary length, capped at the context window.

    The original formula ``doc_length * (ref_length / doc_length)`` reduces
    algebraically to ``ref_length``, so the target is the reference length
    limited to ``context_window_size``. It is computed directly here: the
    floating-point round trip could truncate to ``ref_length - 1`` and raised
    ``ZeroDivisionError`` for an empty document.

    Args:
        ref_length: Token count of the style reference.
        doc_length: Token count of the document to summarize.
        context_window_size: Upper bound for the returned length.

    Returns:
        ``0`` when ``doc_length`` is not positive (nothing to summarize),
        otherwise ``min(context_window_size, int(ref_length))``.
    """
    if doc_length <= 0:
        return 0
    return min(context_window_size, int(ref_length))

# Main

In [9]:

# Set up the seeded text generator and the summarizer
generator = TextGenerator(seed=42)
summarizer = Summarizer(context_window_size=4000)

# Build the synthetic inputs: docs[0] is the style reference,
# docs[1] is the text that gets summarized
docs = generator.generate_test_documents(num_docs=2, size_multiplier=2)
style_reference, document_to_summarize = docs[0], docs[1]

# All artefacts are written below this folder
os.makedirs('output', exist_ok=True)

# Step 1: token counts of both documents
ref_length = summarizer.get_token_count(style_reference)
doc_length = summarizer.get_token_count(document_to_summarize)

print("-------------------------\n")
print(f"Style reference length: {ref_length} tokens")
print(f"Document to summarize length: {doc_length} tokens")

# Step 2: derive the summary budget from those counts
target_length = calculate_proportional_length(
    ref_length, doc_length, summarizer.context_window_size
)
print(f"Target summary length: {target_length} tokens")

# Keep copies of the generated inputs next to the summary
save_text(style_reference, 'output/style_reference.txt')
save_text(document_to_summarize, 'output/document_to_summarize.txt')

# Texts larger than the context window go through the hierarchical path,
# everything else is summarized in a single pass; both take the same arguments
summarize = (
    summarizer.hierarchical_summarize
    if doc_length > summarizer.context_window_size
    else summarizer.generate_summary
)
final_summary = summarize(
    document_to_summarize,
    style_reference=style_reference,
    target_length=target_length
)

# Write the summary and report its size
save_text(final_summary, 'output/final_summary.txt')
final_length = summarizer.get_token_count(final_summary)
print(f"Final summary length: {final_length} tokens")
print("Process completed successfully!")




[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


-------------------------

Style reference length: 5361 tokens
Document to summarize length: 14852 tokens
Target summary length: 4000 tokens
Final summary length: 3959 tokens
Process completed successfully!
