In [28]:
# Install required packages
%pip install -q PyMuPDF pillow imagehash nltk torch torchvision ftfy regex tqdm
%pip install -q git+https://github.com/openai/CLIP.git


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [29]:
# Import libraries
import fitz  # PyMuPDF
from PIL import Image
import imagehash
import os
import re
import nltk
import torch
import numpy as np
from collections import Counter
import math
import shutil
import io

# Download NLTK data
def ensure_nltk_resource(probe, candidates):
    """Make an NLTK resource usable, downloading candidate packages until it is.

    ``nltk.download`` reports failure by returning ``False`` rather than by
    raising, so a ``try/except`` around it never reaches its fallback branch.
    Instead, ``probe`` (a zero-argument callable that exercises the resource)
    is run; NLTK raises ``LookupError`` from it while the data is missing.

    Args:
        probe: Callable that raises ``LookupError`` when the resource is absent.
        candidates: Package ids to download, in order of preference.

    Returns:
        True when ``probe`` succeeds, False when every candidate was tried
        and the resource is still unavailable (a warning is printed).
    """
    for resource_id in candidates:
        try:
            probe()
            return True
        except LookupError:
            # Not installed (or wrong flavour for this NLTK version): fetch the next candidate.
            nltk.download(resource_id, quiet=True)

    try:
        probe()
        return True
    except LookupError:
        print(f"⚠ Warning: NLTK resource unavailable after trying: {', '.join(candidates)}")
        return False


# NLTK 3.9+ requires 'punkt_tab' instead of 'punkt'
ensure_nltk_resource(
    lambda: nltk.sent_tokenize("Probe sentence."),
    ['punkt_tab', 'punkt'],
)

# NLTK 3.9+ requires 'averaged_perceptron_tagger_eng' instead of 'averaged_perceptron_tagger'
ensure_nltk_resource(
    lambda: nltk.pos_tag(["probe"]),
    ['averaged_perceptron_tagger_eng', 'averaged_perceptron_tagger'],
)

# download() signals failure through its return value, so check it explicitly.
if not nltk.download('stopwords', quiet=True):
    print("⚠ Warning: could not download NLTK 'stopwords'")

print("✓ All libraries imported successfully")


✓ All libraries imported successfully


In [30]:
# Configuration
# Report to analyse; swap in the commented alternative to analyse another file.
PDF_PATH = "Team 75 Report.pdf"
#PDF_PATH = "Team_99_report.pdf"

# Every artefact written by this notebook lives under OUTPUT_FOLDER.
OUTPUT_FOLDER = "analysis_output"
CLEANED_TEXT_FILE, IMAGES_FOLDER, REPORT_FILE = (
    os.path.join(OUTPUT_FOLDER, leaf)
    for leaf in ("cleaned_text.txt", "images", "analysis_report.txt")
)

# Echo the active settings so the run is self-describing.
print("\n".join((
    "Configuration:",
    f"  PDF: {PDF_PATH}",
    f"  Output: {OUTPUT_FOLDER}",
)))


Configuration:
  PDF: Team 75 Report.pdf
  Output: analysis_output


In [31]:
# Clean output directory for fresh start
def clean_output_directory(output_folder, images_folder):
    """Delete any previous output tree, then (re)create the output and image folders.

    A failure to delete the old tree is reported as a warning and does not
    stop the folders from being created.
    """
    had_previous_output = os.path.exists(output_folder)
    if had_previous_output:
        try:
            shutil.rmtree(output_folder)
        except Exception as e:
            print(f"  ⚠ Warning: Could not clean directory: {e}")
        else:
            print(f"  ✓ Cleaned old output directory")

    # exist_ok keeps this safe when the removal above failed or was skipped.
    for directory in (output_folder, images_folder):
        os.makedirs(directory, exist_ok=True)
    print(f"  ✓ Created fresh output directories")

# Start every run from an empty output tree so stale artefacts from a
# previous report cannot be mistaken for results of this one.
print("Preparing output directories...")
clean_output_directory(OUTPUT_FOLDER, IMAGES_FOLDER)
print()


Preparing output directories...
  ✓ Cleaned old output directory
  ✓ Created fresh output directories



## Text Extraction & Cleaning


In [32]:
def normalize(s):
    """Return ``s`` lowercased with all whitespace runs collapsed to single spaces."""
    tokens = s.split()  # split() without arguments also discards leading/trailing whitespace
    collapsed = ' '.join(tokens)
    return collapsed.lower()

def detect_repeating_patterns(pdf_path, min_occurrences=3):
    """Detect repeating headers/footers"""
    doc = fitz.open(pdf_path)
    header_candidates = {}
    footer_candidates = {}
    
    for page in doc:
        rect = page.rect
        
        # Header area (top 100px)
        header_rect = fitz.Rect(rect.x0, rect.y0, rect.x1, rect.y0 + 100)
        header_lines = [l.strip() for l in page.get_text("text", clip=header_rect).split('\n') 
                       if l.strip() and len(l.strip()) > 2 and not l.strip().isdigit()]
        
        # Footer area (bottom 100px)
        footer_rect = fitz.Rect(rect.x0, rect.y1 - 100, rect.x1, rect.y1)
        footer_lines = [l.strip() for l in page.get_text("text", clip=footer_rect).split('\n') 
                       if l.strip() and len(l.strip()) > 2 and not l.strip().isdigit()]
        
        for line in header_lines:
            header_candidates[line] = header_candidates.get(line, 0) + 1
        for line in footer_lines:
            footer_candidates[line] = footer_candidates.get(line, 0) + 1
    
    headers = {line for line, count in header_candidates.items() if count >= min_occurrences}
    footers = {line for line, count in footer_candidates.items() if count >= min_occurrences}
    
    return headers, footers

def extract_text_without_headers_footers(pdf_path):
    """Extract the PDF's text line by line, dropping boilerplate lines.

    Dropped lines are: empty lines, lines matching a repeating header/footer
    found by ``detect_repeating_patterns`` (compared after ``normalize``),
    lines that are only digits (page numbers), and separator lines made of
    underscores, dashes and whitespace.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The kept lines, in document order, joined with newlines.
    """
    # Detect patterns once, then compare in normalized form so spacing and
    # case differences between pages do not defeat the match.
    headers, footers = detect_repeating_patterns(pdf_path)
    boilerplate = {normalize(line) for line in headers | footers}
    separator_line = re.compile(r'^[_\-\s]+$')

    kept_lines = []
    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for line in page.get_text("text").split('\n'):
                line_norm = normalize(line)

                if not line_norm:                   # empty
                    continue
                if line_norm in boilerplate:        # repeating header/footer
                    continue
                if line.strip().isdigit():          # page number
                    continue
                if separator_line.match(line):      # rule made of _ / - characters
                    continue

                kept_lines.append(line)

    return '\n'.join(kept_lines)

def clean_text(text):
    """Trim extracted report text down to its body.

    Three passes are applied in order:
      1. keep everything from the first "ABSTRACT" (case-insensitive) onwards;
      2. cut at the first "References" heading line or the first line that
         starts with a bracketed number such as ``[1]``;
      3. collapse a "TABLE OF CONTENTS" / "LIST OF FIGURES" / "LIST OF TABLES"
         listing into the introduction marker that terminates it.

    A warning is printed when pass 1 or pass 2 finds nothing to anchor on.
    """
    # Pass 1: start at the abstract.
    abstract_hit = re.search(r'\s*ABSTRACT', text, re.IGNORECASE)
    if abstract_hit is None:
        print("  ⚠ Warning: ABSTRACT section not found.")
        body = text
    else:
        body = text[abstract_hit.start():]

    # Pass 2: drop the references section and everything after it.
    references_hit = re.search(
        r'(^\s*(References|REFERENCES):?\s*$)|(^\s*\[\d+\].*)', body, re.MULTILINE
    )
    if references_hit is None:
        print("  ⚠ Warning: References section not found.")
    else:
        body = body[:references_hit.start()].strip()

    # Pass 3: each pattern spans from a listing heading up to and including
    # the introduction marker (group 1), which is what survives the substitution.
    list_patterns = [
        r'^\s*TABLE OF CONTENTS.*?^\s*(INTRODUCTION|Chapter\s+I[:\s\.]|1\.\s+Introduction)', 
        r'^\s*LIST OF FIGURES.*?^\s*(INTRODUCTION|Chapter\s+I[:\s\.]|1\.\s+Introduction)', 
        r'^\s*LIST OF TABLES.*?^\s*(INTRODUCTION|Chapter\s+I[:\s\.]|1\.\s+Introduction)'
    ]
    listing_flags = re.IGNORECASE | re.DOTALL | re.MULTILINE

    for source in list_patterns:
        listing = re.compile(source, listing_flags)
        hit = listing.search(body)
        if hit is None:
            continue
        replacement = hit.group(1) or ''
        body = listing.sub(replacement, body, count=1).strip()

    return body.strip()

full_text = extract_text_without_headers_footers(PDF_PATH)

# Strategy: find the real content AFTER the List of Figures/Tables table ends.
# The LOF heading is followed by the table content (Figure No., Title, Page No., entries);
# real content (Introduction, etc.) comes after this table.
LOF_TABLE_SKIP_CHARS = 150       # characters assumed to belong to the LOF table itself
HEADING_LOOKAHEAD_CHARS = 200    # window inspected after a candidate heading
MIN_PARAGRAPH_LINE_CHARS = 50    # a line longer than this counts as paragraph text
LOF_FALLBACK_SKIP_CHARS = 300    # blind skip used when no content heading is found

lof_match = re.search(r'^\s*(List of Figures|List of Tables)\s*$', full_text, re.IGNORECASE | re.MULTILINE)

if lof_match:

    # Skip past the LOF heading AND its table content before searching
    search_start = lof_match.end() + LOF_TABLE_SKIP_CHARS
    search_text = full_text[search_start:]

    # Candidate first-section headings, tried in this order of preference
    content_patterns = [
        r'(^\s*Introduction\s*$)',
        r'(^\s*INTRODUCTION\s*$)',
        r'(^\s*Problem Statement\s*$)',
        r'(^\s*PROBLEM STATEMENT\s*$)'
    ]

    content_start = None
    for pattern in content_patterns:
        match = re.search(pattern, search_text, re.MULTILINE)
        if match:
            # Verify it's real content (has paragraph text after it), not another listing entry
            text_after = search_text[match.end():match.end() + HEADING_LOOKAHEAD_CHARS]
            has_content = any(len(line.strip()) > MIN_PARAGRAPH_LINE_CHARS for line in text_after.split('\n'))

            if has_content:
                content_start = search_start + match.start()
                break

    # Compare against None explicitly: an offset is a number, and a numeric
    # truthiness test would misread a legitimate offset of 0 as "not found".
    if content_start is not None:
        full_text = full_text[content_start:]
    else:
        # Fallback: no heading found, so just skip a fixed distance past the LOF heading
        full_text = full_text[lof_match.end() + LOF_FALLBACK_SKIP_CHARS:]

# Remove references - cut at the References/Bibliography heading line, if any
ref_heading_match = re.search(r'^\s*(References|REFERENCES|References and Bibliography|REFERENCES/BIBLIOGRAPHY|Bibliography|BIBLIOGRAPHY):?\s*$', full_text, re.MULTILINE)

if ref_heading_match:
    full_text = full_text[:ref_heading_match.start()]



# Add blank line before headings for clarity
def add_spacing_before_headings(text):
    """Insert an empty line ahead of every heading-like line that follows a non-blank line.

    A line (stripped, non-empty, shorter than 100 characters) counts as a heading when it is:
      * all caps with at most 8 words (e.g. "INTRODUCTION");
      * a bare section number (e.g. "6.1.1", "6.1 :", "1.");
      * a section number followed by a capitalised title (e.g. "6.1.1 Introduction:");
      * a reference marker (e.g. "Reference [1]");
      * title case with at most 5 words (e.g. "Problem Statement");
      * a single capitalised word longer than 3 characters, optionally ending in
        ':', when the previous line ends with '.' or ':'.
    """
    bare_number = re.compile(r'^\d+\.\d*\.?\d*\s*:?\s*$')
    numbered_title = re.compile(r'^\d+\.\d*\.?\d*\s*[:\-]?\s*[A-Z]')
    reference_marker = re.compile(r'^Reference\s*\[\d+\]', re.IGNORECASE)
    title_case = re.compile(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\s*$')
    lone_word = re.compile(r'^[A-Z][a-z]+:?\s*$')

    def looks_like_heading(candidate, before):
        # `before` is the stripped previous line and is non-empty when called.
        if not candidate or len(candidate) >= 100:
            return False
        word_total = len(candidate.split())
        if candidate.isupper() and word_total <= 8:
            return True
        if bare_number.match(candidate) or numbered_title.match(candidate):
            return True
        if reference_marker.match(candidate):
            return True
        if title_case.match(candidate) and word_total <= 5:
            return True
        if lone_word.match(candidate) and len(candidate) > 3:
            # Only a heading if the previous line finished a sentence or a label.
            return before.endswith(('.', ':'))
        return False

    spaced = []
    before = ''
    for position, row in enumerate(text.split('\n')):
        candidate = row.strip()
        # No spacer before the first line or when the previous line is already blank.
        if position > 0 and before and looks_like_heading(candidate, before):
            spaced.append('')
        spaced.append(row)
        before = candidate

    return '\n'.join(spaced)

# Final tidy-up: trim surrounding whitespace, then space out headings so the
# saved file is easier to read.
cleaned_text = full_text.strip()
cleaned_text = add_spacing_before_headings(cleaned_text)

# Persist the cleaned text; cleaned_text itself is reused by the quality-metrics cell.
with open(CLEANED_TEXT_FILE, 'w', encoding='utf-8') as f:
    f.write(cleaned_text)
print(f"✓ Saved to: {CLEANED_TEXT_FILE}")


✓ Saved to: analysis_output\cleaned_text.txt


## Heading Validation


In [33]:
def extract_headings_by_fontsize(pdf_path, min_fontsize=14):
    """Collect lowercased text set in a font size of at least ``min_fontsize``.

    Every qualifying span contributes its own stripped text. When one line
    holds several qualifying spans (a heading split by a font or style
    change), the spans are additionally joined so the heading can be matched
    as a whole: once concatenated directly and once with spaces, both with
    whitespace collapsed.

    Args:
        pdf_path: Path of the PDF to scan.
        min_fontsize: Minimum span size to treat as heading text.

    Returns:
        A set of lowercase strings.
    """
    headings = set()

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            page_dict = page.get_text("dict")
            for block in page_dict["blocks"]:
                if block["type"] != 0:  # only text blocks carry lines/spans
                    continue
                for line in block["lines"]:
                    large_pieces = [
                        span["text"] for span in line["spans"]
                        if span["size"] >= min_fontsize
                    ]

                    for piece in large_pieces:
                        text = piece.strip()
                        if text:
                            headings.add(text.lower())

                    if len(large_pieces) > 1:
                        # Spans may or may not carry their own separating spaces,
                        # so record both ways of gluing them together.
                        for glue in ("", " "):
                            joined = ' '.join(glue.join(large_pieces).split())
                            if joined:
                                headings.add(joined.lower())

    return headings

def validate_headings(pdf_path, min_fontsize=14):
    """Report which expected report sections show up among the PDF's large-font text.

    Each expected section is a list of accepted spellings. A section counts as
    found when any spelling is a substring of some extracted heading, or, for
    spellings containing " and " or "/", when any of its parts is.

    Returns:
        ``(found, missing, total)`` where ``found`` and ``missing`` hold the
        first spelling of each section and ``total`` is the number of
        expected sections.
    """
    # Expected sections based on table of contents
    EXPECTED_SECTIONS = [
        ["DECLARATION", "Declaration"],
        ["ACKNOWLEDGEMENT", "Acknowledgement"],
        ["TABLE OF CONTENTS", "Table of Contents"],
        ["LIST OF FIGURES", "List of Figures"],
        ["LIST OF TABLES", "List of Tables"],
        ["INTRODUCTION"],
        ["PROBLEM STATEMENT"],
        ["ABSTRACT AND SCOPE"],
        ["RESEARCH / TECHNOLOGY GAP AND CHALLENGES", "RESEARCH", "TECHNOLOGY GAP"],
        ["OBJECTIVES"],
        ["LITERATURE SURVEY"],
        ["Overview of Datasets"],
        ["CONCLUSION OF CAPSTONE PROJECT PHASE - 1", "CONCLUSION"],
        ["PLAN OF WORK FOR CAPSTONE PROJECT PHASE - 2", "PLAN OF WORK"],
        ["REFERENCES/BIBLIOGRAPHY", "REFERENCES", "BIBLIOGRAPHY"],
        ["APPENDIX A DEFINITIONS, ACRONYMS, AND ABBREVIATIONS", "APPENDIX"]
    ]

    large_text = extract_headings_by_fontsize(pdf_path, min_fontsize)

    def appears(fragment):
        """True when ``fragment`` is a substring of any extracted heading."""
        return any(fragment in heading for heading in large_text)

    def section_present(spellings):
        for spelling in spellings:
            lowered = spelling.lower()
            if appears(lowered):
                return True
            # Compound headings ("A AND B", "A / B"): one matching part is enough.
            if ' and ' in lowered or '/' in spelling:
                parts = (part.strip() for part in re.split(r'\s+and\s+|/', lowered))
                if any(appears(part) for part in parts if part):
                    return True
        return False

    found, missing = [], []
    for group in EXPECTED_SECTIONS:
        bucket = found if section_present(group) else missing
        bucket.append(group[0])

    return found, missing, len(EXPECTED_SECTIONS)

# Run the heading check with the default minimum font size and summarise it;
# the missing sections are listed only when there are any.
found, missing, total = validate_headings(PDF_PATH)
print(f"\n✓ Found: {len(found)}/{total} expected sections")
if missing:
    print(f"  Missing: {', '.join(missing)}")



✓ Found: 15/16 expected sections
  Missing: LIST OF TABLES


## Image Extraction & CLIP Analysis


In [34]:
def find_content_start_page(pdf_path):
    """Return the 0-based index of the page where the main content is assumed to start.

    Pages are scanned in order. The first page whose upper-cased text contains
    "INTRODUCTION", "ABSTRACT AND SCOPE" or "CHAPTER I" as whole words is
    returned. Otherwise, if the page contains "LIST OF FIGURES", the page after
    it is returned. If no page matches, index 9 (page 10) is returned.

    NOTE(review): a table-of-contents page that lists "Introduction" also
    satisfies the first test, so the result may be the TOC page — confirm this
    is acceptable for the reports being analysed.
    """
    # The context manager closes the document on every return path.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            text = page.get_text("text").upper()

            # Look for introduction or abstract section
            if re.search(r'\b(INTRODUCTION|ABSTRACT AND SCOPE|CHAPTER\s+I)\b', text):
                print(f"  Content starts at page {page_num + 1}")
                return page_num

            # Also check if we're past "LIST OF FIGURES"
            if "LIST OF FIGURES" in text:
                # Content likely starts on next page or soon after
                print(f"  Found 'List of Figures' on page {page_num + 1}, starting extraction from page {page_num + 2}")
                return page_num + 1

    # If not found, start from page 10 (conservative default)
    print(f"  Could not find content start, defaulting to page 10")
    return 9

def _select_nearby_text(raw_text):
    """Choose the caption/paragraph text to pair with an image from clipped page text.

    Known header/footer lines are removed, the remaining lines are grouped
    into paragraphs at empty lines, and complete paragraphs (preferring
    figure/table captions) are returned up to about 1000 characters. Falls
    back to the first complete-looking sentences, then to the first 500 characters.
    """
    # Split into paragraphs (separated by empty lines)
    paragraphs = []
    current_para = []

    for line in raw_text.split('\n'):
        line_stripped = line.strip()

        # Skip obvious headers/footers
        if line_stripped.lower() in ['dept. of cse', 'department of cse']:
            continue
        if re.match(r'^(aug|jan|feb|mar|apr|may|jun|jul|sep|oct|nov|dec)[-\s]+(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec),?\s*\d{4}$', line_stripped, re.IGNORECASE):
            continue
        if line_stripped.lower() == 'capstone tracker with an integrated evaluation system':
            continue
        if re.match(r'^\d+$', line_stripped) and len(line_stripped) <= 3:
            continue
        if re.match(r'^[_\-]+$', line_stripped):
            continue

        if not line_stripped:
            # Empty line - end current paragraph
            if current_para:
                paragraphs.append(' '.join(current_para))
                current_para = []
        else:
            current_para.append(line_stripped)

    # Don't forget the last paragraph
    if current_para:
        paragraphs.append(' '.join(current_para))

    # Filter and select COMPLETE paragraphs (no cut-off sentences)
    good_paragraphs = []
    for para in paragraphs:
        # Skip very short paragraphs (likely fragments)
        if len(para) < 30:
            continue

        # Skip paragraphs that start with lowercase (cut off mid-sentence)
        if para[0].islower():
            continue

        # Skip paragraphs without closing punctuation, unless they look like
        # a figure caption or title
        if para[-1] not in '.!?':
            if not re.search(r'\b(Figure|Fig\.|Table|Chapter|Section)\s*\d+', para, re.IGNORECASE):
                continue

        # Prefer paragraphs with figure captions
        if re.search(r'\b(Figure|Fig\.|Table)\s*\d+', para, re.IGNORECASE):
            good_paragraphs.insert(0, para)  # Put at front
        else:
            good_paragraphs.append(para)

    # Take complete paragraphs while they fit the character budget
    selected_paras = []
    total_length = 0
    max_chars = 1000  # Allow more chars for complete thoughts

    for para in good_paragraphs[:5]:  # Check first 5 paragraphs
        if total_length + len(para) < max_chars:
            selected_paras.append(para)
            total_length += len(para)
        elif total_length == 0:  # If first paragraph is long, take it anyway
            selected_paras.append(para[:max_chars])
            break
        else:
            break

    if selected_paras:
        return '\n\n'.join(selected_paras)

    # Fallback: look for any complete sentence
    all_text = ' '.join(paragraphs)
    sentences = re.split(r'(?<=[.!?])\s+', all_text)
    complete_sentences = [s for s in sentences if s and s[0].isupper() and len(s) > 20]
    if complete_sentences:
        return ' '.join(complete_sentences[:3])
    return all_text[:500] if all_text else ""


def extract_images_with_text(pdf_path, output_folder, margin=100):
    """Extract unique images and nearby text (only from main content pages).

    For every page from ``find_content_start_page(pdf_path)`` onwards, each
    embedded image whose perceptual hash has not been seen yet is written to
    ``output_folder`` as ``image_p<page>_i<index>.<ext>``, together with a
    ``..._text.txt`` file holding the text selected from around the image.

    Args:
        pdf_path: Path of the PDF to read.
        output_folder: Existing directory that receives the image and text files.
        margin: Accepted for backward compatibility; currently unused. The
            area searched for text is always the image rectangle grown by
            200 units on every side.

    Returns:
        A list of dicts with keys ``'path'``, ``'text'``, ``'page'`` (1-based)
        and ``'text_file'``, one per saved image.
    """
    seen_hashes = set()
    image_data = []
    context_padding = 200  # how far around the image to look for text

    # Find where main content starts
    start_page = find_content_start_page(pdf_path)

    # The context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            # Skip pages before content starts
            if page_num < start_page:
                continue

            images = page.get_images(full=True)
            page_dict = page.get_text("dict")
            # NOTE(review): the i-th entry of get_images() is assumed to correspond
            # to the i-th image block of the text dict — confirm for the target PDFs.
            img_blocks = [b for b in page_dict["blocks"] if b["type"] == 1]

            for img_index, img in enumerate(images):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]

                # Check if unique (perceptual hash of a normalised copy)
                with Image.open(io.BytesIO(image_bytes)) as pil_image:
                    normalized = pil_image.convert("RGB").resize((256, 256))
                    img_hash = imagehash.phash(normalized)

                if img_hash in seen_hashes:
                    continue
                seen_hashes.add(img_hash)

                # Save image
                filename = f"image_p{page_num+1}_i{img_index+1}.{image_ext}"
                filepath = os.path.join(output_folder, filename)
                with open(filepath, "wb") as f:
                    f.write(image_bytes)

                # Locate the image on the page, if a matching image block exists
                img_rect = None
                if img_index < len(img_blocks):
                    img_rect = fitz.Rect(img_blocks[img_index]["bbox"])

                nearby_text = ""
                if img_rect:
                    # Use a larger area to ensure we get complete paragraphs
                    expanded_rect = fitz.Rect(
                        max(0, img_rect.x0 - context_padding),
                        max(0, img_rect.y0 - context_padding),
                        min(page.rect.width, img_rect.x1 + context_padding),
                        min(page.rect.height, img_rect.y1 + context_padding)
                    )
                    raw_text = page.get_text("text", clip=expanded_rect)
                    nearby_text = _select_nearby_text(raw_text)

                # Save nearby text to .txt file (for demos)
                text_filename = f"image_p{page_num+1}_i{img_index+1}_text.txt"
                text_filepath = os.path.join(output_folder, text_filename)
                with open(text_filepath, "w", encoding="utf-8") as f:
                    f.write(nearby_text.strip())

                # Store in memory for CLIP processing
                image_data.append({
                    'path': filepath,
                    'text': nearby_text.strip(),
                    'page': page_num + 1,
                    'text_file': text_filepath  # Also store path to text file
                })

    return image_data

# Banner for the image-extraction stage.
print("="*60)
print("STEP 3: IMAGE EXTRACTION")
print("="*60)
print("\nExtracting images and nearby text...")

# image_data (one dict per saved image) is consumed by the CLIP cell below.
image_data = extract_images_with_text(PDF_PATH, IMAGES_FOLDER)
print(f"\n✓ Extracted {len(image_data)} unique images")


STEP 3: IMAGE EXTRACTION

Extracting images and nearby text...
  Content starts at page 5

✓ Extracted 9 unique images


In [35]:
# Load CLIP model
# The import lives inside the try block on purpose: if the CLIP package is
# missing or the model cannot be loaded, the notebook keeps running and the
# similarity step is skipped via clip_available.
print("\nLoading CLIP model...")
try:
    import clip
    # Prefer the GPU when torch reports one, otherwise run on the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess = clip.load("ViT-B/32", device=device)
    print(f"✓ CLIP model loaded (device: {device})")
    clip_available = True
except Exception as e:
    print(f"⚠ Could not load CLIP: {e}")
    clip_available = False

def calculate_clip_similarity(image_path, text):
    """Calculate cosine similarity between image and text using CLIP.

    Args:
        image_path: Path of the image file to encode.
        text: Text to compare with the image; only its first 200 characters
            are tokenized.

    Returns:
        The cosine similarity as a float. ``0.0`` is returned when CLIP is not
        available, when ``text`` is blank, or when processing fails (the error
        is printed).
    """
    if not clip_available or not text.strip():
        return 0.0

    try:
        # Close the file handle as soon as the image has been preprocessed.
        with Image.open(image_path) as pil_image:
            image = preprocess(pil_image).unsqueeze(0).to(device)

        # Encode text (truncate if too long)
        text_truncated = text[:200]
        text_tokens = clip.tokenize([text_truncated], truncate=True).to(device)

        # Inference only: no_grad avoids building an autograd graph for every image.
        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text_tokens)

            # Calculate cosine similarity
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            similarity = (image_features @ text_features.T).item()

        return similarity
    except Exception as e:
        print(f"  Error processing {image_path}: {e}")
        return 0.0

# Calculate similarities
# Each entry of image_data gains a 'similarity' key; avg_similarity is the
# mean over all images (0 when the analysis is skipped).
if clip_available and image_data:
    print("\nCalculating image-text similarities...")
    similarities = []
    for i, img_info in enumerate(image_data, 1):
        sim = calculate_clip_similarity(img_info['path'], img_info['text'])
        similarities.append(sim)
        img_info['similarity'] = sim
        # Progress line every fifth image
        if i % 5 == 0:
            print(f"  Processed {i}/{len(image_data)} images")
    print("Processed all pictures")
    avg_similarity = sum(similarities) / len(similarities) if similarities else 0
    print(f"\n✓ Average similarity: {avg_similarity:.4f}")
else:
    print("\n⚠ Skipping CLIP analysis")
    similarities = []
    avg_similarity = 0



Loading CLIP model...
✓ CLIP model loaded (device: cpu)

Calculating image-text similarities...
  Processed 5/9 images
Proccessed all pictures

✓ Average similarity: 0.2028


## Quality Metrics


In [36]:
# Quality metric functions
def count_syllables(word):
    """Estimate the syllable count of ``word``.

    Each maximal run of the letters a, e, i, o, u, y counts as one syllable,
    one is subtracted when the word ends in 'e', and the result is never
    reported as zero.
    """
    lowered = word.lower()
    vowels = "aeiouy"

    # A vowel run starts wherever a vowel follows a non-vowel (or the word start);
    # the leading space acts as the "non-vowel" before the first character.
    run_starts = sum(
        1
        for before, current in zip(' ' + lowered, lowered)
        if current in vowels and before not in vowels
    )

    if lowered.endswith('e'):
        run_starts -= 1

    return 1 if run_starts == 0 else run_starts

def gunning_fog_index(text):
    """Gunning Fog Index of ``text``: 0.4 * (words per sentence + percentage of complex words).

    Only purely alphabetic tokens count as words; a word is complex when
    ``count_syllables`` reports three or more syllables. Returns 0 when the
    text has no sentences or no words.
    """
    sentence_total = len(nltk.sent_tokenize(text))
    alpha_tokens = [token for token in nltk.word_tokenize(text) if token.isalpha()]

    if sentence_total == 0 or not alpha_tokens:
        return 0

    word_total = len(alpha_tokens)
    complex_total = sum(1 for token in alpha_tokens if count_syllables(token) >= 3)

    avg_sentence_length = word_total / sentence_total
    percent_complex = (complex_total / word_total) * 100

    return 0.4 * (avg_sentence_length + percent_complex)

def automated_readability_index(text):
    """Automated Readability Index: 4.71 * letters/word + 0.5 * words/sentence - 21.43.

    Only purely alphabetic tokens count as words. Returns 0 when the text has
    no sentences or no words.
    """
    sentences = nltk.sent_tokenize(text)
    alpha_tokens = list(filter(str.isalpha, nltk.word_tokenize(text)))

    if not (sentences and alpha_tokens):
        return 0

    letters_per_word = sum(map(len, alpha_tokens)) / len(alpha_tokens)
    words_per_sentence = len(alpha_tokens) / len(sentences)

    return 4.71 * letters_per_word + 0.5 * words_per_sentence - 21.43

def calculate_lexical_density(text):
    """Share of content words (noun, verb, adjective, adverb tags) among the alphabetic tokens.

    The text is lowercased before tokenizing and tagging. Returns 0 when
    there are no alphabetic tokens.
    """
    alpha_tokens = [token for token in nltk.word_tokenize(text.lower()) if token.isalpha()]
    if not alpha_tokens:
        return 0

    # Penn Treebank tags treated as content-bearing.
    content_tags = {
        'NN', 'NNS', 'NNP', 'NNPS',
        'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
        'JJ', 'JJR', 'JJS',
        'RB', 'RBR', 'RBS',
    }
    content_total = 0
    for _token, tag in nltk.pos_tag(alpha_tokens):
        if tag in content_tags:
            content_total += 1

    return content_total / len(alpha_tokens)


def indecisive_word_index(text):
    """Position-weighted balance of hedging against assertive/booster words.

    Every token of the lowercased text is weighted by ``1 - index / token_count``
    (earlier tokens weigh more). Hedging words add their weight, assertive
    words subtract it, booster words subtract half of it. The total is divided
    by the token count. Returns 0 for text without tokens.
    """
    tokens = nltk.word_tokenize(text.lower())

    hedging_words = {'possibly', 'perhaps', 'might', 'could', 'may', 'seems', 'appears', 'suggests', 'indicates', 'likely'}
    assertive_words = {'definitely', 'certainly', 'clearly', 'obviously', 'undoubtedly', 'proves', 'demonstrates', 'shows'}
    booster_words = {'very', 'extremely', 'highly', 'strongly', 'significantly'}

    if not tokens:
        return 0

    # Signed strength per word; the three vocabularies do not overlap.
    strength = {}
    strength.update(dict.fromkeys(hedging_words, 1))
    strength.update(dict.fromkeys(assertive_words, -1))
    strength.update(dict.fromkeys(booster_words, -0.5))

    token_total = len(tokens)
    score = 0
    for index, token in enumerate(tokens):
        if token not in strength:
            continue
        position_weight = 1 - (index / token_total)
        score += strength[token] * position_weight

    return score / token_total

# Compute all four quality metrics on the cleaned report text.
fog = gunning_fog_index(cleaned_text)
ari = automated_readability_index(cleaned_text)
lex_density = calculate_lexical_density(cleaned_text)
indecisive_idx = indecisive_word_index(cleaned_text)

print(f"\n  Gunning Fog Index: {fog:.2f}")
print(f"  Automated Readability Index: {ari:.2f}")
print(f"  Lexical Density: {lex_density:.4f}")
# A negative index (assertive/booster words outweigh hedging words) is
# displayed as a plain 0 rather than as a negative number.
if(indecisive_idx<0):
    print(f"  Indecisive Word Index: {0}")
else:
    print(f"  Indecisive Word Index: {indecisive_idx:.4f}")




  Gunning Fog Index: 17.50
  Automated Readability Index: 14.37
  Lexical Density: 0.6403
  Indecisive Word Index: 0
