In [4]:
import os
from pathlib import Path
import PyPDF2
from docx import Document
from typing import Optional, Dict
import pandas as pd

print("‚úÖ Libraries imported!")

class FileHandler:
    
    def __init__(self):
        self.supported_formats = ['.txt', '.pdf', '.docx']
        print("üìÅ FileHandler initialized")
        print(f"   Supported formats: {', '.join(self.supported_formats)}")
    
    def read_file(self, file_path: str) -> str:
        file_path = Path(file_path)
        
        if not file_path.exists():
            raise FileNotFoundError(f"‚ùå File not found: {file_path}")
        
        ext = file_path.suffix.lower()
        
        if ext not in self.supported_formats:
            raise ValueError(f"‚ùå Unsupported format: {ext}")
        
        if ext == '.txt':
            return self._read_txt(file_path)
        elif ext == '.pdf':
            return self._read_pdf(file_path)
        elif ext == '.docx':
            return self._read_docx(file_path)
    
    def _read_txt(self, file_path: Path) -> str:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                text = f.read()
            
            print(f"‚úÖ Read TXT: {file_path.name}")
            print(f"   Characters: {len(text)}")
            return text
            
        except Exception as e:
            print(f"‚ùå Error reading TXT: {e}")
            return ""
    
    def _read_pdf(self, file_path: Path) -> str:
        try:
            text = ""
            with open(file_path, 'rb') as f:
                pdf = PyPDF2.PdfReader(f)
                pages = len(pdf.pages)
                
                for page_num in range(pages):
                    page = pdf.pages[page_num]
                    text += page.extract_text()
            
            print(f"‚úÖ Read PDF: {file_path.name}")
            print(f"   Pages: {pages}")
            print(f"   Characters: {len(text)}")
            return text
            
        except Exception as e:
            print(f"‚ùå Error reading PDF: {e}")
            return ""
    
    def _read_docx(self, file_path: Path) -> str:
        try:
            doc = Document(file_path)
            text = ""
            
            for para in doc.paragraphs:
                text += para.text + "\n"
            
            for table in doc.tables:
                for row in table.rows:
                    for cell in row.cells:
                        text += cell.text + " "
                    text += "\n"
            
            print(f"‚úÖ Read DOCX: {file_path.name}")
            print(f"   Paragraphs: {len(doc.paragraphs)}")
            print(f"   Characters: {len(text)}")
            return text
            
        except Exception as e:
            print(f"‚ùå Error reading DOCX: {e}")
            return ""
    
    def get_file_info(self, file_path: str) -> Dict:
        file_path = Path(file_path)
        
        if not file_path.exists():
            return None
        
        stat = file_path.stat()
        
        return {
            'name': file_path.name,
            'extension': file_path.suffix,
            'size_bytes': stat.st_size,
            'size_kb': round(stat.st_size / 1024, 2),
            'full_path': str(file_path.absolute())
        }

handler = FileHandler()

def create_sample_files():
    test_dir = Path("../data/test_docs")
    test_dir.mkdir(parents=True, exist_ok=True)
    
    sample1 = """
    Machine learning is a subset of artificial intelligence that focuses on 
    developing algorithms that can learn from and make predictions on data. 
    These algorithms build mathematical models based on sample data, known as 
    training data, to make predictions or decisions without being explicitly 
    programmed to do so.
    """
    
    sample2 = """
    ML is a branch of AI that concentrates on creating algorithms capable of 
    learning from data and making forecasts. Such algorithms construct 
    mathematical frameworks using sample information, called training datasets, 
    to generate predictions or choices without explicit programming.
    """
    
    sample3 = """
    Climate change refers to long-term shifts in temperatures and weather patterns. 
    These shifts may be natural, but since the 1800s, human activities have been 
    the main driver of climate change, primarily due to the burning of fossil fuels 
    like coal, oil, and gas.
    """
    
    files = {
        'ml_original.txt': sample1,
        'ml_paraphrased.txt': sample2,
        'climate.txt': sample3
    }
    
    for filename, content in files.items():
        filepath = test_dir / filename
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content.strip())
        print(f"üìù Created: {filename}")
    
    return test_dir

test_dir = create_sample_files()
print(f"\n‚úÖ Test files ready in: {test_dir}")

print("=" * 60)
print("üìñ READING TEST FILES")
print("=" * 60)

test_files = list(test_dir.glob("*.txt"))

for file in test_files:
    print(f"\nüìÑ Reading: {file.name}")
    print("-" * 40)
    
    content = handler.read_file(file)
    
    preview = content[:200] + "..." if len(content) > 200 else content
    print(f"\nüìù Preview:\n{preview}\n")

print("=" * 60)
print("üìä FILE INFORMATION")
print("=" * 60)

file_info_list = []

for file in test_files:
    info = handler.get_file_info(file)
    file_info_list.append(info)
    
    print(f"\nüìÑ {info['name']}")
    print(f"   Extension: {info['extension']}")
    print(f"   Size: {info['size_kb']} KB")

df = pd.DataFrame(file_info_list)
print("\nüìä Summary Table:")
print(df[['name', 'extension', 'size_kb']])

pdf_path = Path("../data/test_docs/sample.pdf")

if pdf_path.exists():
    print("üìï Testing PDF Reader...")
    pdf_content = handler.read_file(pdf_path)
    print(f"‚úÖ PDF read successfully!")
    print(f"   Characters extracted: {len(pdf_content)}")
    print(f"\n   Preview: {pdf_content[:200]}...")
else:
    print("‚ÑπÔ∏è  No PDF file found. To test PDF reading:")
    print("   1. Add a PDF file to data/test_docs/")
    print("   2. Name it 'sample.pdf'")
    print("   3. Run this cell again")

docx_path = Path("../data/test_docs/sample.docx")

if docx_path.exists():
    print("üìò Testing DOCX Reader...")
    docx_content = handler.read_file(docx_path)
    print(f"‚úÖ DOCX read successfully!")
    print(f"   Characters extracted: {len(docx_content)}")
    print(f"\n   Preview: {docx_content[:200]}...")
else:
    print("‚ÑπÔ∏è  No DOCX file found. To test DOCX reading:")
    print("   1. Add a Word document to data/test_docs/")
    print("   2. Name it 'sample.docx'")
    print("   3. Run this cell again")

def read_all_documents(directory: str) -> Dict[str, str]:
    dir_path = Path(directory)
    documents = {}
    
    print(f"üìÇ Reading all files from: {dir_path}")
    print("-" * 60)
    
    for file_path in dir_path.glob("*"):
        if file_path.suffix in handler.supported_formats:
            try:
                content = handler.read_file(file_path)
                documents[file_path.name] = content
                print(f"‚úÖ {file_path.name}")
            except Exception as e:
                print(f"‚ùå {file_path.name}: {e}")
    
    print(f"\nüìä Total files read: {len(documents)}")
    return documents

all_docs = read_all_documents("../data/test_docs")

print("\nüìã Documents Summary:")
for filename, content in all_docs.items():
    word_count = len(content.split())
    print(f"   ‚Ä¢ {filename}: {word_count} words")

print("üß™ Testing Error Handling")
print("=" * 60)

print("\n1. Testing non-existent file:")
try:
    handler.read_file("../data/test_docs/nonexistent.txt")
except FileNotFoundError as e:
    print(f"   ‚úÖ Caught error: {type(e).__name__}")

print("\n2. Testing unsupported format:")
try:
    dummy = Path("../data/test_docs/test.xyz")
    dummy.touch()
    handler.read_file(dummy)
    dummy.unlink()
except ValueError as e:
    print(f"   ‚úÖ Caught error: {type(e).__name__}")
    if dummy.exists():
        dummy.unlink()

print("\n‚úÖ Error handling working correctly!")

def print_completion_summary():
    print("=" * 60)
    print("‚ú® FILE HANDLER COMPLETE!")
    print("=" * 60)
    
    print("\n‚úÖ What We Built:")
    features = [
        "FileHandler class with unified interface",
        "Support for .txt, .pdf, .docx files",
        "File metadata extraction",
        "Batch file reading capability",
        "Robust error handling",
        "Test suite with sample files"
    ]
    
    for i, feature in enumerate(features, 1):
        print(f"   {i}. {feature}")
    
    print("\nüìä Files in Test Directory:")
    test_files = list(Path("../data/test_docs").glob("*"))
    for f in test_files[:5]:
        print(f"   ‚Ä¢ {f.name}")
    
    print("\nüéØ Next Steps:")
    print("   ‚Ä¢ Move to Notebook 3: Text Preprocessing")
    print("   ‚Ä¢ Implement text cleaning and tokenization")
    print("   ‚Ä¢ Build similarity calculator")
    
    print("\n" + "=" * 60)

print_completion_summary()

save_as_module = """
import os
from pathlib import Path
import PyPDF2
from docx import Document

class FileHandler:
    pass
"""

‚úÖ Libraries imported!
üìÅ FileHandler initialized
   Supported formats: .txt, .pdf, .docx
üìù Created: ml_original.txt
üìù Created: ml_paraphrased.txt
üìù Created: climate.txt

‚úÖ Test files ready in: ..\data\test_docs
üìñ READING TEST FILES

üìÑ Reading: climate.txt
----------------------------------------
‚úÖ Read TXT: climate.txt
   Characters: 276

üìù Preview:
Climate change refers to long-term shifts in temperatures and weather patterns. 
    These shifts may be natural, but since the 1800s, human activities have been 
    the main driver of climate change...


üìÑ Reading: ml_original.txt
----------------------------------------
‚úÖ Read TXT: ml_original.txt
   Characters: 331

üìù Preview:
Machine learning is a subset of artificial intelligence that focuses on 
    developing algorithms that can learn from and make predictions on data. 
    These algorithms build mathematical models bas...


üìÑ Reading: ml_paraphrased.txt
----------------------------------------
‚