# Notebook 3: Utility Functions & Helpers
## Common utilities for all modules

**Purpose**: Shared utility functions for text processing, data handling, file I/O, and text similarity metrics

**Can run independently?** ✅ YES (2 min)


## Installation

In [None]:
# Install dependencies
# `!pip` is an IPython shell escape, so this cell only runs inside a notebook kernel;
# `-q` keeps pip's output quiet.
!pip install numpy pandas scikit-learn nltk -q
import nltk
# Download the NLTK 'punkt' and 'averaged_perceptron_tagger' resources without
# printing progress output.
# NOTE(review): no function visible in this notebook calls nltk — presumably these
# resources are needed by later notebooks; confirm.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
print("✓ Dependencies installed")

## Imports

In [None]:
import re
import json
import pickle
from typing import List, Dict, Tuple, Any
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime
import difflib

print("✓ Imports successful")

## Text Processing Utilities

In [None]:
def clean_text(text: str) -> str:
    """
    Clean and normalize text.

    Removes every character other than ASCII letters, digits, whitespace and
    the punctuation marks ``.``, ``,`` and ``-`` (non-ASCII letters are
    removed as well), collapses each whitespace run to a single space, trims
    both ends, and lowercases the result.

    Args:
        text: Input text

    Returns:
        Cleaned text
    """
    # Remove special characters first. Deleting a symbol that sits between
    # two spaces (e.g. "a ! b") would otherwise leave a double or trailing
    # space behind if whitespace had already been normalized.
    text = re.sub(r'[^a-zA-Z0-9\s.,-]', '', text)

    # Collapse whitespace runs and strip leading/trailing whitespace
    text = ' '.join(text.split())

    # Lowercase
    return text.lower()


def normalize_whitespace(text: str) -> str:
    """
    Collapse whitespace runs and trim the ends of a string.

    Args:
        text: Input text

    Returns:
        The text with every run of whitespace replaced by one space and
        no leading or trailing whitespace
    """
    # Squeeze each whitespace run down to a single space, then trim the ends
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()


def extract_keywords(text: str, num_keywords: int = 10) -> List[str]:
    """
    Extract the most frequent keywords from text.

    This is a plain term-frequency count, not TF-IDF: the text is lowercased
    and split on whitespace, surrounding punctuation is stripped from each
    token, stop words and tokens of two characters or fewer are dropped, and
    the remaining tokens are ranked by how often they occur.

    Args:
        text: Input text
        num_keywords: Number of keywords to extract

    Returns:
        List of keywords, most frequent first (ties keep first-seen order)
    """
    import string
    from collections import Counter

    # Filter common stop words
    stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'is', 'was', 'are', 'be', 'to', 'of', 'in', 'at', 'for'}

    # Strip punctuation from both ends of every token so that "language."
    # and "language" are counted as the same keyword; punctuation inside a
    # token (e.g. "state-of-the-art") is left untouched.
    tokens = (raw.strip(string.punctuation) for raw in text.lower().split())
    keywords = [w for w in tokens if w not in stop_words and len(w) > 2]

    # Get unique and most frequent
    keyword_counts = Counter(keywords)

    return [word for word, _ in keyword_counts.most_common(num_keywords)]


def remove_duplicates(text_list: List[str]) -> List[str]:
    """
    Drop repeated strings, keeping the first occurrence of each.

    Args:
        text_list: List of texts

    Returns:
        New list holding each distinct text once, in first-seen order
    """
    # dict keys are unique and remember insertion order, which gives an
    # order-preserving de-duplication in a single pass.
    return list(dict.fromkeys(text_list))


def split_into_sentences(text: str) -> List[str]:
    """
    Break text into sentences at runs of ``.``, ``!`` or ``?``.

    Args:
        text: Input text

    Returns:
        List of sentences with surrounding whitespace removed; the
        terminating punctuation is not kept and empty pieces are dropped
    """
    # Simple splitting: every run of terminators is treated as a boundary
    fragments = re.split(r'[.!?]+', text)
    trimmed = map(str.strip, fragments)

    return [fragment for fragment in trimmed if fragment]

print("✓ Text processing utilities created")

## Data Processing Utilities

In [None]:
def batch_sequences(sequences: List[Any], batch_size: int):
    """
    Lazily slice a list into consecutive batches.

    Args:
        sequences: List of sequences
        batch_size: Size of each batch

    Yields:
        Consecutive slices of at most ``batch_size`` items; the final
        slice may be shorter
    """
    yield from (
        sequences[start:start + batch_size]
        for start in range(0, len(sequences), batch_size)
    )


def pad_sequences(sequences: List[List[int]], max_len: int, pad_value: int = 0) -> List[List[int]]:
    """
    Force every sequence to exactly ``max_len`` items.

    Args:
        sequences: List of sequences
        max_len: Maximum length
        pad_value: Value to use for padding

    Returns:
        New list in which shorter sequences are right-padded with
        ``pad_value`` and longer ones are cut off after ``max_len`` items
    """
    return [
        seq[:max_len] if len(seq) >= max_len
        else seq + [pad_value] * (max_len - len(seq))
        for seq in sequences
    ]


def split_data(data: List[Any], train_ratio: float = 0.8, shuffle: bool = True) -> Tuple[List[Any], List[Any]]:
    """
    Partition data into a training part and a test part.

    Args:
        data: Data to split
        train_ratio: Ratio for training set
        shuffle: Whether to shuffle before split

    Returns:
        Tuple of (train_data, test_data); the training part holds the
        first ``int(len(data) * train_ratio)`` items
    """
    if not shuffle:
        ordered = data
    else:
        import random
        # Sampling every element yields a shuffled copy, so the caller's
        # list is left untouched.
        ordered = random.sample(data, len(data))

    cut = int(len(ordered) * train_ratio)
    train_part, test_part = ordered[:cut], ordered[cut:]

    return train_part, test_part


def create_dataframe(data: List[Dict]) -> pd.DataFrame:
    """
    Build a pandas DataFrame from a list of dictionaries.

    Args:
        data: List of dictionaries

    Returns:
        DataFrame constructed from ``data``
    """
    frame = pd.DataFrame(data=data)
    return frame

print("✓ Data processing utilities created")

## File I/O Utilities

In [None]:
def load_json(file_path: str) -> Dict:
    """
    Load JSON file.

    The file is decoded as UTF-8 rather than with the platform's default
    encoding, so non-ASCII content is read the same way on every system;
    this matches the explicit encoding used by the text-file helpers.

    Args:
        file_path: Path to JSON file

    Returns:
        Loaded data
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_json(data: Dict, file_path: str, pretty: bool = True) -> None:
    """
    Write data to a JSON file, creating parent directories as needed.

    Args:
        data: Data to save
        file_path: Path to save to
        pretty: Whether to pretty-print (two-space indent) or write the
            compact single-line form
    """
    target_dir = Path(file_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    # indent=None is json.dump's default, i.e. the compact form
    indent = 2 if pretty else None
    with open(file_path, 'w') as out:
        json.dump(data, out, indent=indent)


def load_pickle(file_path: str) -> Any:
    """
    Read one pickled object back from a file.

    Warning: unpickling can execute arbitrary code, so only load files
    that come from a trusted source.

    Args:
        file_path: Path to pickle file

    Returns:
        Loaded data
    """
    with open(file_path, 'rb') as source:
        restored = pickle.load(source)
    return restored


def save_pickle(data: Any, file_path: str) -> None:
    """
    Pickle an object to a file, creating parent directories as needed.

    Args:
        data: Data to save
        file_path: Path to save to
    """
    target_dir = Path(file_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(file_path, 'wb') as sink:
        pickle.dump(data, sink)


def load_text_file(file_path: str) -> str:
    """
    Read an entire UTF-8 text file into a string.

    Args:
        file_path: Path to text file

    Returns:
        File contents
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        contents = source.read()
    return contents


def save_text_file(text: str, file_path: str) -> None:
    """
    Write a string to a UTF-8 text file, creating parent directories as needed.

    Args:
        text: Text to save
        file_path: Path to save to
    """
    target_dir = Path(file_path).parent
    target_dir.mkdir(parents=True, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as sink:
        sink.write(text)

print("✓ File I/O utilities created")

## Text Similarity & Metrics

In [None]:
def calculate_similarity(text1: str, text2: str) -> float:
    """
    Score how alike two texts are, ignoring letter case.

    Args:
        text1: First text
        text2: Second text

    Returns:
        ``difflib.SequenceMatcher`` ratio of the lowercased texts,
        between 0 and 1
    """
    from difflib import SequenceMatcher

    left = text1.lower()
    right = text2.lower()
    matcher = SequenceMatcher(isjunk=None, a=left, b=right)

    return matcher.ratio()


def calculate_levenshtein_distance(s1: str, s2: str) -> int:
    """
    Compute the edit distance between two strings.

    Args:
        s1: First string
        s2: Second string

    Returns:
        Minimum number of single-character insertions, deletions and
        substitutions that turn one string into the other (0 = identical)
    """
    # Walk the longer string row by row so each row is as short as possible
    longer, shorter = (s1, s2) if len(s1) >= len(s2) else (s2, s1)

    if not shorter:
        return len(longer)

    # Only the previous row of the edit matrix is needed to build the next one
    prev_costs = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        costs = [row]
        for col, ch_short in enumerate(shorter, start=1):
            insert_cost = prev_costs[col] + 1
            delete_cost = costs[col - 1] + 1
            # A mismatch adds 1 (True), a match adds 0 (False)
            substitute_cost = prev_costs[col - 1] + (ch_long != ch_short)
            costs.append(min(insert_cost, delete_cost, substitute_cost))
        prev_costs = costs

    return prev_costs[-1]


def calculate_jaccard_similarity(set1: set, set2: set) -> float:
    """
    Compute the Jaccard index of two sets.

    Args:
        set1: First set
        set2: Second set

    Returns:
        Size of the intersection divided by size of the union (0-1);
        0.0 when both sets are empty
    """
    combined = set1 | set2
    if not combined:
        # An empty union would mean dividing by zero
        return 0.0

    shared = set1 & set2
    return len(shared) / len(combined)

print("✓ Text similarity utilities created")

## Testing

In [None]:
# Smoke tests for the utilities defined in the cells above.
# A failing assertion aborts the cell before the final success message.

# Test 1: Text cleaning
test_text = "  Hello  WORLD  !  "
cleaned = clean_text(test_text)
# clean_text already lowercases its output, so compare against it directly
assert "hello world" in cleaned
print(f"✓ clean_text: '{test_text}' → '{cleaned}'")

# Test 2: Keyword extraction
test_doc = "Python is a great programming language. Python is powerful."
keywords = extract_keywords(test_doc, num_keywords=5)
assert len(keywords) > 0
# "python" occurs twice and every other word once, so it must rank first
assert keywords[0] == "python"
print(f"✓ extract_keywords: {keywords}")

# Test 3: Sentence splitting
test_sentence = "Hello world. How are you? I am fine."
sentences = split_into_sentences(test_sentence)
assert len(sentences) == 3
print(f"✓ split_into_sentences: {len(sentences)} sentences")

# Test 4: Batching
data = list(range(10))
batches = list(batch_sequences(data, 3))
assert len(batches) == 4
# 10 items in batches of 3 leave a single-item final batch
assert batches[-1] == [9]
print(f"✓ batch_sequences: {len(batches)} batches from 10 items")

# Test 5: Padding
seqs = [[1, 2, 3], [1, 2]]
padded = pad_sequences(seqs, max_len=5, pad_value=0)
assert all(len(s) == 5 for s in padded)
assert padded == [[1, 2, 3, 0, 0], [1, 2, 0, 0, 0]]
print("✓ pad_sequences: Padded to length 5")

# Test 6: Train/Test split
data = list(range(100))
train, test = split_data(data, train_ratio=0.8)
assert len(train) == 80 and len(test) == 20
# Shuffling must neither lose nor duplicate items
assert sorted(train + test) == data
print(f"✓ split_data: {len(train)} train, {len(test)} test")

# Test 7: Similarity
sim = calculate_similarity("hello", "hello")
assert sim == 1.0
print(f"✓ calculate_similarity: 'hello' vs 'hello' = {sim:.2f}")

# Test 8: Levenshtein distance
dist = calculate_levenshtein_distance("kitten", "sitting")
assert dist == 3
print(f"✓ calculate_levenshtein_distance: 'kitten' vs 'sitting' = {dist}")

# Test 9: Jaccard similarity
set1 = {'a', 'b', 'c'}
set2 = {'b', 'c', 'd'}
jaccard = calculate_jaccard_similarity(set1, set2)
# Intersection {'b', 'c'} over union {'a', 'b', 'c', 'd'} = 2 / 4
assert jaccard == 0.5
print(f"✓ calculate_jaccard_similarity: {jaccard:.2f}")

print("\n✅ All utility functions tested successfully!")

## Export Utilities Module

In [None]:
# Package all utilities
# Nested dict: category name -> {function name -> function object}, covering
# every helper defined in the cells above.
utils_module = {
    'text_processing': {
        'clean_text': clean_text,
        'normalize_whitespace': normalize_whitespace,
        'extract_keywords': extract_keywords,
        'remove_duplicates': remove_duplicates,
        'split_into_sentences': split_into_sentences,
    },
    'data_processing': {
        'batch_sequences': batch_sequences,
        'pad_sequences': pad_sequences,
        'split_data': split_data,
        'create_dataframe': create_dataframe,
    },
    'file_io': {
        'load_json': load_json,
        'save_json': save_json,
        'load_pickle': load_pickle,
        'save_pickle': save_pickle,
        'load_text_file': load_text_file,
        'save_text_file': save_text_file,
    },
    'similarity': {
        'calculate_similarity': calculate_similarity,
        'calculate_levenshtein_distance': calculate_levenshtein_distance,
        'calculate_jaccard_similarity': calculate_jaccard_similarity,
    }
}

# Save to pickle for use in other notebooks
# NOTE(review): pickle serializes plain functions by reference (module plus
# qualified name), not by value, so unpickling presumably only succeeds in a
# session where these same functions are already defined or importable —
# verify that the consuming notebooks can actually load this file.
# NOTE(review): '/tmp' is a hard-coded path — confirm it exists and is shared
# in the target environment.
save_pickle(utils_module, '/tmp/utils_module.pkl')

print("\n✓ Utilities module exported to /tmp/utils_module.pkl")
# Count the functions across all categories
print(f"✓ Total functions: {sum(len(v) for v in utils_module.values())}")

## Summary

✅ **Notebook 3 Complete**

### Created Functions:
- **Text Processing**: 5 functions
- **Data Processing**: 4 functions
- **File I/O**: 6 functions
- **Similarity**: 3 functions

### Total: 18 utility functions

**Ready for use in Notebooks 4-7**