In [None]:
import os
import re
from pathlib import Path

In [16]:
def process_years_files(start_year=2002, end_year=2010, input_dir=None, output_dir=None):
    """
    Process the per-year text files, writing cleaned articles into one
    folder per year.

    For every year in the inclusive range, a file named "<year>.txt" is
    looked up in input_dir. Years without such a file are reported and
    skipped. Each file found is passed to process_year_file together with
    a "<year>" subfolder of output_dir, which is created if needed.

    Args:
        start_year: First year to process (inclusive).
        end_year: Last year to process (inclusive).
        input_dir: Folder holding the "<year>.txt" files. When None, the
            folder "NewYorkTimes/2002-2010" under the parent of the current
            working directory is used.
        output_dir: Folder that receives the per-year subfolders. When None,
            "processed_files" under the parent of the current working
            directory is used.

    Returns:
        None. Progress is reported with print().
    """
    print(f"Processing files from {start_year} to {end_year}...")

    # Both defaults are resolved relative to the parent of the working directory.
    project_root = Path.cwd().parent
    if input_dir is None:
        source_dir = project_root / "NewYorkTimes" / "2002-2010"
    else:
        source_dir = Path(input_dir)
    if output_dir is None:
        base_dir = project_root / "processed_files"
    else:
        base_dir = Path(output_dir)

    # Create base directory for processed files
    base_dir.mkdir(parents=True, exist_ok=True)

    for year in range(start_year, end_year + 1):
        input_file = source_dir / f"{year}.txt"

        # Skip years with no regular file; is_file() also rejects a directory
        # that happens to carry the expected name, which open() could not read.
        if not input_file.is_file():
            print(f"File {input_file} not found. Skipping year {year}.")
            continue

        # Create year-specific directory
        year_dir = base_dir / str(year)
        year_dir.mkdir(exist_ok=True)

        # Process the file
        process_year_file(input_file, year_dir)

    print(f"Processing complete. Results stored in {base_dir}/")

def process_year_file(input_file, output_dir):
    """
    Split one year's text file into articles and write each kept article
    to its own file inside output_dir.

    The file is read as UTF-8, falling back to Latin-1 if decoding fails.
    The text is split wherever five or more consecutive dashes occur. Each
    non-blank piece is passed through clean_article, and pieces with fewer
    than 20 whitespace-separated words after cleaning are dropped. Kept
    articles are written as "article_<n>.txt", where n counts kept articles
    starting at 1.

    Args:
        input_file: Path of the year file to read.
        output_dir: Existing directory that receives the article files.
    """
    print(f"Processing {input_file}...")

    # Decode as UTF-8 first; Latin-1 is the fallback for files that are not valid UTF-8.
    try:
        with open(input_file, "r", encoding="utf-8") as handle:
            raw_text = handle.read()
    except UnicodeDecodeError:
        with open(input_file, "r", encoding="latin-1") as handle:
            raw_text = handle.read()

    kept = 0
    # A run of 5 or more dashes marks the boundary between two articles.
    for chunk in re.split(r'-{5,}', raw_text):
        if chunk.strip():
            body = clean_article(chunk)
            # Only articles with at least 20 words after cleaning are saved.
            if len(body.split()) >= 20:
                kept += 1
                target = os.path.join(output_dir, f"article_{kept}.txt")
                with open(target, "w", encoding="utf-8") as sink:
                    sink.write(body)

    print(f"  Extracted {kept} articles to {output_dir}/")

def clean_article(text, keep_paragraphs=False):
    """
    Clean an individual article by removing headers and standardizing
    whitespace.

    Two kinds of markup are removed first: any span that starts and ends
    with a run of 10 or more '=' characters (e.g. ===== AFG-AUL ===== style
    headers), and any run of 5 or more '*' characters.

    Args:
        text: Raw article text.
        keep_paragraphs: When False (the default), every run of whitespace,
            newlines included, is collapsed to a single space, so the result
            is a single line. When True, paragraphs separated by one or more
            blank lines are kept apart by exactly one empty line, and only
            the whitespace inside each paragraph is collapsed.

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # Remove header patterns (e.g., ===== AFG-AUL ===== patterns)
    text = re.sub(r'={10,}[\s\S]*?={10,}', '', text)

    # Remove asterisk sections (e.g., *****)
    text = re.sub(r'\*{5,}', '', text)

    if keep_paragraphs:
        # Split on blank lines, squeeze whitespace inside each paragraph, and
        # drop paragraphs left empty by the removals above.
        paragraphs = (
            ' '.join(block.split()) for block in re.split(r'\n\s*\n', text)
        )
        return '\n\n'.join(block for block in paragraphs if block)

    # Collapsing all whitespace (newlines included) makes any earlier
    # paragraph-break normalization redundant, so there is none on this path.
    text = re.sub(r'\s+', ' ', text)

    # Remove leading/trailing whitespace
    return text.strip()

In [17]:
# Run the whole pipeline with the default year range (2002 through 2010).
process_years_files()

Processing files from 2002 to 2010...
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2002.txt...
  Extracted 16263 articles to /Users/daksh/Downloads/GSOC/processed_files/2002/
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2003.txt...
  Extracted 13062 articles to /Users/daksh/Downloads/GSOC/processed_files/2003/
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2004.txt...
  Extracted 13730 articles to /Users/daksh/Downloads/GSOC/processed_files/2004/
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2005.txt...
  Extracted 8177 articles to /Users/daksh/Downloads/GSOC/processed_files/2005/
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2006.txt...
  Extracted 14899 articles to /Users/daksh/Downloads/GSOC/processed_files/2006/
Processing /Users/daksh/Downloads/GSOC/NewYorkTimes/2002-2010/2007.txt...
  Extracted 12025 articles to /Users/daksh/Downloads/GSOC/processed_files/2007/
Processing /Users/daksh/Downloads/GSOC/