# Text Preprocessing for Keyword Extraction

This notebook cleans text files by removing URLs and symbols and by normalizing whitespace.

In [None]:
import os
import re
import unicodedata

# Patterns are compiled once at definition time instead of on every call.
# URL schemes and the "www." prefix are matched case-insensitively so that
# "HTTP://..." or "WWW...." are removed as well.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+', re.IGNORECASE)
# Anything that is not an ASCII letter, a digit, a precomposed Hangul
# syllable, or whitespace counts as a symbol.
_SYMBOL_PATTERN = re.compile(r'[^a-zA-Z0-9가-힣\s]')
_WHITESPACE_PATTERN = re.compile(r'\s+')


def _clean_text(text):
    """Return `text` with URLs and symbols removed and whitespace collapsed."""
    # The symbol filter only recognises precomposed Hangul syllables.
    # Text stored in decomposed form (NFD, conjoining jamo) would otherwise
    # be wiped out entirely, so compose it first. As a side effect, Latin
    # letters followed by combining marks are treated the same way as their
    # precomposed equivalents.
    text = unicodedata.normalize('NFC', text)

    # URLs must go before symbol stripping; otherwise their punctuation is
    # removed and the remaining fragments survive as words.
    text = _URL_PATTERN.sub(' ', text)
    text = _SYMBOL_PATTERN.sub(' ', text)

    # Tabs and newlines are whitespace, so this single pass both replaces
    # them and collapses runs of spaces.
    return _WHITESPACE_PATTERN.sub(' ', text).strip()


def preprocess_text(input_path, output_path):
    """
    Clean a UTF-8 text file and write the result to another file.

    The cleaned text keeps only English letters, digits, Korean (Hangul
    syllables) and single spaces; URLs and all other symbols are removed.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path the cleaned text is written to (UTF-8). An
            existing file at this path is overwritten.

    Returns:
        None. Progress and errors are reported with print(); a missing
        input file or an I/O / decoding failure does not raise.
    """
    if not os.path.exists(input_path):
        print(f"Error: File not found at {input_path}")
        return

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = _clean_text(f.read())

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)

        print(f"Successfully processed {len(text)} characters.")
        print(f"Saved cleaned data to {output_path}")

    except (OSError, ValueError) as e:
        # OSError covers file-system failures; ValueError covers
        # UnicodeDecodeError for input that is not valid UTF-8.
        print(f"An error occurred: {e}")

In [2]:
# Run the cleaning step: both files live in the Crawled_Data folder under
# the current working directory.
base_dir = os.getcwd()
input_file, output_file = (
    os.path.join(base_dir, 'Crawled_Data', filename)
    for filename in ('jeju_utd.txt', 'jeju_utd_cleaned.txt')
)

print(f"Processing {input_file}...")
preprocess_text(input_file, output_file)

Processing c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd.txt...
Successfully processed 73443 characters.
Saved cleaned data to c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd_cleaned.txt
