# Text Preprocessing for Keyword Extraction

This notebook cleans text files by removing URLs, symbols, and normalizing whitespace.

In [32]:
import re
import os

def preprocess_text(input_path, output_path):
    if not os.path.exists(input_path):
        print(f"Error: File not found at {input_path}")
        return

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # URL 제거
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

        # 탭, 줄바꿈 제거
        text = text.replace('\n', '').replace('\t', ' ')
        
        # 중복 공백 제거
        text = re.sub(r'\s+', ' ', text).strip()

        # 기호 제거 (한국어, 영어, 숫자, 공백, 점 제외)
        text = re.sub(r'[^a-zA-Z0-9가-힣\s]', ' ', text)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        print(f"Successfully processed {len(text)} characters.")
        print(f"Saved cleaned data to {output_path}")
            
    except Exception as e:
        print(f"An error occurred: {e}")

In [33]:
base_dir = os.getcwd()
input_file = os.path.join(base_dir, 'Crawled_Data', 'jeju_utd.txt')
output_file = os.path.join(base_dir, 'Crawled_Data', 'jeju_utd_cleaned.txt')

print(f"Processing {input_file}...")
preprocess_text(input_file, output_file)

Processing c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd.txt...
Successfully processed 11667 characters.
Saved cleaned data to c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd_cleaned.txt


In [34]:
import re
import os

def preprocess_text(input_path, output_path):
    if not os.path.exists(input_path):
        print(f"Error: File not found at {input_path}")
        return

    try:
        with open(input_path, 'r', encoding='utf-8') as f:
            text = f.read()
        # URL 제거
        text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

        # 탭, 줄바꿈 제거
        text = text.replace('\n', '').replace('\t', ' ')
        
        # 중복 공백 제거
        text = re.sub(r'\s+', ' ', text).strip()

        # 기호 제거 (한국어, 영어, 숫자, 공백, 점 제외)
        text = re.sub(r'[^a-zA-Z0-9가-힣\s\.]', ' ', text)

        text = text.replace('.', '.\n')

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(text)
        
        print(f"Successfully processed {len(text)} characters.")
        print(f"Saved cleaned data to {output_path}")
            
    except Exception as e:
        print(f"An error occurred: {e}")

In [35]:
base_dir = os.getcwd()
input_file = os.path.join(base_dir, 'Crawled_Data', 'jeju_utd.txt')
output_file = os.path.join(base_dir, 'Crawled_Data', 'jeju_utd_cleaned2.txt')

print(f"Processing {input_file}...")
preprocess_text(input_file, output_file)

Processing c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd.txt...
Successfully processed 11888 characters.
Saved cleaned data to c:\Project\2025_Sports_Chatbot\My_Sandbox\F1_Data-Preprocessing\Crawled_Data\jeju_utd_cleaned2.txt
