# Stage 1: EDA & Preprocessing

한국어 대화 요약 데이터셋에 대한 탐색적 데이터 분석과 전처리를 수행합니다.

## 목표
1. 데이터셋 로딩 및 기본 통계 확인
2. 텍스트 노이즈 분석 및 정제
3. 특수 토큰(PII 마스킹 패턴) 추출 및 저장
4. 데이터 시각화
5. 전처리된 데이터 저장

## 📋 Config

In [None]:
# Notebook-wide settings: random seed, directory layout, and the CSV file
# name of each data split.
CONFIG = dict(
    seed=42,
    data_dir="../data",
    output_dir="../data/processed",
    config_dir="../configs",
    train_file="train.csv",
    dev_file="dev.csv",
    test_file="test.csv",
)

## 🔧 Setup

In [None]:
import os
import re
import json
from pathlib import Path
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from tqdm.auto import tqdm

# Custom utilities
from utils import set_seed, clean_dialogue, extract_special_tokens

# Set seed for reproducibility
set_seed(CONFIG["seed"])

# Create output directories
os.makedirs(CONFIG["output_dir"], exist_ok=True)
os.makedirs(CONFIG["config_dir"], exist_ok=True)

# Plot settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

## 📊 Data Loading

In [None]:
# Read the three splits from CONFIG["data_dir"]
def _load_split(file_key):
    """Read the CSV whose file name is stored under CONFIG[file_key]."""
    return pd.read_csv(os.path.join(CONFIG["data_dir"], CONFIG[file_key]))


train_df = _load_split("train_file")
dev_df = _load_split("dev_file")
test_df = _load_split("test_file")

# Report the number of rows in each split
print("✅ Data loaded successfully")
for split_label, split_frame in (("Train", train_df), ("Dev", dev_df), ("Test", test_df)):
    print(f"📊 {split_label}: {len(split_frame):,} samples")

In [None]:
# Show the head of the training frame, then its column names and dtypes
print("\n🔍 Train Dataset Preview:")
preview_rows = train_df.head()
display(preview_rows)

column_names = train_df.columns.tolist()
print("\n📋 Columns:", column_names)
print("\n📊 Data Types:", train_df.dtypes, sep="\n")

## 🔬 Basic Statistics

In [None]:
def analyze_text_statistics(df, name="Dataset"):
    """Compute character-length statistics for dialogues and, if present, summaries.

    Adds four columns to ``df`` in place: ``dialogue_len`` and ``summary_len``
    (character counts), then ``dialogue_words`` and ``summary_words``
    (whitespace-separated token counts). When ``df`` has no ``summary``
    column, the two summary columns are filled with 0.

    Args:
        df: DataFrame with a ``dialogue`` column and optionally a ``summary``
            column.
        name: Label for the dataset; not used by the function body.

    Returns:
        Dict with a ``dialogue_len`` entry and, when ``df`` has a ``summary``
        column, a ``summary_len`` entry. Each entry maps ``mean``, ``std``,
        ``min``, ``max`` and ``median`` to the value for that length column.
    """
    def _describe(column):
        # Summary statistics of one numeric column of df.
        values = df[column]
        return {
            'mean': values.mean(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            'median': values.median(),
        }

    has_summary = 'summary' in df.columns

    # Character counts
    df['dialogue_len'] = df['dialogue'].str.len()
    if has_summary:
        df['summary_len'] = df['summary'].str.len()
    else:
        df['summary_len'] = 0

    # Whitespace-separated word counts
    df['dialogue_words'] = df['dialogue'].str.split().str.len()
    if has_summary:
        df['summary_words'] = df['summary'].str.split().str.len()
    else:
        df['summary_words'] = 0

    stats = {'dialogue_len': _describe('dialogue_len')}
    if has_summary:
        stats['summary_len'] = _describe('summary_len')
    return stats

# Compute the length statistics for the training split and print them
train_stats = analyze_text_statistics(train_df, "Train")
print("📊 Train Statistics:")

for section_title, stats_key in (("Dialogue", 'dialogue_len'), ("Summary", 'summary_len')):
    print(f"\n{section_title} Length (chars):")
    for stat_name, stat_value in train_stats[stats_key].items():
        print(f"  {stat_name}: {stat_value:.1f}")

In [None]:
# Count of missing cells in each column
print("🔍 Missing Values:")
missing_per_column = train_df.isnull().sum()
print(missing_per_column)

# Number of rows whose dialogue / summary text repeats an earlier row
for line_prefix, label, column in (("\n", "Dialogues", 'dialogue'), ("", "Summaries", 'summary')):
    duplicate_total = train_df[column].duplicated().sum()
    print(f"{line_prefix}🔍 Duplicate {label}: {duplicate_total}")

## 🧹 Noise Analysis

In [None]:
def analyze_noise(text_series):
    """Count how many texts contain each known noise pattern.

    Args:
        text_series: pandas Series of strings. Missing values are treated as
            containing no noise but still count toward the denominator.

    Returns:
        Dict mapping each pattern name to ``{'count': int, 'percentage': float}``.
        ``percentage`` is relative to ``len(text_series)`` and is 0.0 for an
        empty series.
    """
    patterns = {
        # A literal backslash followed by "n", i.e. a newline that ended up
        # escaped inside the text. (The previous pattern required two
        # backslashes; this one also matches that case.)
        'escaped_newlines': r'\\n',
        'html_br': r'<br>|<br/>|<br />',
        # Any character followed by three or more copies of itself (four or
        # more in a row). The capture group is required by the \1
        # backreference; pandas may emit a harmless UserWarning about match
        # groups for it.
        'repeated_chars': r'(.)\1{3,}',
        'excessive_spaces': r'\s{2,}',
        'informal_tokens': r'ㅋ+|ㅎ+|ㅇㅇ',
    }

    total = len(text_series)
    results = {}
    for name, pattern in patterns.items():
        # na=False: a missing text cannot match, and keeps the mask boolean.
        matches = text_series.str.contains(pattern, regex=True, na=False)
        count = int(matches.sum())
        # Guard the empty-series case instead of dividing by zero.
        percentage = (count / total) * 100 if total else 0.0
        results[name] = {'count': count, 'percentage': percentage}

    return results

# Print, for each noise pattern, how many training dialogues contain it
print("🔍 Noise Analysis - Dialogues:")
noise_report = analyze_noise(train_df['dialogue'])
for pattern_name, pattern_stats in noise_report.items():
    hits = pattern_stats['count']
    share = pattern_stats['percentage']
    print(f"  {pattern_name}: {hits} ({share:.2f}%)")

In [None]:
# Show up to two dialogues that contain an escaped newline or a <br> tag,
# before and after clean_dialogue is applied.
print("\n📝 Sample Noisy Dialogue:")
# r'\\n' matches a literal backslash followed by "n" (the earlier pattern
# required two backslashes). na=False keeps the mask strictly boolean so that
# missing dialogues cannot break the row selection.
noisy_mask = train_df['dialogue'].str.contains(r'\\n|<br>', regex=True, na=False)
noisy_samples = train_df[noisy_mask].head(2)
for _, row in noisy_samples.iterrows():
    print("\nBefore cleaning:")
    print(row['dialogue'][:200], "...")
    print("\nAfter cleaning:")
    print(clean_dialogue(row['dialogue'])[:200], "...")

## 🏷️ Special Token Extraction

In [None]:
# Collect and count special tokens in a single pass, so that
# extract_special_tokens runs once per dialogue instead of twice.
# NOTE(review): assumes extract_special_tokens returns the same tokens for the
# same text on every call — confirm against utils.
token_counts = Counter()
for text in tqdm(train_df['dialogue'], desc="Extracting special tokens"):
    token_counts.update(extract_special_tokens(text))

# The distinct tokens are exactly the keys of the counter
all_tokens = set(token_counts)

# Sort tokens
special_tokens = sorted(all_tokens)

print(f"✅ Found {len(special_tokens)} unique special tokens:")
print(special_tokens)

print("\n📊 Top 10 Most Common Tokens:")
for token, count in token_counts.most_common(10):
    print(f"  {token}: {count:,}")

In [None]:
# Write the sorted token list and the per-token counts (most common first)
# to special_tokens.json in the config directory
config_path = os.path.join(CONFIG["config_dir"], "special_tokens.json")

special_tokens_config = dict(
    additional_special_tokens=special_tokens,
    token_counts=dict(token_counts.most_common()),
)

with open(config_path, mode='w', encoding='utf-8') as config_file:
    # ensure_ascii=False keeps non-ASCII tokens readable in the file
    json.dump(special_tokens_config, config_file, ensure_ascii=False, indent=2)

print(f"✅ Special tokens saved to {config_path}")

## 📊 Text Preprocessing

In [None]:
# Run clean_dialogue over the text columns of every split.
# Train and dev must have a 'summary' column; for test it is optional and
# simply skipped when absent.
cleaning_plan = (
    ("train", train_df, False),
    ("dev", dev_df, False),
    ("test", test_df, True),
)
for split_name, split_frame, summary_optional in cleaning_plan:
    print(f"🧹 Cleaning {split_name} dataset...")
    split_frame['dialogue_clean'] = split_frame['dialogue'].apply(clean_dialogue)
    if summary_optional and 'summary' not in split_frame.columns:
        continue
    split_frame['summary_clean'] = split_frame['summary'].apply(clean_dialogue)

print("✅ Text cleaning completed!")

## 📈 Visualization

In [None]:
# 2x2 grid of histograms over the train length columns:
# character counts on the top row, word counts on the bottom row.
histogram_panels = (
    ('dialogue_len', 'Dialogue Length Distribution (Before Cleaning)', 'Characters'),
    ('summary_len', 'Summary Length Distribution (Before Cleaning)', 'Characters'),
    ('dialogue_words', 'Dialogue Word Count Distribution', 'Words'),
    ('summary_words', 'Summary Word Count Distribution', 'Words'),
)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# axes.flat walks the grid row by row, matching the order of the panels above
for panel, (column, title, x_label) in zip(axes.flat, histogram_panels):
    panel.hist(train_df[column], bins=50, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Word clouds of the cleaned dialogues and summaries, side by side
print("📊 Generating word clouds...")

# Only the first 1000 rows are joined, to keep generation fast
sample_size = 1000
all_dialogues = ' '.join(train_df['dialogue_clean'].head(sample_size))
all_summaries = ' '.join(train_df['summary_clean'].head(sample_size))

# Settings shared by both clouds, including the NanumGothic font file
cloud_kwargs = dict(
    width=800,
    height=400,
    background_color='white',
    font_path='/usr/share/fonts/truetype/nanum/NanumGothic.ttf',
)


def _draw_cloud(axis, cloud, title):
    """Render one word cloud on the given axis, with a title and no ticks."""
    axis.imshow(cloud, interpolation='bilinear')
    axis.set_title(title, fontsize=16)
    axis.axis('off')


fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Left panel: dialogues
wc_dialogue = WordCloud(**cloud_kwargs).generate(all_dialogues)
_draw_cloud(axes[0], wc_dialogue, 'Dialogue Word Cloud')

# Right panel: summaries
wc_summary = WordCloud(**cloud_kwargs).generate(all_summaries)
_draw_cloud(axes[1], wc_summary, 'Summary Word Cloud')

plt.tight_layout()
plt.show()

print("✅ Word clouds generated!")

## 💾 Save Processed Data

In [None]:
# Write each split to its own CSV in the output directory, then report the
# row counts
processed_dir = CONFIG["output_dir"]
train_output = os.path.join(processed_dir, "train_processed.csv")
dev_output = os.path.join(processed_dir, "dev_processed.csv")
test_output = os.path.join(processed_dir, "test_processed.csv")

saved_splits = (
    ("train_processed.csv", train_df, train_output),
    ("dev_processed.csv", dev_df, dev_output),
    ("test_processed.csv", test_df, test_output),
)

# index=False: the row index is not written as a column
for _, split_frame, destination in saved_splits:
    split_frame.to_csv(destination, index=False)

print(f"✅ Processed data saved to {processed_dir}/")
for file_name, split_frame, _ in saved_splits:
    print(f"  - {file_name}: {len(split_frame):,} rows")

## 📋 Summary

**완료된 작업**:
- ✅ 데이터셋 로딩 및 기본 통계 분석
- ✅ 텍스트 노이즈 분석 및 정제
- ✅ 특수 토큰 추출 및 저장
- ✅ 데이터 시각화 (히스토그램, 워드클라우드)
- ✅ 전처리된 데이터 저장

**다음 단계**: Stage 2 - Baseline Training