# In [None]:
# EthioMart/notebooks/data_preprocessing_eda.ipynb

# --- Section 1: Setup and Configuration ---

# 1.1 Import necessary libraries
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the project root importable so that `config.config` can be resolved.
# NOTE: this assumes the notebook is run from EthioMart/notebooks/, i.e. the
# parent of the current working directory is the project root.
project_root = Path.cwd().parent
# Only register the path once: re-running this cell must not keep stacking
# duplicate entries onto sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import configuration variables
try:
    from config.config import DATA_DIR
except ImportError:
    print("Error: Could not import configuration. "
          "Please ensure EthioMart/config/config.py exists and is correctly configured.")
    # Fallback for local testing if the config import fails: the raw data
    # directory, expressed relative to the notebooks folder.
    DATA_DIR = Path("../data/raw")


# Path to the cleaned data CSV: a "processed" directory that sits next to
# DATA_DIR (expected to resolve to EthioMart/data/processed/clean_telegram_data.csv).
# Path(...) makes this work whether config exposes DATA_DIR as a Path or a str.
CLEANED_CSV_PATH = Path(DATA_DIR).parent / "processed" / "clean_telegram_data.csv"


# --- Section 2: Load and Inspect Cleaned Data ---

# Column layout used for the placeholder frame whenever the CSV cannot be
# loaded, so that later cells can still reference these columns.
_CLEAN_COLUMNS = [
    'channel_title', 'message_id', 'date', 'text',
    'views', 'reactions_count', 'image_path', 'preprocessed_text',
]


def _empty_clean_frame():
    """Return a zero-row DataFrame that carries the expected column names."""
    return pd.DataFrame(columns=list(_CLEAN_COLUMNS))


print(f"Loading cleaned data from: {CLEANED_CSV_PATH}")

# Every failure path below leaves `df_clean` as an empty, correctly-shaped
# frame instead of raising, so the rest of the notebook can still execute.
if CLEANED_CSV_PATH.exists():
    try:
        df_clean = pd.read_csv(CLEANED_CSV_PATH, encoding='utf-8')
    except pd.errors.EmptyDataError:
        # The file exists but has no parsable content.
        print(f"Warning: Cleaned data CSV at {CLEANED_CSV_PATH} is empty. No data to analyze.")
        df_clean = _empty_clean_frame()
    except Exception as exc:
        print(f"Error loading cleaned CSV: {exc}")
        df_clean = _empty_clean_frame()
    else:
        print(f"Successfully loaded {len(df_clean)} cleaned messages.")
else:
    print(f"Error: Cleaned data CSV not found at {CLEANED_CSV_PATH}. "
          "Please ensure preprocessor.py has been run successfully.")
    df_clean = _empty_clean_frame()


# Structural overview: column names, dtypes and non-null counts.
print("\n--- Cleaned Data DataFrame Info ---")
df_clean.info()

# Side-by-side preview of the raw and the preprocessed text columns; only
# attempted when there are rows and both columns are present.
print("\n--- First 5 Rows of Cleaned Data (Original vs. Preprocessed) ---")
_preview_columns = ['text', 'preprocessed_text']
if df_clean.empty or not set(_preview_columns).issubset(df_clean.columns):
    print("DataFrame is empty or missing 'text'/'preprocessed_text' columns. Cannot display head.")
else:
    print(df_clean[_preview_columns].head())


# --- Section 3: Exploratory Data Analysis (EDA) on Cleaned Data ---

if not df_clean.empty and 'preprocessed_text' in df_clean.columns:
    from collections import Counter
    import itertools

    print("\n--- Exploratory Data Analysis on Cleaned Data ---")

    # Empty CSV cells are loaded by pandas as NaN. Calling `astype(str)` on
    # them directly yields the 3-character string 'nan', which would hide
    # every empty message, distort the length statistics and inject a bogus
    # 'nan' token into the word counts. Normalise missing values to '' first.
    preprocessed_text = df_clean['preprocessed_text'].fillna('').astype(str)
    df_clean['preprocessed_text_length'] = preprocessed_text.str.len()

    # The original-text comparisons (3.1 and 3.3) need the 'text' column.
    has_original_text = 'text' in df_clean.columns
    if has_original_text:
        original_text = df_clean['text'].fillna('').astype(str)
    else:
        print("\nColumn 'text' not found; skipping original-vs-preprocessed comparisons.")

    # 3.1 Check for empty preprocessed messages
    # Messages that were originally empty stay empty after preprocessing
    # (expected). The interesting ones had original text but were cleaned
    # down to nothing.
    if has_original_text:
        messages_turned_empty = df_clean[
            (original_text != '') &
            (df_clean['preprocessed_text_length'] == 0)
        ]

        print(f"\nNumber of messages with original text that became empty after cleaning: {len(messages_turned_empty)}")
        if not messages_turned_empty.empty:
            print("Sample original texts that became empty:")
            print(messages_turned_empty['text'].head().tolist())

    # 3.2 Distribution of preprocessed message length (character count)
    print("\nPreprocessed Message Text Length Statistics:")
    print(df_clean['preprocessed_text_length'].describe())

    plt.figure(figsize=(10, 6))
    sns.histplot(df_clean['preprocessed_text_length'], bins=50, kde=True, color='purple')
    plt.title('Distribution of Preprocessed Message Text Lengths')
    plt.xlabel('Preprocessed Text Length (characters)')
    plt.ylabel('Number of Messages')
    plt.show()

    # 3.3 Compare original vs. preprocessed text lengths
    if has_original_text:
        df_clean['original_text_length'] = original_text.str.len()
        print("\nText Length Comparison (Original vs. Preprocessed):")
        print(df_clean[['original_text_length', 'preprocessed_text_length']].describe())

        # Both histograms share one axes so the shift caused by cleaning is visible.
        plt.figure(figsize=(12, 6))
        sns.histplot(df_clean['original_text_length'], color='blue', label='Original Length', kde=True, alpha=0.5)
        sns.histplot(df_clean['preprocessed_text_length'], color='red', label='Preprocessed Length', kde=True, alpha=0.5)
        plt.title('Distribution of Original vs. Preprocessed Text Lengths')
        plt.xlabel('Text Length (characters)')
        plt.ylabel('Number of Messages')
        plt.legend()
        plt.show()

    # 3.4 Top N most frequent words in preprocessed text
    # Basic whitespace tokenization, for visualization only. Built from the
    # NaN-normalised series so missing messages contribute no tokens.
    all_words = list(itertools.chain.from_iterable(preprocessed_text.str.split()))
    word_counts = Counter(all_words)

    print("\nTop 20 Most Frequent Words in Preprocessed Text:")
    top_words = word_counts.most_common(20)
    for word, count in top_words:
        print(f"- {word}: {count}")

    # Visualize top words (skipped when no tokens were found at all)
    if top_words:
        words, counts = zip(*top_words)
        plt.figure(figsize=(12, 7))
        sns.barplot(x=list(counts), y=list(words), palette='cubehelix')
        plt.title('Top 20 Most Frequent Words in Preprocessed Text')
        plt.xlabel('Frequency')
        plt.ylabel('Word')
        plt.tight_layout()
        plt.show()

else:
    print("\nNo cleaned data available for EDA. Please ensure preprocessor.py ran successfully.")


# --- Section 4: Visual Inspection of Preprocessing Examples ---

print("\n--- Visual Inspection of Preprocessing Examples ---")
# Show a handful of real rows so the effect of cleaning can be checked by eye.
_inspection_columns = ['text', 'preprocessed_text']
_missing_columns = [col for col in _inspection_columns if col not in df_clean.columns]

if df_clean.empty:
    print("No data loaded to display visual inspection examples.")
elif _missing_columns:
    # Same defensive stance as the earlier sections: report, do not raise.
    print(f"Cannot display examples; missing column(s): {_missing_columns}")
else:
    # Carry 'message_id' along in the sample itself (when available) rather
    # than looking it up again in df_clean for every row by index label.
    _sample_columns = list(_inspection_columns)
    if 'message_id' in df_clean.columns:
        _sample_columns.insert(0, 'message_id')

    # Fixed random_state keeps the selected rows reproducible between runs.
    sample_df = df_clean[_sample_columns].sample(min(5, len(df_clean)), random_state=42)
    for index, row in sample_df.iterrows():
        # Fall back to the DataFrame index label if there is no message_id column.
        message_id = row.get('message_id', index)
        print(f"\n--- Sample Message ID: {message_id} ---")
        print("Original:")
        print(row['text'])
        print("\nCleaned:")
        print(row['preprocessed_text'])


# --- Section 5: Next Steps Summary ---

# Closing recap: what this notebook checked, followed by the next pipeline
# steps. The text is kept as data and emitted one line per print call.
_summary_lines = (
    "\n--- Summary of Preprocessing EDA and Next Steps ---",
    "This EDA helped us to:",
    "- Verify the loading of the 'clean_telegram_data.csv'.",
    "- Analyze the distribution of preprocessed text lengths and observe the impact of cleaning.",
    "- Identify common words and patterns in the cleaned data.",
    "\nNext, we will proceed with the crucial step of:",
    "1. **Data Labeling (`src/labeling.py`)**: Converting your existing 'labeled_telegram_product_price_location.txt' into the CoNLL format, and preparing it for NER model training.",
    "2. **Splitting Data**: Dividing the labeled data into training, validation, and test sets.",
    "This will complete the data preparation for the NER task.",
)
for _summary_line in _summary_lines:
    print(_summary_line)

