# Data Exploration Notebook

This notebook explores the raw data acquired from the AFNC website to understand its structure and characteristics.

In [None]:
# Import required libraries
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wordcloud import WordCloud

# Set up plotting style
# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure created below.
plt.style.use('seaborn-v0_8')
# Switch seaborn's colour cycle to the "husl" palette.
# NOTE(review): presumably this must stay after plt.style.use so the style sheet
# does not override the palette — confirm before reordering.
sns.set_palette("husl")
# IPython magic (not plain Python): render figures inline under each cell.
%matplotlib inline

In [None]:
# Read the raw news CSV. The following cells only run when `df` was created
# here, so a missing file is reported instead of raised.
try:
    df = pd.read_csv('../data/raw/news.csv')
except FileNotFoundError:
    print("Raw data file not found. Please run the data acquisition notebook first.")
else:
    # Quick overview of what was loaded: (rows, columns) and the column names
    print(f"Dataset shape: {df.shape}")
    print("\nColumn names:", df.columns.tolist(), sep="\n")

In [None]:
# Display basic information about the dataset
# (`df` only exists when the load cell found the raw CSV, hence the guard)
if 'df' in locals():
    print("\nDataset info:")
    # DataFrame.info() writes its summary to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()

    print("\nFirst 5 rows:")
    display(df.head())

    print("\nBasic statistics:")
    # include='all' summarises non-numeric columns as well as numeric ones
    display(df.describe(include='all'))

In [None]:
# Class balance: count how many rows carry each label and chart the counts
# (`df` only exists when the load cell found the raw CSV, hence the guard)
if 'df' in locals():
    print("Label distribution:")
    label_counts = df['label'].value_counts()
    print(label_counts)

    # One bar per label, bar height = number of rows with that label
    plt.figure(figsize=(8, 6))
    sns.barplot(
        x=label_counts.index,
        y=label_counts.values,
    )
    plt.ylabel('Count')
    plt.xlabel('Label')
    plt.title('Distribution of News Labels')
    plt.show()

In [None]:
# Analyze headline length
# (`df` only exists when the load cell found the raw CSV, hence the guard)
if 'df' in locals():
    headlines = df['headline']
    # Length in characters of each headline. Missing headlines stay missing:
    # astype(str) alone would turn NaN into the literal string 'nan' and
    # report a bogus length of 3 for them, skewing every statistic below.
    df['headline_length'] = (
        headlines.astype(str).str.len().where(headlines.notna())
    )

    print("Headline length statistics:")
    # describe() ignores NaN, so missing headlines do not affect the summary
    print(df['headline_length'].describe())

    # Plot headline length distribution (missing lengths dropped explicitly)
    plt.figure(figsize=(10, 6))
    plt.hist(df['headline_length'].dropna(), bins=30, alpha=0.7)
    plt.title('Distribution of Headline Lengths')
    plt.xlabel('Headline Length (characters)')
    plt.ylabel('Frequency')
    plt.show()

    # Compare headline lengths by label (rows without a length are excluded)
    plt.figure(figsize=(10, 6))
    sns.boxplot(
        data=df.dropna(subset=['headline_length']),
        x='label',
        y='headline_length',
    )
    plt.title('Headline Length by Label')
    plt.xlabel('Label')
    plt.ylabel('Headline Length (characters)')
    plt.show()

In [None]:
# Text analysis

# Font files to try for rendering the word clouds, in order; the first one that
# exists on this machine is used. Extend the tuple with a font path that is
# valid on your system if none of these is present.
THAI_FONT_CANDIDATES = (
    '/System/Library/Fonts/Thonburi.ttc',
)

# Token pattern handed to WordCloud: one token = one unbroken run of characters
# from the Thai Unicode block. WordCloud's default pattern is built on `\w`,
# which does not match Thai combining vowel/tone marks and therefore breaks
# words apart at every such mark.
THAI_TOKEN_PATTERN = r'[\u0E00-\u0E7F]+'


def clean_text_for_analysis(text):
    """Reduce a headline to Thai characters separated by single spaces.

    Args:
        text: The raw headline; any value accepted by ``pd.isna``/``str``.

    Returns:
        str: ``""`` when ``text`` is missing; otherwise the text with every
        character outside the Thai Unicode block (U+0E00-U+0E7F) and
        whitespace removed, whitespace runs collapsed to one space, and
        leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    # Remove special characters and digits (anything that is not Thai/whitespace)
    text = re.sub(r'[^\u0E00-\u0E7F\s]', '', str(text))
    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# (`df` only exists when the load cell found the raw CSV, hence the guard)
if 'df' in locals():
    # Clean headlines for analysis
    df['cleaned_headline'] = df['headline'].apply(clean_text_for_analysis)

    # Pick the font by checking that the file really exists. Testing
    # os.name == 'posix' is not enough: it is also true on Linux, where the
    # macOS font path is absent and WordCloud would fail to open it.
    thai_font_path = next(
        (path for path in THAI_FONT_CANDIDATES if os.path.isfile(path)),
        None,
    )
    if thai_font_path is None:
        print(
            "No font from THAI_FONT_CANDIDATES was found; using the WordCloud "
            "default font, which may not render Thai glyphs."
        )

    # Word clouds for each label
    for label in df['label'].unique():
        if pd.isna(label):
            continue

        text = ' '.join(df.loc[df['label'] == label, 'cleaned_headline'].dropna())
        # Joining empty cleaned headlines yields a whitespace-only string, which
        # is truthy but contains no words, so test the stripped text instead.
        if not text.strip():
            print(f"No text data available for {label} news")
            continue

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            font_path=thai_font_path,
            regexp=THAI_TOKEN_PATTERN,
        ).generate(text)

        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud for {label} News')
        plt.show()

## Key Insights

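The exploration above covers the following aspects of the dataset; note the concrete findings for each one after running the notebook: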
1. Dataset size and structure
2. Label distribution (Real vs Fake news)
3. Headline length characteristics
4. Common words in each category

## Next Steps

Proceed to the data preparation notebook to clean and prepare the data for model training.