In [None]:
%pip install -q seaborn

# 1. Imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import spacy
from pathlib import Path

# 2. Load Dataset
# Path().resolve() is the current working directory, not the notebook file
# itself; this assumes the kernel was started in the notebook's folder, one
# level below the project root (TODO confirm for other launch setups).
notebook_dir = Path().resolve()
project_root = notebook_dir.parent
csv_path = project_root / 'data' / 'bbc_news.csv'

df = pd.read_csv(csv_path)
print(f"Loaded dataset from: {csv_path}")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
# A bare `df.head()` is only rendered when it is the last expression of a
# cell; print it explicitly so the preview is actually shown.
print(df.head())

# 3. Basic Data Exploration
print('\nDataset Info:')
print('Shape:', df.shape)
print('Columns:', df.columns.tolist())

# Pick the column holding the article text: prefer 'description', fall back
# to 'text'. text_col stays None when neither exists, and the later sections
# skip themselves in that case.
text_col = next(
    (name for name in ('description', 'text') if name in df.columns),
    None,
)
if text_col is not None:
    print(f"Using column: '{text_col}'")
else:
    print("Warning: No 'description' or 'text' column found")
    print("Available columns:", df.columns.tolist())

# 4. Text Length Analysis
if text_col:
    # Character count per row. Missing values get length 0: calling str() on
    # NaN would otherwise report 3 (the literal "nan") and skew the stats.
    df['text_length'] = (
        df[text_col]
        .map(lambda value: len(str(value)), na_action='ignore')
        .fillna(0)
        .astype(int)
    )
    sns.histplot(df['text_length'], bins=30)
    plt.title('Text Length Distribution')
    plt.show()

    print('Text Length Statistics:')
    print('Mean Length:', df['text_length'].mean())
    print('Median Length:', df['text_length'].median())
    print('Max Length:', df['text_length'].max())
    print('Min Length:', df['text_length'].min())

# 5. Download NLTK Data
print('\nDownloading NLTK resources...')
# Resources fetched up front for the word_tokenize / stopwords calls used in
# the sections below; quiet=True suppresses the downloader's progress output.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource, quiet=True)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# 6. Tokenization & Stopword Removal Preview (NLTK)
if text_col:
    # First row of the text column, used as the running example here and in
    # the spaCy demo that follows.
    sample_text = str(df[text_col].iloc[0])
    print('\nSample Text (first 200 chars):')
    print(sample_text[:200])

    # Build the stopword set once: calling stopwords.words() inside the
    # comprehension would reload the list for every token, and a set gives
    # constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    tokens = word_tokenize(sample_text.lower())
    # Keep purely alphabetic tokens that are not English stopwords.
    filtered_tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    print('\nSample Tokens (first 10):', filtered_tokens[:10])

# 7. SpaCy Noun Phrase Extraction Demo
print('\nLoading spaCy model...')
nlp = spacy.load('en_core_web_sm')

if text_col:
    # sample_text is the first-row text prepared in the tokenization preview.
    doc = nlp(sample_text)
    # Collect the lower-cased text of every noun chunk spaCy reports.
    noun_phrases = []
    for chunk in doc.noun_chunks:
        noun_phrases.append(chunk.text.lower())
    print('Sample Noun Phrases (first 10):', noun_phrases[:10])

# 8. Most Frequent Words (NLTK)
if text_col:
    print('\nComputing most frequent words across corpus...')
    from collections import Counter

    # Load the stopword list once as a set. Calling stopwords.words() per
    # token reloads the list each time and scans it linearly, which is
    # prohibitively slow over a whole corpus.
    stop_words = set(stopwords.words('english'))

    # Alphabetic tokens from every non-missing row. Missing values are
    # dropped so that str(NaN) does not inject a spurious "nan" token.
    all_tokens = []
    for text in df[text_col].dropna():
        tokens = word_tokenize(str(text).lower())
        all_tokens.extend(t for t in tokens if t.isalpha())

    filtered_all_tokens = [t for t in all_tokens if t not in stop_words]
    freq_dist = Counter(filtered_all_tokens)
    print('Most Common Words (top 15):', freq_dist.most_common(15))

    # Visualize top words
    top_words = dict(freq_dist.most_common(20))
    plt.figure(figsize=(12, 6))
    # Pass concrete lists rather than dict views to the plotting call.
    plt.bar(list(top_words.keys()), list(top_words.values()))
    plt.title('Top 20 Most Frequent Words (excluding stopwords)')
    plt.xlabel('Word')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Final status line; the check mark replaces a mis-decoded (mojibake) emoji.
print('\n✅ Exploratory analysis complete!')