# 01 — Exploratory Data Analysis

**Project:** Clickbait Headline Detector  
Explores the raw dataset and saves a cleaned version for downstream notebooks.

> **Run this notebook first**, before `02_Classic_ML.ipynb` or `03_Deep_Learning.ipynb` — both read the cleaned CSV it writes.

In [None]:
# Install all project dependencies in one go
!pip install pandas numpy matplotlib seaborn nltk wordcloud scikit-learn tensorflow joblib -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
import warnings
from collections import Counter

import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Suppress all Python warnings from here on (presumably to keep cell output
# tidy). NOTE(review): this also hides warnings that could be relevant.
warnings.filterwarnings('ignore')
# Apply matplotlib's 'seaborn-v0_8-whitegrid' style sheet to the figures below.
plt.style.use('seaborn-v0_8-whitegrid')

# NLTK's English stopword list as a set; used to filter tokens in the
# word-frequency and word-cloud cells.
STOP_WORDS = set(stopwords.words('english'))

## Config
Set the CSV path and column names below before running anything else.

In [None]:
# --- USER CONFIG ---
CSV_PATH     = 'data/clickbait_data.csv'  # path to the Kaggle CSV you downloaded
TEXT_COL     = 'headline'                 # column with the headline text
LABEL_COL    = 'clickbait'               # 1 = clickbait, 0 = real news
CLEANED_PATH = 'data/cleaned.csv'         # output used by notebooks 02 & 03

# Stop early if the dataset is not where we expect it. A real exception is
# raised instead of using `assert`, because assertions are skipped entirely
# when Python runs with -O. `isfile` also rejects a directory at that path.
if not os.path.isfile(CSV_PATH):
    raise FileNotFoundError(
        f'File not found: {CSV_PATH!r}\n'
        'Download the dataset from Kaggle and place it in the data/ folder.'
    )
print('Config OK.')

## Load & Inspect

In [None]:
# Load the raw CSV and print a quick overview of the two columns we use.
df = pd.read_csv(CSV_PATH)

# Fail fast with a readable message if the configured column names do not
# match the CSV header, instead of an opaque KeyError from the indexer below.
missing_cols = [col for col in (TEXT_COL, LABEL_COL) if col not in df.columns]
if missing_cols:
    raise KeyError(
        f'Column(s) {missing_cols} not found in {CSV_PATH!r}. '
        f'Available columns: {df.columns.tolist()}. '
        'Update TEXT_COL / LABEL_COL in the config cell.'
    )

print(f'Shape: {df.shape}')
print(f'Columns: {df.columns.tolist()}\n')
display(df[[TEXT_COL, LABEL_COL]].head(5))

# Per-column count of missing values in the text and label columns.
print('\nMissing values:')
print(df[[TEXT_COL, LABEL_COL]].isnull().sum())

# Number of rows per label value.
print('\nClass distribution:')
print(df[LABEL_COL].value_counts())

## Class Balance

In [None]:
# Bar chart of how many headlines fall into each class.
counts      = df[LABEL_COL].value_counts().sort_index()
label_names = {0: 'Real News', 1: 'Clickbait'}

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(
    # Fall back to the raw label value so an unexpected class shows up on the
    # chart instead of raising KeyError.
    [label_names.get(k, str(k)) for k in counts.index],
    counts.values,
    color=['steelblue', 'tomato'],
    edgecolor='black', linewidth=0.5
)
ax.set_title('Class Distribution', fontsize=14)
ax.set_ylabel('Count')

# Annotate each bar with its count. bar_label places the text a fixed number
# of points above the bar, so it stays correctly positioned for any dataset
# size (a fixed data-unit offset floats far above small bars).
ax.bar_label(bars, labels=[str(val) for val in counts.values],
             padding=3, fontweight='bold')
ax.margins(y=0.1)  # headroom so the labels stay inside the axes

plt.tight_layout()
plt.savefig('data/class_distribution.png', dpi=120, bbox_inches='tight')
plt.show()

## Headline Length
Clickbait headlines are often assumed to be slightly longer than real-news headlines; this section checks whether that holds for this dataset.

In [None]:
def _count_words(headline):
    """Number of whitespace-separated tokens in one headline string."""
    return len(headline.split())


# Store the per-headline word count as a new column on df.
df['word_count'] = df[TEXT_COL].astype(str).map(_count_words)

# One histogram panel per class: (label value, display name, bar colour).
panels = [
    (0, 'Real News', 'steelblue'),
    (1, 'Clickbait', 'tomato'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, (label, name, color) in zip(axes, panels):
    lengths = df.loc[df[LABEL_COL] == label, 'word_count']
    avg_len = lengths.mean()

    ax.hist(lengths, bins=30, color=color, alpha=0.8,
            edgecolor='black', linewidth=0.4)
    # Dashed vertical line marks the class mean.
    ax.axvline(avg_len, color='black', linestyle='--', linewidth=1.2,
               label=f'Mean: {avg_len:.1f}')
    ax.set(
        title=f'{name} — Word Count',
        xlabel='Words per Headline',
        ylabel='Count',
    )
    ax.legend()

plt.suptitle('Headline Length by Class', fontsize=14, y=1.02)
plt.tight_layout()
plt.savefig('data/word_count_distribution.png', dpi=120, bbox_inches='tight')
plt.show()

## Most Frequent Words

In [None]:
# Runs of ASCII letters; compiled once because it is applied to every headline.
_WORD_RE = re.compile(r'[a-z]+')


def top_words(texts, n=20, stop_words=None):
    """Return the n most common words after removing stopwords.

    Headlines are lower-cased and split on every non-letter character, so a
    contraction breaks into fragments ("don't" -> "don", "t") instead of
    collapsing into a new word such as "dont" that would slip past the
    stopword filter. Hyphenated or tab-separated words are likewise kept
    apart rather than glued together.

    Args:
        texts: Iterable of headlines. Non-string items are converted with
            str(); missing values (None / float NaN) are skipped.
        n: How many (word, count) pairs to return.
        stop_words: Collection of lower-case words to drop. Defaults to the
            module-level STOP_WORDS.

    Returns:
        List of (word, count) tuples, most frequent first. Words of one or
        two letters are always dropped.
    """
    if stop_words is None:
        stop_words = STOP_WORDS

    counts = Counter()
    for text in texts:
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            continue
        counts.update(
            word
            for word in _WORD_RE.findall(str(text).lower())
            if len(word) > 2 and word not in stop_words
        )
    return counts.most_common(n)


# Side-by-side horizontal bar charts of the most frequent words per class.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, label, name, color in zip(
    axes, [0, 1], ['Real News', 'Clickbait'], ['steelblue', 'tomato']
):
    top = top_words(df[df[LABEL_COL] == label][TEXT_COL])
    ax.set_title(f'Top 20 Words — {name}')
    ax.set_xlabel('Frequency')

    if not top:
        # zip(*[]) yields nothing to unpack; show a placeholder instead of
        # crashing when a class has no rows or only stopwords.
        ax.text(0.5, 0.5, 'No words to display',
                ha='center', va='center', transform=ax.transAxes)
        continue

    wrds, cnts = zip(*top)
    # Reverse so the most frequent word ends up at the top of the chart.
    ax.barh(wrds[::-1], cnts[::-1], color=color, edgecolor='black', linewidth=0.3)

plt.suptitle('Most Common Words (stopwords removed)', fontsize=13, y=1.02)
plt.tight_layout()
plt.savefig('data/top_words.png', dpi=120, bbox_inches='tight')
plt.show()

## Word Clouds

In [None]:
# One word cloud per class: (label value, display name, matplotlib colormap).
cloud_specs = [
    (0, 'Real News', 'Blues'),
    (1, 'Clickbait', 'Reds'),
]

# Rendering options shared by both clouds.
cloud_options = dict(
    width=700,
    height=400,
    background_color='white',
    stopwords=STOP_WORDS,
    max_words=100,
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (label, name, cmap) in zip(axes, cloud_specs):
    # Concatenate every headline of this class into one space-separated string.
    headlines = df.loc[df[LABEL_COL] == label, TEXT_COL].astype(str)
    corpus = ' '.join(headlines)

    wc = WordCloud(colormap=cmap, **cloud_options).generate(corpus)

    ax.imshow(wc, interpolation='bilinear')
    ax.set_title(f'Word Cloud — {name}', fontsize=14)
    ax.axis('off')

plt.tight_layout()
plt.savefig('data/wordclouds.png', dpi=120, bbox_inches='tight')
plt.show()

## Save Cleaned Data
Drop rows with a missing headline or label, then normalise the column names to `headline` / `label` so notebooks 02 and 03 work regardless of the original CSV naming.

In [None]:
# Keep only the text and label columns and drop rows missing either value.
df_clean = df[[TEXT_COL, LABEL_COL]].dropna().copy()
df_clean.columns = ['headline', 'label']       # standard names for downstream notebooks
df_clean['label'] = df_clean['label'].astype(int)

# A blank headline is written as an empty CSV field, which pandas reads back
# as NaN. Strip surrounding whitespace and drop blank rows here so the saved
# file contains no missing text when it is loaded again.
df_clean['headline'] = df_clean['headline'].astype(str).str.strip()
df_clean = df_clean[df_clean['headline'] != '']

# Create the output folder if needed so to_csv does not fail when
# CLEANED_PATH points at a directory that does not exist yet.
os.makedirs(os.path.dirname(CLEANED_PATH) or '.', exist_ok=True)
df_clean.to_csv(CLEANED_PATH, index=False)

print(f'Saved to {CLEANED_PATH!r}')
print(f'  Total rows : {len(df_clean)}')
print(f'  Clickbait  : {(df_clean["label"] == 1).sum()}')
print(f'  Real news  : {(df_clean["label"] == 0).sum()}')