In [None]:
# ==============================
# 1️⃣ IMPORT MODULES
# ==============================
from modules.eda import (
    load_news_data,
    preprocess_news_dataframe,
    dataset_overview,
    analyze_headline_length,
    count_articles_per_publisher,
    get_top_publishers,
    extract_email_domains,
    publisher_content_summary,
    extract_keywords,
    nlp_topic_modeling,
    daily_publication_count,
    detect_publication_spikes,
    hourly_publication_distribution,
    plot_headline_length_distribution,
    plot_top_publishers,
    plot_daily_publication,
    plot_spikes,
    plot_hourly_publication
)

# ==============================
# 2️⃣ LOAD DATA
# ==============================
# Relative path to the news CSV. NOTE(review): presumably resolved against the
# notebook's working directory — confirm before running from another location.
file_path = "../data/news_data/news.csv"
# `df` is the working dataset for the whole notebook; every later section
# reads from (and some rebind) this name.
df = load_news_data(file_path)

# ==============================
# 3️⃣ PREPROCESSING
# ==============================
# Rebind `df` to the preprocessed result, telling the helper that the text
# lives in `headline` and the timestamp in `date`.
# NOTE(review): the NLP section below reads a 'cleaned_text' column that is not
# created anywhere in this script — presumably it is added here; confirm in
# modules.eda.
df = preprocess_news_dataframe(df, text_column='headline', date_column='date')

# ==============================
# 4️⃣ DESCRIPTIVE STATISTICS
# ==============================
# Dataset overview
# Only the "null_summary" entry of the overview is reported here; any other
# entries the helper returns are ignored by this script.
overview = dataset_overview(df)
print("\nNull Value Summary:\n", overview["null_summary"])

# Headline length analysis
# The helper returns a pair: `df` is rebound to the first element and the
# statistics are kept separately.
# NOTE(review): presumably the returned frame carries an added headline-length
# column that the plot below relies on — confirm in modules.eda.
df, headline_stats = analyze_headline_length(df)
print("\nHeadline Length Stats:\n", headline_stats)
plot_headline_length_distribution(df)

# Top publishers
# NOTE(review): this heading ("Top Publishers") is printed again in the
# publisher-analysis section for the result of get_top_publishers, and the same
# plotting helper is called there too — consider a distinct label here.
publisher_counts = count_articles_per_publisher(df)
print("\nTop Publishers:\n", publisher_counts)
plot_top_publishers(publisher_counts)

# ==============================
# 5️⃣ PUBLISHER ANALYSIS
# ==============================
# Most active publishers
# Report the helper's result both as text and as a chart.
top_publishers = get_top_publishers(df)
print("\nTop Publishers:\n", top_publishers)
plot_top_publishers(top_publishers)

# Publisher domains (if emails)
# NOTE(review): presumably only meaningful when publisher values are e-mail
# addresses — confirm how the helper treats plain names.
top_domains = extract_email_domains(df)
print("\nTop Publisher Domains:\n", top_domains)

# Publisher content summary
# Single source of truth for the per-publisher word count: it is passed to the
# helper AND interpolated into the heading, so the label can no longer drift
# out of sync with the value actually requested.
top_n_words = 10
content_summary = publisher_content_summary(
    df,
    text_column='headline',
    top_n_words=top_n_words,
)
print(f"\nPublisher Content Summary (First {top_n_words} Words):")
# One line per publisher: "<publisher>: <words>".
for pub, words in content_summary.items():
    print(f"{pub}: {words}")

# ==============================
# 6️⃣ TEXT ANALYSIS (NLP)
# ==============================
# Top keywords
# Both NLP steps read the 'cleaned_text' column rather than the raw 'headline'.
# NOTE(review): that column is not created in this script — presumably it comes
# from the preprocessing step; confirm in modules.eda.
# Request the 15 highest-ranked keywords.
top_keywords = extract_keywords(df, column='cleaned_text', top_n=15)
print("\nTop Keywords:\n", top_keywords)

# Topic modeling
# Ask for 5 topics, each described by 8 words.
topics = nlp_topic_modeling(df, column='cleaned_text', n_topics=5, n_words=8)
print("\nTopics Detected:")
# `topics` is iterated and each item printed on its own line.
for t in topics:
    print(t)

# ==============================
# 7️⃣ TIME SERIES ANALYSIS
# ==============================
# Daily publication counts
# `daily_counts` feeds both the plain daily plot and the spike detection below.
daily_counts = daily_publication_count(df)
plot_daily_publication(daily_counts)

# Detect spikes
# The helper returns the detected spikes together with the threshold it used.
# NOTE(review): threshold_factor=2 presumably scales a dispersion measure
# (e.g. mean + 2 * std) — confirm the exact formula in modules.eda.
spikes, threshold = detect_publication_spikes(daily_counts, threshold_factor=2)
print("\nSpike Threshold:", threshold)
print("Spike Dates:\n", spikes)
# Plot the spikes together with the full daily series.
plot_spikes(daily_counts, spikes)

# Hourly distribution
# Computed from `df` directly, not from `daily_counts`.
hourly_counts = hourly_publication_distribution(df)
plot_hourly_publication(hourly_counts)
