In [None]:
# notebooks/task1_eda.ipynb

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re

# Fetch the NLTK stopwords corpus only when it is not already installed.
# Looking the corpus up raises LookupError when it is missing, so repeated
# runs with the corpus present skip the download call entirely.
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

# Read the news CSV into a DataFrame (path is relative to this notebook).
csv_path = '../data/fnspid.csv'  # Adjust path if needed
df = pd.read_csv(csv_path)
print("✅ Data loaded successfully")

# Show the first five rows as a quick sanity check of the columns.
display(df.head(n=5))

# --- 1. Basic Info ---
print("\n🧾 Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
# Per-column count of missing values.
print("\n📊 Null values:\n", df.isnull().sum())

# --- 2. Descriptive Statistics: Headline Length ---
# Character count of every headline. Missing headlines stay NaN instead of
# being stringified to 'nan' (which would be counted as a 3-character
# headline and skew the statistics and the histogram below).
df['headline_length'] = (
    df['headline'].astype(str).str.len().where(df['headline'].notna())
)
print("\n📏 Headline Length Stats:")
print(df['headline_length'].describe())

# Plot headline length distribution (histogram with a KDE overlay)
plt.figure(figsize=(8, 5))
sns.histplot(df['headline_length'], bins=30, kde=True)
plt.title('Distribution of Headline Lengths')
plt.xlabel('Length')
plt.ylabel('Frequency')
plt.show()

# --- 3. Articles per Publisher ---
# value_counts() orders publishers by descending article count, so the first
# ten entries are the ten most frequent publishers.
publisher_counts = df['publisher'].value_counts().iloc[:10]
print("\n📰 Top Publishers:")
print(publisher_counts)

# Bar chart: one bar per publisher, bar height = number of articles.
plt.figure(figsize=(10, 5))
sns.barplot(x=publisher_counts.index, y=publisher_counts.to_numpy())
plt.title('Top 10 Publishers by Article Count')
plt.xlabel('Publisher')
plt.ylabel('Number of Articles')
# Tilt the publisher names so long labels do not overlap.
plt.xticks(rotation=45)
plt.show()

# --- 4. Date Analysis ---
# Parse the raw 'date' column into timezone-aware UTC timestamps.
# NOTE(review): assumes every value in 'date' is parseable — confirm against the data.
df['date'] = pd.to_datetime(df['date'], utc=True)
# Keep only the calendar day of each timestamp so articles can be tallied per date.
df['date_only'] = df['date'].dt.date
daily_counts = df.groupby(by='date_only').size()

# Line chart of the number of articles published on each date.
plt.figure(figsize=(12, 5))
daily_counts.plot(kind='line')
plt.grid(True)
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.title('Article Count Over Time')
plt.show()

# --- 5. Publishing Hour Analysis ---
# Hour component of each timestamp; 'date' must already hold datetime values
# for the .dt accessor to work.
df['hour'] = df['date'].dt.hour

# One bar per hour value, bar height = number of articles in that hour.
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x='hour')
plt.xlabel('Hour of Day (UTC)')
plt.ylabel('Number of Articles')
plt.title('Distribution of Article Publishing Hours')
plt.show()

# --- 6. Word Cloud for Headlines ---
stop_words = set(stopwords.words('english'))
# The headline text below has every non-letter character removed, so a stop
# word such as "don't" shows up as "dont". Add the stripped form of each stop
# word so those tokens are filtered out as well.
stop_words |= {re.sub(r'[^a-z\s]', '', word) for word in stop_words}

# Merge all non-null headlines into one lower-cased string.
headlines = ' '.join(df['headline'].dropna().astype(str).tolist()).lower()
# Keep only the letters a-z and whitespace (drops digits and punctuation).
headlines = re.sub(r'[^a-z\s]', '', headlines)
# Remove stop words, then re-join the remaining tokens with single spaces.
headlines = ' '.join(word for word in headlines.split() if word not in stop_words)

wordcloud = WordCloud(width=800, height=400, background_color='white').generate(headlines)

# Render the generated cloud as an image without axes.
plt.figure(figsize=(15, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Headlines")
plt.show()
