# Imports

In [None]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer

# Project layout. PROJECT_ROOT is a relative path, so it is resolved against
# the kernel's current working directory when the files are accessed.
PROJECT_ROOT = Path("../")
DATA_DIR = PROJECT_ROOT.joinpath("data")
CLEAN_DATA_PATH = DATA_DIR.joinpath("clean_news.parquet")
FIGS_DIR = PROJECT_ROOT.joinpath("outputs/figs")

# Create the figure output directory (and any missing parents) up front so
# the plt.savefig calls in later cells have somewhere to write.
FIGS_DIR.mkdir(parents=True, exist_ok=True)

# Read the cleaned news table. pandas needs a parquet engine (pyarrow or
# fastparquet) installed in the kernel's environment for this call to work.
news_df = pd.read_parquet(CLEAN_DATA_PATH)
news_df.head()


ImportError: Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.

# Headline Length Statistics

In [None]:
# Descriptive stats for both headline-length columns
stats = news_df[['headline_length_chars', 'headline_length_tokens']].describe()
print(stats)


def _save_length_histogram(column, unit, color, filename):
    """Plot, save, and show a 50-bin histogram of one headline-length column.

    Parameters
    ----------
    column : str
        Name of the column in ``news_df`` to histogram.
    unit : str
        Human-readable unit ("Characters", "Tokens"); used for the x-axis
        label and inside the figure title.
    color : str
        Matplotlib fill colour for the bars.
    filename : str
        File name (relative to ``FIGS_DIR``) the figure is saved under.
    """
    plt.figure(figsize=(8, 4))
    plt.hist(news_df[column], bins=50, color=color, edgecolor='black')
    plt.title(f"Headline Length ({unit})")
    plt.xlabel(unit)
    plt.ylabel("Frequency")
    plt.grid(axis='y', alpha=0.75)
    # Same layout pass the other plotting cells run before saving, so axis
    # labels are not clipped in the exported PNG.
    plt.tight_layout()
    plt.savefig(FIGS_DIR / filename)
    plt.show()


# One call per length measure; only column, label, colour and file name vary.
_save_length_histogram(
    'headline_length_chars', "Characters", 'skyblue',
    "headline_length_chars_hist.png",
)
_save_length_histogram(
    'headline_length_tokens', "Tokens", 'lightgreen',
    "headline_length_tokens_hist.png",
)

# Top Keywords

In [None]:
# Take the 'headline_lower' column as-is and replace missing values with
# empty strings so the vectorizer receives a string for every row.
headlines_lower = news_df['headline_lower'].fillna('')

# Count unigrams and bigrams, skipping English stop words and any term that
# occurs in fewer than 5 headlines.
vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    min_df=5,
    stop_words='english',
)
X = vectorizer.fit_transform(headlines_lower)

# Column-wise totals: one count per vocabulary term across all headlines.
sums = X.sum(axis=0)

# Pair every term with its total, then keep the 30 largest.
terms = []
for term, idx in vectorizer.vocabulary_.items():
    terms.append((term, sums[0, idx]))
top_terms = sorted(terms, key=lambda pair: pair[1], reverse=True)[:30]

# Show the 30 most frequent terms as the cell output
top_terms

# Publisher Counts

In [None]:
# Article counts for the 20 most frequent publishers
# (value_counts sorts in descending order of frequency).
publisher_counts = news_df['publisher'].value_counts().iloc[:20]
print(publisher_counts)

# Bar chart of those counts, written to FIGS_DIR and then displayed.
plt.figure(figsize=(10, 5))
publisher_counts.plot.bar(color='orange', edgecolor='black')
plt.ylabel("Number of Articles")
plt.title("Top 20 Publishers")
# Slant the tick labels and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(FIGS_DIR / "top_20_publishers.png")
plt.show()

# Stock Ticker Counts

In [None]:
# Article counts for the 20 most frequent values of the 'stock' column
# (value_counts sorts in descending order of frequency).
ticker_counts = news_df['stock'].value_counts().iloc[:20]
print(ticker_counts)

# Bar chart of those counts, written to FIGS_DIR and then displayed.
plt.figure(figsize=(10, 5))
ticker_counts.plot.bar(color='purple', edgecolor='black')
plt.ylabel("Number of Articles")
plt.title("Top 20 Stock Tickers")
# Slant the tick labels and right-align them under their bars.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(FIGS_DIR / "top_20_stocks.png")
plt.show()

# Daily Article Trends

In [None]:
# Number of articles per distinct 'date_only' value. Only values that occur
# in the data get a row in the result.
daily_counts = news_df.groupby('date_only').size()

# Line chart of the per-date counts with a marker on every data point.
plt.figure(figsize=(12, 5))
daily_counts.plot.line(marker='o', color='green')
plt.xlabel("Date")
plt.ylabel("Number of Articles")
plt.title("Daily Article Counts")
plt.grid(True)
plt.tight_layout()
plt.savefig(FIGS_DIR / "daily_article_counts.png")
plt.show()

# Rolling Spike Detection

In [None]:
rolling_window = 7  # number of preceding rows of daily_counts in the baseline
spike_sigma = 2     # rolling standard deviations above the mean that count as a spike

# Build the baseline from the rows *before* each date (shift by one) so the
# value being tested does not contribute to its own mean/std. When the tested
# value is part of its own 7-value window, it can never sit more than
# 6/sqrt(7) ~= 2.27 sample standard deviations above the window mean, which
# makes a 2-sigma threshold almost unreachable and hides real spikes.
# NOTE: the window counts rows, not calendar days; dates with no articles have
# no row in daily_counts, so a window may span more than 7 calendar days.
baseline = daily_counts.shift(1)
rolling_mean = baseline.rolling(rolling_window).mean()
rolling_std = baseline.rolling(rolling_window).std()
threshold = rolling_mean + spike_sigma * rolling_std

# Rows without a full baseline have a NaN threshold; the comparison is False
# there, so they are never reported as spikes.
spike_dates = daily_counts[daily_counts > threshold].index
print("Spike Dates:", spike_dates)

plt.figure(figsize=(12,5))
daily_counts.plot(label="Daily Count")
threshold.plot(label="Spike Threshold", linestyle='--')
# .loc makes the label-based lookup of the spike rows explicit.
plt.scatter(spike_dates, daily_counts.loc[spike_dates], color='red', label='Spike')
plt.title("Daily Article Counts with Spike Detection")
plt.xlabel("Date")
plt.ylabel("Number of Articles")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(FIGS_DIR / "daily_spikes.png")
plt.show()