In [None]:
# Exploratory data analysis (EDA) notebook
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from data_loader import DataLoader
from sentiment_analyzer import SentimentAnalyzer

# Initialize classes
# Both helpers come from the project-local modules imported above
# (presumably resolved through the '../src' path entry -- confirm if the
# notebook is run from a different working directory).
data_loader = DataLoader()
# NOTE(review): sentiment_analyzer is not used in the visible part of this
# notebook; presumably later cells rely on it.
sentiment_analyzer = SentimentAnalyzer()

# Load data
# news_data is the object every analysis section below reads from; those
# sections index it by the 'date', 'publisher', 'stock' and 'headline'
# columns, and use the .dt accessor on 'date' (so it must be datetime-like).
news_data = data_loader.load_news_data()
tickers = data_loader.get_available_tickers()
print(f"Available tickers: {tickers}")

# Basic EDA: structural overview followed by summary statistics.
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() -- doing so emits a stray "None"
# line right after the report.
news_data.info()
print("\nBasic Statistics:")
# describe() returns a new object, so this one does need an explicit print().
print(news_data.describe())

# Publication frequency analysis
# Count articles per calendar day, order the counts chronologically and draw
# them as a line on a fresh 12x6 figure.
plt.figure(figsize=(12, 6))
articles_per_day = news_data['date'].dt.date.value_counts().sort_index()
frequency_ax = articles_per_day.plot()
frequency_ax.set_title('Daily Publication Frequency')
frequency_ax.set_xlabel('Date')
frequency_ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)  # tilt the date labels so they do not overlap
plt.tight_layout()
plt.show()

# Publisher analysis
# Tally articles per publisher (most frequent first) and chart the ten
# largest as a bar plot.
publisher_counts = news_data['publisher'].value_counts()
top_publishers = publisher_counts.head(10)

plt.figure(figsize=(10, 6))
publisher_ax = top_publishers.plot(kind='bar')
publisher_ax.set_title('Top 10 Publishers by Article Count')
publisher_ax.set_xlabel('Publisher')
publisher_ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Stock symbol distribution
# Tally articles per stock symbol (most frequent first) and chart the
# fifteen most-covered symbols as a bar plot.
stock_counts = news_data['stock'].value_counts()
most_covered = stock_counts.head(15)

plt.figure(figsize=(10, 6))
stock_ax = most_covered.plot(kind='bar')
stock_ax.set_title('Top 15 Stocks by News Coverage')
stock_ax.set_xlabel('Stock Symbol')
stock_ax.set_ylabel('Number of Articles')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Headline length analysis
# Store each headline's character count as a new column, then histogram the
# non-missing lengths in 50 bins.
news_data['headline_length'] = news_data['headline'].str.len()
known_lengths = news_data['headline_length'].dropna()

plt.figure(figsize=(10, 6))
plt.hist(known_lengths, bins=50, edgecolor='black')
length_ax = plt.gca()
length_ax.set_title('Distribution of Headline Lengths')
length_ax.set_xlabel('Headline Length (characters)')
length_ax.set_ylabel('Frequency')
plt.show()

# Word cloud for headlines
# Concatenate every non-missing headline into one space-separated string and
# feed it to an 800x400 white-background word cloud.
headline_texts = news_data['headline'].dropna().astype(str)
all_headlines = ' '.join(headline_texts)
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_headlines)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
cloud_ax = plt.gca()
cloud_ax.axis('off')  # the image has no meaningful axes, so hide them
cloud_ax.set_title('Word Cloud of News Headlines')
plt.show()