In [None]:
# Import required libraries
import re
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add src directory to path
sys.path.append('../src')

# Import custom modules
from data_loader import DataLoader, load_and_prepare_data
from eda import EDAAnalyzer

# Configuration
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

print("Libraries imported successfully!")

## 1. Load the Data

We'll start by loading a sample of the dataset to perform EDA. Given the large size of the full dataset (1.4M+ rows), we'll initially work with a manageable 100,000-row subset.

In [None]:
# Location of the raw analyst-ratings CSV, relative to this notebook
DATA_PATH = '../Data/newsData/raw_analyst_ratings.csv'

# Build the project loader for that file
loader = DataLoader(DATA_PATH)

# Ask the loader for 100k rows only, to keep the first pass fast
print("Loading data...")
df = loader.load_data(nrows=100_000)

# Report the shape, then let the cell's last expression render the preview
print(f"\nDataset shape: {df.shape}", "\nFirst few rows:", sep="\n")
df.head()

## 2. Data Overview and Basic Information

In [None]:
# Structural overview of the freshly loaded frame
print("=== Dataset Information ===")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\n=== Missing Values ===")
print(df.isnull().sum())

print("\n=== Data Types ===")
print(df.dtypes)

print("\n=== Sample Records ===")
# A fixed seed keeps the displayed sample stable across Restart & Run All;
# the size is capped so a frame with fewer than 10 rows does not raise.
df.sample(n=min(10, len(df)), random_state=42)

## 3. Data Preprocessing

In [None]:
# Preprocess the data
print("Preprocessing data...")

# Remember the columns present before preprocessing so that only the columns
# it adds are reported below (previously the full column list was printed
# under the "New columns added" label).
columns_before = set(df.columns)
df = loader.preprocess()

print(f"\nDataset shape after preprocessing: {df.shape}")
print(f"\nNew columns added:")
# NOTE: re-running this cell without reloading reports an empty list, because
# `df` already contains the derived columns at that point.
print([column for column in df.columns if column not in columns_before])

df.head()

## 4. Descriptive Statistics

Let's examine basic statistics for textual lengths and other numerical features.

In [None]:
# Wrap the preprocessed frame in the project's analyzer and fetch its summary
eda = EDAAnalyzer(df)
stats = eda.descriptive_statistics()

# Headline block: totals followed by the covered date range
overview_lines = [
    "=== DESCRIPTIVE STATISTICS ===\n",
    f"Total Articles: {stats['total_articles']:,}",
    f"Unique Stocks: {stats['unique_stocks']:,}",
    f"Unique Publishers: {stats['unique_publishers']:,}",
    "\nDate Range:",
    f"  From: {stats['date_range'][0]}",
    f"  To: {stats['date_range'][1]}",
]
print("\n".join(overview_lines))

# One titled section per length-related statistics entry
for section_title, stats_key in (
    ("\n=== Headline Length Statistics ===", 'headline_length_stats'),
    ("\n=== Word Count Statistics ===", 'word_count_stats'),
):
    print(section_title)
    print(stats[stats_key])

In [None]:
# 2x2 grid: histograms on the top row, matching box plots on the bottom row
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One entry per grid column:
# (df column, histogram title, box-plot title, x-axis label, extra hist kwargs)
panels = [
    ('headline_length', 'Distribution of Headline Length',
     'Headline Length - Box Plot', 'Characters', {}),
    ('word_count', 'Distribution of Word Count',
     'Word Count - Box Plot', 'Words', {'color': 'orange'}),
]

for col_idx, (column, hist_title, box_title, unit_label, hist_kwargs) in enumerate(panels):
    values = df[column]

    # Histogram with a dashed marker at the mean
    hist_ax = axes[0, col_idx]
    hist_ax.hist(values, bins=50, edgecolor='black', alpha=0.7, **hist_kwargs)
    hist_ax.set_title(hist_title, fontsize=14, fontweight='bold')
    hist_ax.set_xlabel(unit_label)
    hist_ax.set_ylabel('Frequency')
    hist_ax.axvline(values.mean(), color='red', linestyle='--', label='Mean')
    hist_ax.legend()

    # Horizontal box plot of the same values
    box_ax = axes[1, col_idx]
    box_ax.boxplot(values, vert=False)
    box_ax.set_title(box_title, fontsize=14, fontweight='bold')
    box_ax.set_xlabel(unit_label)

plt.tight_layout()
plt.show()

# Numeric recap of the central tendencies shown above
headline_lengths = df['headline_length']
word_counts = df['word_count']

print("\n=== Key Insights ===")
print(f"Average headline length: {headline_lengths.mean():.1f} characters")
print(f"Average word count: {word_counts.mean():.1f} words")
print(f"Median headline length: {headline_lengths.median():.0f} characters")
print(f"Median word count: {word_counts.median():.0f} words")

## 5. Publisher Analysis

Identify which publishers are most active and their coverage patterns.

In [None]:
# Per-publisher table from the analyzer; the first row is treated as the most
# active publisher (assumes the analyzer returns it sorted by article count
# with publishers as the index - TODO confirm in eda.py)
publisher_stats = eda.publisher_analysis()

print("=== TOP 20 PUBLISHERS BY ARTICLE COUNT ===\n")
print(publisher_stats.head(20))

top_publisher_name = publisher_stats.index[0]
top_publisher_row = publisher_stats.iloc[0]
articles_per_publisher = publisher_stats['article_count']

print("\n=== Publisher Summary ===")
print(f"Total publishers: {len(publisher_stats)}")
print(f"Most active publisher: {top_publisher_name} ({top_publisher_row['article_count']:,} articles)")
print(f"Average articles per publisher: {articles_per_publisher.mean():.1f}")
print(f"Median articles per publisher: {articles_per_publisher.median():.0f}")

In [None]:
# Bar chart of the 25 most active publishers, drawn by the analyzer
fig = eda.plot_publisher_distribution(top_n=25, figsize=(14, 10))
plt.show()

# Share of all articles contributed by the first 10 / 20 rows of the table
total_publisher_articles = publisher_stats['article_count'].sum()
top_10_pct = publisher_stats.head(10)['article_count'].sum() / total_publisher_articles * 100
top_20_pct = publisher_stats.head(20)['article_count'].sum() / total_publisher_articles * 100

print(f"\n=== Publisher Concentration ===")
print(f"Top 10 publishers account for: {top_10_pct:.1f}% of all articles")
print(f"Top 20 publishers account for: {top_20_pct:.1f}% of all articles")

In [None]:
# Publishers written as e-mail addresses are reduced to the part after the
# first '@'; everything else is bucketed as 'Named Author'
df['publisher_domain'] = df['publisher'].map(
    lambda x: 'Named Author' if '@' not in str(x) else x.split('@')[1]
)

# Keep only the 15 most frequent buckets
domain_counts = df['publisher_domain'].value_counts().head(15)

print("=== TOP PUBLISHER DOMAINS ===\n")
print(domain_counts)

# Bar chart of those buckets
fig, ax = plt.subplots(figsize=(12, 6))
domain_counts.plot(kind='bar', ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Top 15 Publisher Domains', fontsize=14, fontweight='bold')
ax.set_xlabel('Domain')
ax.set_ylabel('Number of Articles')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 6. Stock Analysis

Analyze which stocks receive the most coverage.

In [None]:
# Per-stock table from the analyzer; the first row is treated as the most
# covered stock (assumes the analyzer returns it sorted by article count
# with tickers as the index - TODO confirm in eda.py)
stock_stats = eda.stock_analysis()

print("=== TOP 30 STOCKS BY ARTICLE COUNT ===\n")
print(stock_stats.head(30))

top_stock_symbol = stock_stats.index[0]
top_stock_row = stock_stats.iloc[0]
articles_per_stock = stock_stats['article_count']

print("\n=== Stock Coverage Summary ===")
print(f"Total unique stocks: {len(stock_stats)}")
print(f"Most covered stock: {top_stock_symbol} ({top_stock_row['article_count']:,} articles)")
print(f"Average articles per stock: {articles_per_stock.mean():.1f}")
print(f"Median articles per stock: {articles_per_stock.median():.0f}")

In [None]:
# Bar chart of the 40 most covered stocks, drawn by the analyzer
fig = eda.plot_stock_distribution(top_n=40, figsize=(14, 12))
plt.show()

# Share of all articles attributed to the first 50 / 100 rows of the table
total_stock_articles = stock_stats['article_count'].sum()
top_50_stocks_pct = stock_stats.head(50)['article_count'].sum() / total_stock_articles * 100
top_100_stocks_pct = stock_stats.head(100)['article_count'].sum() / total_stock_articles * 100

print(f"\n=== Stock Coverage Concentration ===")
print(f"Top 50 stocks account for: {top_50_stocks_pct:.1f}% of all articles")
print(f"Top 100 stocks account for: {top_100_stocks_pct:.1f}% of all articles")

## 7. Time Series Analysis

Analyze publication patterns over time - daily, hourly, and day-of-week trends.

In [None]:
# Analyzer output is read through its 'daily', 'hourly' and 'day_of_week' entries
time_stats = eda.time_series_analysis()
daily_counts = time_stats['daily']

print("=== TIME SERIES ANALYSIS ===\n")

# Summary of the per-day article counts, including the extreme days
print("Daily article statistics:")
print(f"  Average: {daily_counts.mean():.1f} articles/day")
print(f"  Median: {daily_counts.median():.0f} articles/day")
print(f"  Max: {daily_counts.max():,} articles (on {daily_counts.idxmax()})")
print(f"  Min: {daily_counts.min():,} articles (on {daily_counts.idxmin()})")

# Busiest hours first
print("\nHourly distribution (top 5 hours):")
print(time_stats['hourly'].sort_values(ascending=False).head())

# Weekday labels indexed by the day-of-week keys (position 0 is Monday);
# later cells reuse this list
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

print("\nDay of week distribution:")
for weekday_number, n_articles in time_stats['day_of_week'].items():
    print(f"  {day_names[weekday_number]}: {n_articles:,} articles")

In [None]:
# Draw the analyzer's combined time-series figure and render it inline
time_series_fig_size = (16, 12)
fig = eda.plot_time_series(figsize=time_series_fig_size)
plt.show()

In [None]:
# Article count per (year, month), relabelled as zero-padded "YYYY-MM" strings
monthly_trend = df.groupby(['year', 'month']).size()
monthly_trend.index = monthly_trend.index.map(
    lambda year_month: "{}-{:02d}".format(year_month[0], year_month[1])
)

# Line chart of the monthly counts
fig, ax = plt.subplots(figsize=(16, 6))
monthly_trend.plot(ax=ax, kind='line', marker='o', linewidth=2, markersize=4)
ax.set_title('Monthly Article Publication Trend', fontsize=14, fontweight='bold')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Number of Articles')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# Busiest and quietest months
busiest_month, quietest_month = monthly_trend.idxmax(), monthly_trend.idxmin()

print("\n=== Monthly Trend Insights ===")
print(f"Highest publication month: {busiest_month} with {monthly_trend.max():,} articles")
print(f"Lowest publication month: {quietest_month} with {monthly_trend.min():,} articles")

In [None]:
# Analyze publication time patterns: an hour-by-weekday heatmap next to a
# weekday/weekend split.
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Hourly heatmap by day of week
hourly_dow = df.groupby(['dayofweek', 'hour']).size().unstack(fill_value=0)
sns.heatmap(hourly_dow, cmap='YlOrRd', ax=axes[0], cbar_kws={'label': 'Article Count'})
axes[0].set_title('Publication Heatmap: Hour vs Day of Week', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Day of Week')
# Label only the weekdays that actually occur in the data. Passing all seven
# names unconditionally mislabels rows (or raises on a label/tick count
# mismatch) whenever the sample has no articles on some weekday.
axes[0].set_yticklabels([day_names[int(day)] for day in hourly_dow.index], rotation=0)

# Weekend vs Weekday
df['is_weekend'] = df['dayofweek'].isin([5, 6])
# Reindex so both classes are always present, in a fixed order. Without this a
# sample lacking weekend (or weekday) articles breaks the two pie labels and
# the weekend_counts[True] / weekend_counts[False] lookups.
weekend_counts = df.groupby('is_weekend').size().reindex([False, True], fill_value=0)
axes[1].pie(weekend_counts, labels=['Weekday', 'Weekend'], autopct='%1.1f%%',
            startangle=90, colors=['#ff9999', '#66b3ff'])
axes[1].set_title('Weekend vs Weekday Publication', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\n=== Publication Timing Insights ===")
print(f"Weekday articles: {weekend_counts[False]:,} ({weekend_counts[False]/len(df)*100:.1f}%)")
print(f"Weekend articles: {weekend_counts[True]:,} ({weekend_counts[True]/len(df)*100:.1f}%)")

## 8. Text Analysis and Keyword Extraction

Extract common keywords and themes from headlines.

In [None]:
# Ask the analyzer for the 50 most frequent headline keywords; the result is
# consumed below as a sequence of (keyword, count) pairs
print("Extracting keywords from headlines...")
keywords = eda.extract_keywords(n_keywords=50)

# Ranked listing, numbering from 1
print("\n=== TOP 50 KEYWORDS IN HEADLINES ===\n")
for rank, (term, occurrences) in enumerate(keywords, start=1):
    print(f"{rank:2d}. {term:20s} - {occurrences:,} occurrences")

In [None]:
# Word cloud (left) and top-30 frequency bars (right) for the extracted keywords
from wordcloud import WordCloud

# keyword -> count mapping for the cloud, plus parallel tuples for the bars
word_freq = dict(keywords[:100])
top_30_words = keywords[:30]
words, counts = zip(*top_30_words)

fig, (cloud_ax, bar_ax) = plt.subplots(1, 2, figsize=(16, 8))

# Left panel: word cloud sized by keyword frequency
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    max_words=100,
).generate_from_frequencies(word_freq)
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word Cloud of Top Keywords', fontsize=16, fontweight='bold')

# Right panel: horizontal bars, most frequent keyword at the top
bar_positions = range(len(words))
bar_ax.barh(bar_positions, counts, color='teal')
bar_ax.set_yticks(bar_positions)
bar_ax.set_yticklabels(words)
bar_ax.set_xlabel('Frequency')
bar_ax.set_title('Top 30 Keywords Frequency', fontsize=16, fontweight='bold')
bar_ax.invert_yaxis()

plt.tight_layout()
plt.show()

In [None]:
# Group the extracted keywords into hand-picked financial themes and compare
# how often each theme occurs.
financial_terms = {
    'price': ['price', 'target', 'pt'],
    'earnings': ['earnings', 'eps', 'revenue', 'sales'],
    'rating': ['rating', 'upgrade', 'downgrade', 'maintains', 'initiates'],
    'market': ['stock', 'stocks', 'shares', 'market'],
    'performance': ['high', 'low', 'gains', 'losses', 'beats', 'misses'],
    'action': ['buy', 'sell', 'hold', 'analyst', 'analysts']
}

# Count occurrences of each category.
# Each keyword contributes its count at most once per category. Summing per
# term instead would double-count any keyword that matches several terms of
# the same category (e.g. 'stocks' contains both 'stock' and 'stocks').
# NOTE(review): matching is still substring-based, so a short term such as
# 'low' also matches a keyword like 'follow'; tighten if that proves noisy.
category_counts = {}
for category, terms in financial_terms.items():
    category_counts[category] = sum(
        occurrences
        for keyword, occurrences in keywords
        if any(term in keyword.lower() for term in terms)
    )

print("\n=== FINANCIAL TERM CATEGORIES ===\n")
for category, count in sorted(category_counts.items(), key=lambda x: x[1], reverse=True):
    print(f"{category.capitalize():15s}: {count:,} occurrences")

# Visualize
plt.figure(figsize=(10, 6))
categories = list(category_counts.keys())
counts = list(category_counts.values())
plt.bar(categories, counts, color='coral', edgecolor='black')
plt.title('Financial Term Categories in Headlines', fontsize=14, fontweight='bold')
plt.xlabel('Category')
plt.ylabel('Occurrences')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Rough, keyword-based sentiment indicators: for each sentiment, count the
# headlines that contain at least one of its indicator terms.
sentiment_keywords = {
    'positive': ['gains', 'high', 'upgrade', 'beats', 'outperform', 'buy', 'rises', 'up'],
    'negative': ['losses', 'low', 'downgrade', 'misses', 'underperform', 'sell', 'falls', 'down'],
    'neutral': ['maintains', 'hold', 'neutral', 'equal', 'peer']
}

# Count sentiment indicators.
# * One combined, case-insensitive pattern per sentiment replaces the per-term
#   loop, so a headline is counted once even if it contains several terms
#   (e.g. both 'up' and 'upgrade'). That keeps the "% of articles" figure
#   below a true share that cannot exceed 100%.
# * The leading \b anchors each term at the start of a word, so 'up' no longer
#   matches 'group' and 'low' no longer matches 'follow', while inflected
#   forms such as 'upgrades' or 'lowers' still match.
# * na=False treats missing headlines as non-matching.
sentiment_counts = {}
for sentiment, terms in sentiment_keywords.items():
    pattern = r'\b(?:' + '|'.join(re.escape(term) for term in terms) + r')'
    has_indicator = df['headline'].str.contains(pattern, case=False, regex=True, na=False)
    sentiment_counts[sentiment] = int(has_indicator.sum())

print("\n=== SENTIMENT INDICATORS IN HEADLINES ===\n")
for sentiment, count in sentiment_counts.items():
    pct = (count / len(df)) * 100
    print(f"{sentiment.capitalize():10s}: {count:,} headlines ({pct:.1f}% of articles)")

# Visualize
plt.figure(figsize=(10, 6))
colors_map = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
colors = [colors_map[s] for s in sentiment_counts.keys()]
plt.bar(sentiment_counts.keys(), sentiment_counts.values(), color=colors, edgecolor='black', alpha=0.7)
plt.title('Sentiment Indicators in Headlines', fontsize=14, fontweight='bold')
plt.xlabel('Sentiment')
plt.ylabel('Headlines')
plt.tight_layout()
plt.show()

## 9. Summary and Key Findings

Let's summarize the key insights from our EDA.

In [None]:
# Consolidated text report built from the results of the earlier cells
# (publisher_stats, stock_stats, time_stats, weekend_counts, keywords,
# sentiment_counts and the concentration percentages must already exist).
# The section icons and bullets were stored as mojibake (mis-decoded UTF-8)
# and are restored to the intended characters here.
print("=" * 80)
print("EXPLORATORY DATA ANALYSIS - KEY FINDINGS")
print("=" * 80)

print("\n📊 DATASET OVERVIEW")
print(f"  • Total articles analyzed: {len(df):,}")
print(f"  • Unique stocks covered: {df['stock'].nunique():,}")
print(f"  • Unique publishers: {df['publisher'].nunique():,}")
print(f"  • Date range: {df['date'].min().date()} to {df['date'].max().date()}")

print("\n📝 HEADLINE CHARACTERISTICS")
print(f"  • Average headline length: {df['headline_length'].mean():.1f} characters")
print(f"  • Average word count: {df['word_count'].mean():.1f} words")
print(f"  • Typical headline: {df['headline_length'].median():.0f} characters, {df['word_count'].median():.0f} words")

print("\n✍️ PUBLISHER INSIGHTS")
print(f"  • Most active publisher: {publisher_stats.index[0]}")
print(f"  • Top 10 publishers: {top_10_pct:.1f}% of content")
print(f"  • Publisher concentration: High (top 20 = {top_20_pct:.1f}%)")

print("\n📈 STOCK COVERAGE")
top_stock = stock_stats.iloc[0]
print(f"  • Most covered stock: {stock_stats.index[0]} ({top_stock['article_count']:,} articles)")
print(f"  • Top 50 stocks: {top_50_stocks_pct:.1f}% of coverage")
print(f"  • Coverage distribution: Highly concentrated on major stocks")

print("\n📅 TEMPORAL PATTERNS")
peak_hour = time_stats['hourly'].idxmax()
peak_day = time_stats['day_of_week'].idxmax()
print(f"  • Peak publication hour: {peak_hour}:00 ({time_stats['hourly'][peak_hour]:,} articles)")
print(f"  • Peak publication day: {day_names[peak_day]} ({time_stats['day_of_week'][peak_day]:,} articles)")
print(f"  • Weekday vs Weekend: {weekend_counts[False]/len(df)*100:.1f}% weekday")
print(f"  • Publication consistency: Active throughout business hours")

print("\n🔤 CONTENT ANALYSIS")
print(f"  • Top keywords: {', '.join([w for w, c in keywords[:5]])}")
print(f"  • Common themes: Price targets, earnings, analyst ratings")
print(f"  • Sentiment distribution:")
for sentiment, count in sentiment_counts.items():
    print(f"    - {sentiment.capitalize()}: {(count/len(df)*100):.1f}%")

# NOTE(review): the statements below are fixed text, not computed from the
# data above; re-check them whenever the sample or the analysis changes.
print("\n💡 KEY INSIGHTS")
print("  • News is heavily concentrated on large-cap, actively traded stocks")
print("  • Publications peak during market hours (pre-market and trading hours)")
print("  • Majority of content comes from a small number of publishers")
print("  • Headlines focus on price targets, ratings, and earnings")
print("  • Sentiment language is prevalent in headlines")

print("\n" + "=" * 80)

## 10. Export Processed Data

Save the preprocessed data for use in subsequent tasks.

In [None]:
# Save processed data: write the frame, including the columns added during
# this notebook, to CSV without the index.
output_path = '../Data/processed_news_sample.csv'
df.to_csv(output_path, index=False)
# The status icons were stored as mojibake (mis-decoded UTF-8); restored here.
print(f"✅ Processed data saved to: {output_path}")

print(f"\n📦 Saved {len(df):,} rows with {len(df.columns)} columns")
print(f"Columns: {', '.join(df.columns.tolist())}")

## Conclusion

This EDA has provided comprehensive insights into the financial news dataset:

1. **Data Quality**: Clean dataset with minimal missing values
2. **Publisher Landscape**: Dominated by major financial news outlets
3. **Stock Coverage**: Concentrated on high-profile stocks
4. **Temporal Patterns**: Clear market-hours publication patterns
5. **Content Themes**: Focus on ratings, price targets, and earnings

**Next Steps for Task 2**: 
- Download historical stock price data for covered stocks
- Calculate technical indicators (SMA, RSI, MACD)
- Visualize price movements and indicators

**Next Steps for Task 3**:
- Perform sentiment analysis on headlines
- Align news data with stock prices
- Analyze correlation between sentiment and returns