# In [None]:
import pandas as pd
import logging
from collections import Counter
from datetime import datetime
import re

from .base_analyzer import BaseAnalyzer

class NewsAnalyzer(BaseAnalyzer):
    """Exploratory analysis of a financial-news headline dataset.

    The CSV given by ``filepath`` must contain the columns listed in
    ``REQUIRED_COLUMNS``. Call :meth:`load_data` first; every other method
    reads the DataFrame it stores in ``self.data``.

    NOTE(review): ``self.logger`` is used throughout but is not created in
    this class -- presumably ``BaseAnalyzer.__init__`` provides it; confirm.
    """

    # Columns that must be present in the loaded CSV.
    REQUIRED_COLUMNS = ('headline', 'publisher', 'date', 'stock')

    # Compiled once instead of being looked up for every row / headline.
    _EMAIL_DOMAIN_RE = re.compile(r'@([\w\.-]+)')
    _WORD_RE = re.compile(r'\b\w+\b')

    def __init__(self, filepath):
        """Create an analyzer for the CSV file located at ``filepath``.

        Args:
            filepath: Path (or any source accepted by ``pandas.read_csv``)
                of the news dataset.
        """
        super().__init__(filepath)
        # Kept on the instance so load_data() does not depend on how (or
        # whether) BaseAnalyzer stores the path.
        self.filepath = filepath
        self.top_publishers = None  # cached result of get_top_publishers()
        self.top_keywords = None    # cached result of extract_keywords()

    def load_data(self):
        """Load and preprocess the financial news dataset.

        Reads the CSV at ``self.filepath`` into ``self.data`` and then runs
        :meth:`_preprocess` on it.

        Raises:
            FileNotFoundError: If the CSV file does not exist.
            Exception: Any other read or preprocessing error is logged and
                re-raised unchanged.
        """
        try:
            # Use the path passed to the constructor rather than a
            # machine-specific hard-coded location.
            self.data = pd.read_csv(self.filepath)
            self._preprocess()
            self.logger.info("News data loaded successfully")
        except FileNotFoundError:
            self.logger.error("Dataset not found. Please check the path.")
            raise
        except Exception as e:
            self.logger.error(f"Error loading dataset: {e}")
            raise

    def _preprocess(self):
        """Validate the schema and add derived columns to ``self.data``.

        Converts ``date`` to datetime (unparseable values become ``NaT``),
        and adds ``headline_length`` and ``publisher_domain``.

        Raises:
            KeyError: If any column in ``REQUIRED_COLUMNS`` is missing.
        """
        try:
            # Ensure required columns exist
            missing_cols = [
                col for col in self.REQUIRED_COLUMNS
                if col not in self.data.columns
            ]
            if missing_cols:
                raise KeyError(f"Missing required columns: {missing_cols}")

            # Convert date to datetime; errors='coerce' turns bad values
            # into NaT instead of aborting the whole load.
            self.data['date'] = pd.to_datetime(self.data['date'], errors='coerce')
            self.data['headline_length'] = self.data['headline'].str.len()

            # Publishers given as e-mail addresses are reduced to their
            # domain; any other value is kept unchanged.
            domain_re = self._EMAIL_DOMAIN_RE

            def extract_domain(publisher):
                match = domain_re.search(str(publisher))
                return match.group(1) if match else publisher

            self.data['publisher_domain'] = self.data['publisher'].apply(extract_domain)

        except Exception as e:
            self.logger.error(f"Preprocessing failed: {e}")
            raise

    def get_top_publishers(self, top_n=10):
        """Return the ``top_n`` most frequent publishers.

        Counts are taken over the ``publisher_domain`` column. The result
        is also stored in ``self.top_publishers``.

        Args:
            top_n: Number of publishers to return.

        Returns:
            pandas.Series of article counts indexed by publisher, sorted in
            descending order.
        """
        try:
            self.top_publishers = self.data['publisher_domain'].value_counts().head(top_n)
            return self.top_publishers
        except Exception as e:
            self.logger.error(f"Error calculating top publishers: {e}")
            raise

    def analyze_headline_lengths(self):
        """Return summary statistics of headline lengths.

        Returns:
            pandas.Series produced by ``describe()`` on ``headline_length``.
        """
        try:
            return self.data['headline_length'].describe()
        except Exception as e:
            self.logger.error(f"Error analyzing headline lengths: {e}")
            raise

    def extract_keywords(self, top_n=10, stopwords=None):
        """Return the most frequent words across all headlines.

        Headlines are lower-cased and split on word boundaries. Missing
        headlines are skipped. The result is also stored in
        ``self.top_keywords``.

        Args:
            top_n: Number of words to return.
            stopwords: Optional iterable of words to exclude from the
                counts (compared case-insensitively). ``None`` excludes
                nothing.

        Returns:
            List of ``(word, count)`` tuples, most frequent first.
        """
        try:
            excluded = {word.lower() for word in stopwords} if stopwords else set()
            find_words = self._WORD_RE.findall

            word_counts = Counter()
            # dropna(): str(NaN) would otherwise contribute a bogus "nan" token.
            for headline in self.data['headline'].dropna():
                words = find_words(str(headline).lower())
                if excluded:
                    words = [word for word in words if word not in excluded]
                word_counts.update(words)

            self.top_keywords = word_counts.most_common(top_n)
            return self.top_keywords
        except Exception as e:
            self.logger.error(f"Keyword extraction failed: {e}")
            raise

    def analyze_time_series(self, freq='D'):
        """Count articles per time bucket.

        Args:
            freq: pandas offset alias used to resample the ``date`` column
                (``'D'`` = daily).

        Returns:
            pandas.DataFrame with columns ``time`` (bucket) and ``count``
            (number of rows falling into that bucket).
        """
        try:
            counts = self.data.resample(freq, on='date').size()
            return counts.reset_index(name='count').rename(columns={'date': 'time'})
        except Exception as e:
            self.logger.error(f"Time series analysis failed: {e}")
            raise