In [1]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Data Loading ---
def load_data(filepath: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame using pandas' default parsing options.
    Args:
        filepath (str): Location of the CSV file to read.
    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    return pd.read_csv(filepath)

In [None]:
# Load the raw analyst-ratings CSV; the path is relative to the notebook's working directory.
df = load_data(filepath="../data/raw_analyst_ratings.csv")

In [7]:
def perform_sentiment_analysis(df: pd.DataFrame, column: str = "headline") -> pd.DataFrame:
    """
    Score every headline with TextBlob and label it Positive, Negative or Neutral.

    Three columns are written to ``df`` in place:
    ``sentiment_polarity``, ``sentiment_subjectivity`` and ``sentiment``
    ('Positive' when polarity > 0, 'Negative' when polarity < 0, otherwise
    'Neutral').

    Cells that are not strings (for example NaN where the CSV had no headline)
    are never passed to TextBlob, which rejects non-string input. They are
    scored 0.0 / 0.0 and therefore labelled 'Neutral'.

    Args:
        df (pd.DataFrame): Input DataFrame. It is modified in place.
        column (str): Column containing headlines.
    Returns:
        pd.DataFrame: The same DataFrame object with the sentiment columns added.
    """
    def get_sentiment(text):
        # Guard against missing / non-text cells instead of crashing mid-run.
        if not isinstance(text, str):
            return 0.0, 0.0
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity

    def get_label(polarity):
        if polarity > 0:
            return 'Positive'
        if polarity < 0:
            return 'Negative'
        return 'Neutral'

    # Score each row once into a plain list. Building a pd.Series per row via
    # apply() is slow on large frames and yields the wrong shape when the
    # frame is empty.
    scores = [get_sentiment(text) for text in df[column]]

    # Reusing df.index keeps the new columns aligned with the existing rows.
    df['sentiment_polarity'] = pd.Series(
        [polarity for polarity, _ in scores], index=df.index, dtype=float
    )
    df['sentiment_subjectivity'] = pd.Series(
        [subjectivity for _, subjectivity in scores], index=df.index, dtype=float
    )
    df['sentiment'] = pd.Series(
        [get_label(polarity) for polarity, _ in scores], index=df.index
    )
    return df

In [None]:
# Add the sentiment columns, then show how many headlines fall under each label.
df = perform_sentiment_analysis(df, column="headline")
print(df["sentiment"].value_counts())

In [None]:
def perform_topic_modeling(df: pd.DataFrame, column: str = "headline", n_topics: int = 5) -> pd.DataFrame:
    """
    Group headlines into topics by clustering their TF-IDF vectors with KMeans.

    The cluster label of each row is written to a ``topic`` column on ``df``
    in place.

    Cells that are not strings (for example NaN where the CSV had no headline)
    are replaced with an empty string before vectorizing, because the
    vectorizer rejects NaN documents. Such rows still receive a cluster label.

    Args:
        df (pd.DataFrame): Input DataFrame. It is modified in place.
        column (str): Column containing headlines.
        n_topics (int): Number of topics (KMeans clusters) to extract.
    Returns:
        pd.DataFrame: The same DataFrame object with the ``topic`` column added.
    """
    # Normalise missing / non-text cells so one bad row cannot abort the fit.
    documents = [text if isinstance(text, str) else "" for text in df[column]]

    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(documents)

    # Fixed random_state keeps the cluster assignment repeatable between runs.
    kmeans = KMeans(n_clusters=n_topics, random_state=42)
    df['topic'] = kmeans.fit_predict(tfidf_matrix)
    return df

In [None]:
# Assign a topic cluster to every headline, then show the size of each cluster.
df = perform_topic_modeling(df, column="headline", n_topics=5)
print(df["topic"].value_counts())