In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
# --- Data Loading ---
def load_data(filepath: str) -> pd.DataFrame:
    """
    Read a CSV file from disk into a pandas DataFrame.
    Args:
        filepath (str): Location of the CSV file to read.
    Returns:
        pd.DataFrame: The file contents as parsed by pandas' default CSV reader.
    """
    return pd.read_csv(filepath)

In [None]:
# Load the raw analyst-ratings CSV; the relative path is resolved against the
# current working directory of the running kernel.
df = load_data("../data/raw_analyst_ratings.csv")


In [None]:
def preprocess_data(df):
    """
    Parse the 'date' column with pandas and drop rows whose date is invalid.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Parameters:
    - df: DataFrame with a 'date' column holding date-like values

    Returns:
    - DataFrame: Copy of `df` whose 'date' column has been converted with
      pd.to_datetime, without the rows whose date is missing or could not be
      parsed. Surviving rows keep their original index labels.

    Raises:
    - KeyError: If `df` has no 'date' column
    """
    # errors='coerce' turns unparseable values into NaT so they can be dropped below.
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(date=parsed_dates).dropna(subset=['date'])

# Rebind `df` to the preprocessed frame and print it for inspection.
df = preprocess_data(df)
print(df)


In [None]:
def aggregate_by_interval(df, interval='D'):
    """
    Counts publications per day, week, or month.

    Parameters:
    - df: DataFrame containing the news articles, with a datetime 'date' column
    - interval: Aggregation interval ('D' for daily, 'W' for weekly, 'M' for monthly)

    Returns:
    - DataFrame: One row per interval that contains at least one article, with
      the columns 'date' and 'publication_count'. For 'D' the 'date' values are
      datetime.date objects; for 'W' and 'M' they are the start timestamp of
      each pandas period.

    Raises:
    - ValueError: If interval is not one of 'D', 'W', or 'M'
    """
    if interval == 'D':
        group_keys = df['date'].dt.date
    elif interval in ('W', 'M'):
        group_keys = df['date'].dt.to_period(interval)
    else:
        raise ValueError(
            f"Unsupported interval {interval!r}; expected 'D', 'W', or 'M'."
        )

    # Name the count column identically for every interval so callers can
    # always rely on a 'publication_count' column being present.
    df_grouped = (
        df.groupby(group_keys)
        .size()
        .rename_axis('date')
        .reset_index(name='publication_count')
    )

    if interval != 'D':
        # Represent each period by its first instant so 'date' holds timestamps.
        df_grouped['date'] = df_grouped['date'].dt.start_time

    return df_grouped

# Aggregate the articles into per-day publication counts and print the table.
daily_data = aggregate_by_interval(df, interval='D')
print(daily_data)

In [None]:
def detect_spikes(df, threshold=1.5, window=7):
    """
    Detects spikes in publication frequency based on a rolling average.

    The input DataFrame is not modified; the spike information is added to a
    new DataFrame.

    Parameters:
    - df: DataFrame containing a 'publication_count' column
    - threshold: Factor to detect spikes (e.g., 1.5 means spikes greater than 1.5x the moving average)
    - window: Size of the rolling window, counted in rows of `df` (this equals
      days only when `df` has exactly one row per consecutive day)

    Returns:
    - DataFrame: Copy of `df` with two extra columns: 'rolling_avg' (rolling
      mean of 'publication_count') and 'spike' (True where the count exceeds
      rolling_avg * threshold)
    """
    counts = df['publication_count']
    rolling_avg = counts.rolling(window=window).mean()
    # The first window - 1 rows have a NaN average; comparing against NaN is
    # False, so those rows are never flagged as spikes.
    is_spike = counts > (rolling_avg * threshold)
    # assign() returns a new frame, leaving the caller's DataFrame unchanged.
    return df.assign(rolling_avg=rolling_avg, spike=is_spike)

# Flag spikes in the daily counts using the default threshold (1.5) and window (7).
spikes_df = detect_spikes(daily_data)
print(spikes_df)


In [None]:
def plot_publication_frequency(df, title='Publication Frequency'):
    """
    Draws a line chart of article counts over time and displays it.

    Parameters:
    - df: DataFrame with 'date' and 'publication_count' columns
    - title: Title of the plot
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x='date', y='publication_count', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Articles Published')
    # Rotate the x tick labels of the current axes by 45 degrees.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

# Chart the daily publication counts under the default title.
plot_publication_frequency(daily_data)


In [None]:
def plot_spikes(df, title='Spikes in Publication Frequency'):
    """
    Displays publication counts over time with the rolling average overlaid
    and the flagged spike rows marked in red.

    Parameters:
    - df: DataFrame with 'date', 'publication_count', 'rolling_avg' and 'spike' columns
    - title: Title of the plot
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(data=df, x='date', y='publication_count', label='Publication Count', ax=ax)
    sns.lineplot(data=df, x='date', y='rolling_avg', label='Rolling Average', linestyle='--', ax=ax)

    # Keep only the rows flagged as spikes and mark them on top of the lines.
    spike_rows = df[df['spike']]
    ax.scatter(spike_rows['date'], spike_rows['publication_count'], color='red', label='Spikes')

    ax.set_title(title)
    ax.set_xlabel('Date')
    ax.set_ylabel('Number of Articles Published')
    ax.legend()
    # Rotate the x tick labels of the current axes by 45 degrees.
    plt.xticks(rotation=45)
    fig.tight_layout()
    plt.show()

# Chart the daily counts together with the rolling average and flagged spikes.
plot_spikes(spikes_df)


In [None]:
def analyze_publishing_times(df):
    """
    Counts publications per hour of the day, shows them as a bar chart, and
    returns the counts.

    The input DataFrame is not modified.

    Parameters:
    - df: DataFrame containing the publication data with a datetime 'date' column

    Returns:
    - DataFrame: One row per hour that has at least one publication, with the
      columns 'hour' and 'publication_count'
    """
    # Derive the hour as a standalone Series instead of writing an 'hour'
    # column onto the caller's frame.
    hours = df['date'].dt.hour.rename('hour')
    publication_by_hour = df.groupby(hours).size().reset_index(name='publication_count')

    fig, ax = plt.subplots(figsize=(10, 6))
    # NOTE(review): recent seaborn releases deprecate passing `palette` without
    # `hue`; confirm against the installed seaborn version before changing this call.
    sns.barplot(x='hour', y='publication_count', data=publication_by_hour, palette="viridis", ax=ax)
    ax.set_title("Publication Frequency by Hour of the Day")
    ax.set_xlabel('Hour of the Day')
    ax.set_ylabel('Number of Articles Published')
    fig.tight_layout()
    plt.show()

    return publication_by_hour

# Chart how publications are distributed over the hours of the day.
analyze_publishing_times(df)
