### Correlation Analysis

Average monthly sentiment score over time, plotted separately for each category

In [3]:
import pandas as pd
import plotly.express as px

# Load the cleaned article dataset.
file_path = 'cnn_news_articles_final_cleaned.csv'
df = pd.read_csv(file_path)

# Coerce the two columns used below. With errors='coerce', values that do not
# parse become NaT / NaN instead of raising.
df = df.assign(**{
    'date published': pd.to_datetime(
        df['date published'],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    ),
    'sentiment_score': pd.to_numeric(df['sentiment_score'], errors='coerce'),
})

# Monthly period of each article's publication date, used as the time bucket.
df = df.assign(year_month=df['date published'].dt.to_period('M'))

# Mean sentiment score per (month, category) pair.
monthly_avg_sentiment = (
    df.groupby(['year_month', 'category'])['sentiment_score']
    .mean()
    .reset_index()
)

# Reshape to wide form (one column per category) so that every month/category
# combination exists, with NaN where a pair had no rows in the grouped data.
pivot_data = (
    monthly_avg_sentiment
    .pivot(index='year_month', columns='category', values='sentiment_score')
    .reset_index()
)
# Convert the monthly periods to strings before handing them to plotly.
pivot_data['year_month'] = pivot_data['year_month'].astype(str)

# Back to long form: one row per (month, category), as px.line expects.
melted_data = pivot_data.melt(
    id_vars='year_month',
    var_name='category',
    value_name='sentiment_score',
)

# Human-readable names for the plotted columns.
axis_labels = {
    'year_month': 'Date (Year-Month)',
    'sentiment_score': 'Sentiment Score',
    'category': 'Category',
}

# One line per category over time.
fig = px.line(
    melted_data,
    x='year_month',
    y='sentiment_score',
    color='category',
    title='Average Sentiment Scores over time across All Categories',
    labels=axis_labels,
)
fig.update_layout(
    xaxis_title='Date (Year-Month)',
    yaxis_title='Sentiment Score',
    legend_title='Category',
    xaxis={'tickangle': 45},
    width=1000,
    height=600,
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
)

fig.show()

Topics over Time

In [4]:
import pandas as pd
import joblib

# Reload the article dataset and draw a fixed, reproducible sample.
# NOTE(review): topics_over_time is called without `topics=`, so the documents
# presumably have to line up one-to-one with the documents the model was fitted
# on -- confirm that this exact sample (n=9570, random_state=42) matches the
# data used for training.
df = pd.read_csv('cnn_news_articles_final_cleaned.csv')
df = df.sample(n=9570, random_state=42)

df['date published'] = pd.to_datetime(df['date published'])

# Parallel lists: the i-th timestamp belongs to the i-th text.
timestamps = df['date published'].tolist()
texts = df['text'].tolist()

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
bertopic_model = joblib.load('berTopic.pkl')
print('BERTopic model loaded')

# Compute the per-time-bin topic representations exactly once. This is the
# expensive step of the cell; an additional call whose result is immediately
# overwritten would only repeat the computation.
topics_over_time = bertopic_model.topics_over_time(
    docs=texts,
    timestamps=timestamps,
    nr_bins=20,
    global_tuning=True,
    evolution_tuning=True,
)

# Last expression of the cell, so the returned figure is rendered as output.
bertopic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html



BERTopic model loaded
