# Import dependencies

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt


In [None]:
# pandas is already imported in the first cell; repeated here so this cell
# can be run on its own. The package name is `pandas`, not `panda`.
import pandas as pd

# Load Dataset

In [None]:
# Location of the dataset CSV on disk.
dataset_path = 'path/to/fnspid.csv'  # Replace with actual path

# Read the whole file into a DataFrame, then preview its first rows
# (the trailing expression is what the cell displays).
df = pd.read_csv(dataset_path)
df.head()


# Descriptive Statistics

In [None]:
# Headline length in characters. `.str.len()` yields NaN for a missing
# headline instead of raising TypeError, which `apply(len)` does when it
# meets a float NaN; `describe()` skips NaN values.
df['headline_length'] = df['headline'].str.len()

# Only the last expression of a cell is echoed, so the intermediate
# summaries are printed explicitly rather than silently discarded.
print(df['headline_length'].describe())

# The ten publishers with the most articles.
print(df['publisher'].value_counts().head(10))

# Parse the timestamps, then count articles per calendar day.
# NOTE(review): assumes every 'date' value parses into a single datetime
# dtype; if the column mixes timezone offsets, `utc=True` may be needed
# for the `.dt` accessor to work -- confirm against the data.
df['date'] = pd.to_datetime(df['date'])
df['date_only'] = df['date'].dt.date
df.groupby('date_only').size().plot(kind='line', title='Articles per Day')

# Text Analysis - Topic Modeling

In [None]:
# Build a bag-of-words document-term matrix from the headlines.
# Missing headlines are treated as empty documents. Terms appearing in more
# than 95% of headlines or in fewer than 2 headlines are dropped, along with
# the built-in English stop words.
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = vectorizer.fit_transform(df['headline'].fillna(''))

# Fit a 5-topic LDA model; the fixed seed makes the run reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# Display keywords for each topic
# The vocabulary array is the same for every topic, so fetch it once instead
# of rebuilding it for each keyword lookup.
feature_names = vectorizer.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    print(f"Topic {i+1}:")
    # argsort is ascending, so the last ten indices are the ten
    # highest-weighted terms, listed from weakest to strongest.
    top_indices = topic.argsort()[-10:]
    print([feature_names[j] for j in top_indices])


# Time Series of Publication Frequency

In [None]:
# Hour-of-day component of each article's timestamp.
df['hour'] = df['date'].dt.hour

# Count the articles in each hour, order the counts by hour, and draw them
# as a bar chart.
articles_per_hour = df['hour'].value_counts().sort_index()
articles_per_hour.plot(kind='bar', title='Articles by Hour')

# Label the axes of the chart that was just drawn.
plt.xlabel('Hour (UTC-4)')
plt.ylabel('Number of Articles')

# Publisher Domain Analysis

In [None]:
# Capture everything after the first '@' in the publisher value; publishers
# without an '@' get NaN.
domain_pattern = r'@(.+)$'
df['publisher_domain'] = df['publisher'].str.extract(domain_pattern)

# The ten most frequent domains (NaN entries are not counted).
top_domains = df['publisher_domain'].value_counts().head(10)
top_domains

# Articles per Day

In [None]:
# NOTE(review): plot_articles_per_day is neither defined nor imported in the
# visible cells; presumably a project helper that draws the per-day article
# count chart from `df` -- confirm it is imported before this cell runs.
plot_articles_per_day(df)

# Topic Modeling

In [None]:
# NOTE(review): topic_modeling is neither defined nor imported in the visible
# cells; the loop below assumes it returns one iterable of keyword strings
# per topic -- confirm against the helper.
topics = topic_modeling(df['headline'], n_topics=5)

# Print each topic on its own line, numbered from 1.
for number, keywords in enumerate(topics, start=1):
    keyword_list = ', '.join(keywords)
    print(f"Topic {number}: {keyword_list}")

# Articles by Hour

In [None]:
# NOTE(review): plot_articles_by_hour is neither defined nor imported in the
# visible cells; presumably a project helper that draws the per-hour article
# count chart from `df` -- confirm it is imported before this cell runs.
plot_articles_by_hour(df)