# In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the news dataset, asking pandas to parse the "date" column as datetimes.
news = pd.read_csv("data/news.csv", parse_dates=["date"])

# Per-headline size features: character count and whitespace-delimited word count.
news["headline_length"] = news["headline"].str.len()
news["word_count"] = news["headline"].str.split().str.len()

# Print explicitly: a bare expression is only echoed when it is the final
# statement of a notebook cell, so this summary would otherwise be discarded.
print(news[["headline_length", "word_count"]].describe())


# Histogram of headline character counts, split into 50 bins.
ax = sns.histplot(data=news, x="headline_length", bins=50)
ax.set_title("Headline Length Distribution")
plt.show()


# Count articles per publisher (value_counts sorts most frequent first) and
# draw a bar chart of the first 20 entries.
publisher_counts = news["publisher"].value_counts()
ax = publisher_counts.head(20).plot.bar()
ax.set_title("Most Active Publishers")
plt.show()


# Daily article volume: index rows by "date", bucket them per calendar day,
# and plot the number of rows in each bucket.
daily_counts = news.set_index("date").resample("D").size()
ax = daily_counts.plot(figsize=(14, 5))
ax.set_title("Articles per Day")
plt.show()


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Bag-of-words matrix over the headlines: English stop words are removed and
# the vocabulary is capped at 5000 terms.
cv = CountVectorizer(stop_words="english", max_features=5000)
# Replace missing headlines with "" before casting: astype(str) alone would
# turn NaN into the literal text "nan", which would then be counted as a word.
dtm = cv.fit_transform(news["headline"].fillna("").astype(str))

# Fit a 5-topic LDA model; the fixed random_state keeps runs reproducible.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(dtm)

# For every topic, report its 10 highest-weighted vocabulary terms.
words = cv.get_feature_names_out()
for i, topic in enumerate(lda.components_):
    # argsort() is ascending, so reverse it to list the strongest term first.
    # The inner index has its own name so it does not shadow the topic index.
    top_words = [words[word_idx] for word_idx in topic.argsort()[::-1][:10]]
    print(f"TOPIC {i}: {top_words}")


# Extract the hour component of each article's timestamp, then chart how many
# articles fall into each hour.
news["hour"] = news["date"].dt.hour

ax = sns.countplot(data=news, x="hour")
ax.set_title("News by Hour of Day")
plt.show()


# Capture the text following the first "@" in each publisher string; rows
# without an "@" get NaN. expand=False returns a Series (instead of a
# one-column DataFrame), which is the natural shape for a single new column.
news["domain"] = news["publisher"].str.extract(r'@(.*)', expand=False)

# Print explicitly so the ten most common domains are shown when this runs as
# a script; value_counts() leaves the NaN rows out by default.
print(news["domain"].value_counts().head(10))
