In [1]:
import pandas as pd
import nltk
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datetime import datetime

In [2]:
# Download NLTK data
# 'punkt' serves older NLTK releases; recent releases (3.9+) load the
# tokenizer used by nltk.word_tokenize from 'punkt_tab' instead, so both are
# fetched to keep the keyword-extraction cell working on a fresh environment.
# 'stopwords' backs nltk.corpus.stopwords.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:

# Location of the raw analyst-ratings export; kept in one named constant so
# the path is easy to spot and change.
RAW_RATINGS_CSV = r"C:\10x AIMastery\Data\raw_analyst_ratings.csv"
df = pd.read_csv(RAW_RATINGS_CSV)

In [19]:
# Descriptive Statistics
# 1. Headline length
# Series.str.len() is vectorised and yields NaN for a missing headline,
# whereas apply(len) raises TypeError on the first NaN it meets.
df['headline_length'] = df['headline'].str.len()
print("Headline Length Statistics:")
print(df['headline_length'].describe())

Headline Length Statistics:
count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64


In [20]:
# 2. Articles per publisher
# value_counts() already orders publishers from most to fewest articles,
# so the first ten rows are the ten most prolific publishers.
publisher_counts = df['publisher'].value_counts()
top_publishers = publisher_counts.head(10)
print("\nArticles per Publisher:", top_publishers, sep="\n")


Articles per Publisher:
publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64


In [23]:
# 3. Publication date trends
# utc=True converts timezone-aware timestamps to UTC and localises naive ones
# as UTC. Without it pandas cannot hold rows with differing UTC offsets in a
# single datetime64 column, which the `.dt` accessor below requires.
df['date'] = pd.to_datetime(df['date'], format='ISO8601', utc=True)
df['date_only'] = df['date'].dt.date

# Articles per calendar day (UTC), in chronological order.
date_counts = df['date_only'].value_counts().sort_index()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
date_counts.plot(ax=ax)
ax.set_title('Article Publication Frequency Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
# NOTE(review): absolute, machine-specific output path — consider a
# project-relative location.
fig.savefig(r"C:\10x AIMastery\financial-news-analysis_1\notebooks\publication_trend.png")
plt.close(fig)

In [25]:
# Text Analysis (Topic Modeling - Basic Keyword Extraction)
# English stop words held in a set, since extract_keywords performs one
# membership test per token.
english_stopword_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stopword_list)
def extract_keywords(text):
    """Return the lowercase, alphanumeric, non-stop-word tokens of `text`.

    Parameters
    ----------
    text : str
        Headline text to tokenize.

    Returns
    -------
    list of str
        Tokens from ``nltk.word_tokenize`` on the lowercased text, keeping
        only those that are alphanumeric and not in the module-level
        ``stop_words`` set. Non-string input (for example the NaN pandas
        uses for a missing value) yields an empty list instead of raising
        AttributeError on ``.lower()``.
    """
    if not isinstance(text, str):
        return []
    tokens = nltk.word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum() and token not in stop_words]

# Tokenize every headline, then flatten the per-headline keyword lists into
# one list so that overall keyword frequencies can be counted.
df['keywords'] = df['headline'].apply(extract_keywords)
all_keywords = []
for headline_keywords in df['keywords']:
    all_keywords.extend(headline_keywords)
keyword_counts = Counter(all_keywords)
print("\nTop 10 Keywords:", keyword_counts.most_common(10), sep="\n")


Top 10 Keywords:
[('stocks', 161702), ('vs', 138835), ('eps', 128801), ('est', 122289), ('shares', 114140), ('reports', 108688), ('update', 91645), ('market', 91080), ('earnings', 87183), ('sales', 79528)]


In [26]:
# Time Series Analysis (Publication time)
# The x-axis is labelled as UTC, so normalise to UTC before taking the hour:
# timezone-aware timestamps are converted, naive ones are treated as UTC.
# This is a no-op when df['date'] is already in UTC.
df['hour'] = pd.to_datetime(df['date'], utc=True).dt.hour
hour_counts = df['hour'].value_counts().sort_index()

# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=hour_counts.index, y=hour_counts.values, ax=ax)
ax.set_title('Article Publication by Hour')
ax.set_xlabel('Hour of Day (UTC)')
ax.set_ylabel('Number of Articles')
# NOTE(review): absolute, machine-specific output path — consider a
# project-relative location.
fig.savefig(r"C:\10x AIMastery\financial-news-analysis_1\notebooks\publication_hour.png")
plt.close(fig)

In [27]:
# Publisher Analysis (Domain Extraction if emails are used)
def extract_domain(publisher):
    """Return the e-mail domain of `publisher`, or `publisher` unchanged.

    Parameters
    ----------
    publisher : str
        Publisher name or e-mail address.

    Returns
    -------
    object
        The text after the last '@' when `publisher` is a string containing
        one; otherwise `publisher` itself. Non-string values (for example
        the NaN pandas uses for a missing value) are passed through
        untouched rather than raising TypeError on the ``in`` test.
    """
    if not isinstance(publisher, str):
        return publisher
    # rpartition splits on the LAST '@'; `at_sign` is empty when none exists.
    _, at_sign, domain = publisher.rpartition('@')
    return domain if at_sign else publisher

# Reduce e-mail style publishers to their domain, then rank the domains by
# number of articles (most frequent first).
df['publisher_domain'] = df['publisher'].map(extract_domain)
domain_counts = df['publisher_domain'].value_counts()
top_domains = domain_counts.head(10)
print("\nTop 10 Publisher Domains:", top_domains, sep="\n")


Top 10 Publisher Domains:
publisher_domain
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64
