In [None]:
# Import necessary libraries
import string
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords

In [None]:

# Read the raw analyst-ratings CSV into the working DataFrame
RAW_RATINGS_CSV = "../data/raw_analyst_ratings.csv"
df = pd.read_csv(RAW_RATINGS_CSV)

Descriptive Statistics

In [None]:
# Quick overview: a single print call emits the caption, then the leading rows
print("First few rows of the dataset:", df.head(), sep="\n")

In [None]:
# Summary statistics as produced by DataFrame.describe(), printed under a caption
print("\nDescriptive statistics of numeric columns:", df.describe(), sep="\n")

In [None]:
# Parse the 'date' column into one timezone-naive datetime column.
#
# format='mixed' makes pandas infer the layout of every value individually.
# Without it, pandas >= 2.0 infers a single format from the first value and,
# combined with errors='coerce', silently turns every row written in a
# different layout into NaT.
# utc=True normalises any UTC offsets to UTC so the result always has a proper
# datetime dtype; strings carrying different offsets would otherwise come back
# as an object column on which the .dt accessor raises.
# Values that still cannot be parsed become NaT (errors='coerce').
parsed_dates = pd.to_datetime(df['date'], errors='coerce', format='mixed', utc=True)

# Strip the (now uniform) UTC marker to obtain timezone-naive timestamps.
# NOTE(review): values that carried an offset are therefore expressed in UTC
# wall-clock time, which shifts their hour relative to the original local time.
df['date'] = parsed_dates.dt.tz_localize(None)

In [None]:
# Count the null entries in each column and print them under a caption
missing_per_column = df.isnull().sum()
print("\nMissing values per column:", missing_per_column, sep="\n")

In [None]:
# Discard rows where the headline or the date is missing (df is modified in place)
required_columns = ['headline', 'date']
df.dropna(axis=0, how='any', subset=required_columns, inplace=True)

In [None]:
# Store len() of every headline next to the text, then summarise the new column
df['headline_length'] = df['headline'].map(len)
headline_length_summary = df['headline_length'].describe()
print("\nDescriptive statistics for headline length:", headline_length_summary, sep="\n")

In [None]:
# Histogram (30 bins) of headline lengths with a KDE curve overlaid
length_ax = sns.histplot(df['headline_length'], bins=30, kde=True)
length_ax.set_title("Distribution of Headline Lengths")
length_ax.set_xlabel("Headline Length")
length_ax.set_ylabel("Frequency")
plt.show()

Text Analysis (Sentiment Analysis)

In [None]:
from textblob import TextBlob

# Function to calculate sentiment polarity
def get_sentiment(text):
    """Return the TextBlob sentiment polarity computed for ``text``."""
    return TextBlob(text).sentiment.polarity

# Score every headline with get_sentiment
df['sentiment'] = df['headline'].map(get_sentiment)


def _polarity_to_label(polarity):
    """Map a polarity score to 'positive' (> 0), 'negative' (< 0) or 'neutral'."""
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


# Turn the numeric score into a three-way categorical label
df['sentiment_label'] = df['sentiment'].map(_polarity_to_label)

# How many headlines fall under each label
sentiment_label_counts = df['sentiment_label'].value_counts()
print(sentiment_label_counts)

# Bar chart of the label frequencies
sentiment_ax = sns.countplot(x='sentiment_label', data=df)
sentiment_ax.set_title("Distribution of Sentiment Labels")
sentiment_ax.set_xlabel("Sentiment")
sentiment_ax.set_ylabel("Count")
plt.show()

Topic Modeling and Keyword Extraction

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Function to clean text by removing punctuation and stopwords

# Deletes every character of string.punctuation in one str.translate pass
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

# Holds the NLTK English stop words after the first lookup, so the corpus is
# read once instead of once per headline
_STOP_WORD_CACHE = {}


def clean_text(text, stop_words=None):
    """Lower-case ``text``, strip punctuation and drop stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Words to remove, compared against the lower-cased, punctuation-free
        tokens. When omitted, the NLTK English stop-word list is used; it is
        loaded on the first call and cached afterwards.

    Returns
    -------
    str
        The remaining whitespace-separated tokens joined by single spaces.
    """
    if stop_words is None:
        if "english" not in _STOP_WORD_CACHE:
            _STOP_WORD_CACHE["english"] = frozenset(stopwords.words('english'))
        stop_words = _STOP_WORD_CACHE["english"]

    stripped = text.lower().translate(_PUNCTUATION_TABLE)
    return " ".join(word for word in stripped.split() if word not in stop_words)

# Clean headlines
df['cleaned_headline'] = df['headline'].map(clean_text)

# Extract common keywords using CountVectorizer (vocabulary capped at 20 features)
vectorizer = CountVectorizer(max_features=20)
X = vectorizer.fit_transform(df['cleaned_headline'])
keywords = vectorizer.get_feature_names_out()

# Display the top 20 keywords
print("Top 20 keywords:", keywords, sep="\n")

# Total occurrences of each keyword: column sums of the document-term matrix
keyword_counts = X.toarray().sum(axis=0)

# Plot the frequency of the top keywords
keyword_ax = sns.barplot(x=keywords, y=keyword_counts)
keyword_ax.set_title("Top 20 Keywords in Headlines")
keyword_ax.set_xlabel("Keywords")
keyword_ax.set_ylabel("Frequency")
keyword_ax.tick_params(axis='x', labelrotation=45)
plt.show()

Time Series Analysis

In [None]:
# Count the articles published on each calendar day.
# The series was previously used without ever being defined in the notebook,
# which raises NameError on a fresh kernel. resample() needs datetime values,
# so the parsed 'date' column is supplied through on= and df itself is left
# unchanged.
daily_article_count = df.resample('D', on='date')['headline'].count()

# Plot daily article count
fig, ax = plt.subplots(figsize=(12, 6))
daily_article_count.plot(ax=ax)
ax.set_title("Daily Article Count")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.grid(True)
plt.show()

In [None]:
# Detecting spikes in publication frequency

# A day is a spike when its article count exceeds mean + 2 standard deviations
mean_count, std_count = daily_article_count.mean(), daily_article_count.std()
threshold = mean_count + 2 * std_count

# Keep only the days whose count lies above the threshold
spike_days = daily_article_count.loc[daily_article_count.gt(threshold)]

# Daily counts in gray, spike days as red markers, threshold as a dashed line
fig, ax = plt.subplots(figsize=(12, 6))
daily_article_count.plot(ax=ax, label='Daily Article Count', color='gray')
ax.scatter(spike_days.index, spike_days.values, color='red', label='Spikes', s=100, zorder=5)
ax.axhline(y=threshold, color='blue', linestyle='--', label=f'Spike Threshold ({round(threshold, 2)})')
ax.set_title("Daily Article Count with Detected Spikes")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.legend()
ax.grid(True)
plt.show()

Analyzing Publication Times

In [None]:
# Extract the hour of publication from the parsed 'date' column.
# No visible cell moves 'date' into the index, so df.index has no .hour
# attribute; the timestamps are read from the column through the .dt accessor.
df['hour'] = df['date'].dt.hour

# Count the number of articles published at each hour, ordered by hour
hourly_article_count = df['hour'].value_counts().sort_index()

In [None]:

# Plot the distribution of publishing times by hour.
# seaborn 0.13 deprecates passing palette without hue, so the x variable is
# also given as hue (with the redundant legend switched off) to keep the
# per-bar viridis colouring without the FutureWarning.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=hourly_article_count.index,
    y=hourly_article_count.values,
    hue=hourly_article_count.index,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Distribution of Articles by Hour of the Day")
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Number of Articles")
ax.grid(True)
plt.show()

In [None]:
# Weekly article count

# Time series analysis: articles published per week.
# resample() only works on datetime values; df is not indexed by date in any
# visible cell, so the parsed 'date' column is passed through on=.
weekly_article_count = df.resample('W', on='date')['headline'].count()

In [None]:
# Line chart of the weekly article totals on an explicit Axes
fig, ax = plt.subplots(figsize=(12, 6))
weekly_article_count.plot(ax=ax)
ax.set_title("Weekly Article Count")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Articles")
ax.grid(True)
plt.show()

In [None]:
# Monthly and yearly article count

# Check publication trends by month ('ME' = month end) and year ('YE' = year end).
# resample() only works on datetime values; df is not indexed by date in any
# visible cell, so the parsed 'date' column is passed through on=.
monthly_article_count = df.resample('ME', on='date')['headline'].count()
yearly_article_count = df.resample('YE', on='date')['headline'].count()


In [None]:

# Two stacked panels: monthly counts on top, yearly counts below
fig, ax = plt.subplots(2, 1, figsize=(12, 12))

panels = (
    (ax[0], monthly_article_count, "Monthly Article Count", 'blue'),
    (ax[1], yearly_article_count, "Yearly Article Count", 'green'),
)
for panel_ax, counts, panel_title, line_color in panels:
    counts.plot(ax=panel_ax, color=line_color)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Date")
    panel_ax.set_ylabel("Number of Articles")

plt.tight_layout()
plt.show()

Publisher Analysis

Publisher Article Count

In [None]:

# Number of articles attributed to each publisher
publisher_column = df['publisher']
publisher_counts = publisher_column.value_counts()


In [None]:
# Bar chart of the article total for every publisher (x labels rotated 90 degrees)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=publisher_counts.index, y=publisher_counts.values, ax=ax)
ax.set_title("Article Count by Publisher")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
ax.tick_params(axis='x', labelrotation=90)
plt.show()

Top 10 Publishers

In [None]:
# First ten entries of publisher_counts (the ten largest article counts)
top_10_publishers = publisher_counts.iloc[:10]

In [None]:
# Plot top 10 publishers.
# seaborn 0.13 deprecates passing palette without hue, so the x variable is
# also given as hue (with the redundant legend switched off) to keep the
# per-bar viridis colouring without the FutureWarning.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_10_publishers.index,
    y=top_10_publishers.values,
    hue=top_10_publishers.index,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("Top 10 Publishers by Number of Articles")
ax.set_xlabel("Publisher")
ax.set_ylabel("Number of Articles")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

Publisher Domain Analysis (for Email Addresses)

In [None]:
# Check if publishers are emails (simplified example: the name contains "@").
# na=False counts a missing publisher as "not an e-mail", so the mask is purely
# boolean and can be used for row selection; regex=False matches the literal
# character.
is_email_publisher = df['publisher'].str.contains("@", regex=False, na=False)

# .copy() yields an independent frame, so adding a column below does not write
# into a slice of df (which triggers SettingWithCopyWarning).
email_publishers = df.loc[is_email_publisher].copy()

# Extract domains from email addresses: the text after the last '@'
email_publishers['domain'] = email_publishers['publisher'].str.split('@').str[-1]

# Count articles per domain
domain_counts = email_publishers['domain'].value_counts()

# Display top domains
print("Top domains by number of articles:")
print(domain_counts.head(10))

# Bar chart of the article total for every e-mail domain (x labels rotated 90 degrees)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=domain_counts.index, y=domain_counts.values, ax=ax)
ax.set_title("Article Count by Domain")
ax.set_xlabel("Domain")
ax.set_ylabel("Number of Articles")
ax.tick_params(axis='x', labelrotation=90)
plt.show()