In [6]:
%pip install seaborn textblob plotly


Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
     ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
     - ----------------------------------- 30.7/626.3 kB 435.7 kB/s eta 0:00:02
     - ----------------------------------- 30.7/626.3 kB 435.7 kB/s eta 0:00:02
     ---- -------------------------------- 71.7/626.3 kB 435.7 kB/s eta 0:00:02
     ------ ----------------------------- 112.6/626.3 kB 595.3 kB/s eta 0:00:01
     ------- ---------------------------- 122.9/626.3 kB 479.3 kB/s eta 0:00:02
     ---------- ------------------------- 174.1/626.3 kB 551.6 kB/s eta 0:00:01
     ------------ ----------------------- 225.3/626.3 kB 625.1 kB/s eta 0:00:01
     -------------- --------------------- 256.0/626.3 kB 628.5 kB/s eta 0:00:01
     ---------------- ------------------- 286.7/626.3 kB 655.2 kB/s eta 0:00:01
     -------------------- --------------- 358.4/62


[notice] A new release of pip is available: 23.0.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
import plotly.express as px

# Load the data using the custom function
from src.data_loader import load_data


# Use a relative path (adjust according to your project structure)
file_path = '../data/raw_analyst_ratings.csv'
df = load_data(file_path)

# Display basic information about the dataset
if df is not None:
    print(df.head())


ModuleNotFoundError: No module named 'src'

In [None]:
df.info()

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

In [None]:
# Descriptive Statistics
print("Descriptive statistics:")
print(df.describe())

In [None]:
# 1. Article length analysis
df['headline_length'] = df['headline'].apply(len)
plt.figure(figsize=(10, 6))
sns.histplot(df['headline_length'], bins=30, kde=True)
plt.title('Distribution of Headline Length')
plt.xlabel('Headline Length')
plt.ylabel('Frequency')
plt.show()

In [None]:
# 2. Publication Count by Publisher
publisher_count = df['publisher'].value_counts()
plt.figure(figsize=(12, 6))
sns.barplot(x=publisher_count.index[:10], y=publisher_count.values[:10])
plt.title('Top 10 Publishers by Article Count')
plt.xticks(rotation=45)
plt.ylabel('Number of Articles')
plt.show()

In [None]:
# 3. Time Series Analysis of Publications
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
daily_publications = df.resample('D').size()
plt.figure(figsize=(12, 6))
plt.plot(daily_publications)
plt.title('Daily Number of Publications')
plt.xlabel('Date')
plt.ylabel('Number of Articles')
plt.show()

In [None]:
# 4. Sentiment Analysis
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

df['sentiment_score'] = df['headline'].apply(analyze_sentiment)
plt.figure(figsize=(10, 6))
sns.histplot(df['sentiment_score'], bins=30, kde=True)
plt.title('Sentiment Score Distribution')
plt.xlabel('Sentiment Score')
plt.ylabel('Frequency')
plt.show()

In [None]:
# Display correlation between sentiment and headline length
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df['headline_length'], y=df['sentiment_score'])
plt.title('Sentiment Score vs Headline Length')
plt.xlabel('Headline Length')
plt.ylabel('Sentiment Score')
plt.show()