
# News Sentiment Analysis and Topic Modeling Notebook

This notebook demonstrates how to use the `NewsNLPAnalyzer` class for:
- Loading news data from a CSV file with columns: `headline`, `url`, `publisher`, `date`, `stock_ticker`
- Cleaning and preprocessing text
- Conducting sentiment analysis using TextBlob and VADER
- Performing topic modeling (LDA) to identify main topics in headlines
- Visualizing sentiment scores and topic keywords interactively with Plotly


In [34]:
import sys
import os
# Go up one level to project root
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Add project root to sys.path if not already there
if project_root not in sys.path:
    sys.path.append(project_root)

In [18]:
# Step 0: Install and download necessary NLTK data (run once)
import nltk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Eldan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Eldan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Eldan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Eldan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Eldan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:

# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scripts.sentiment_analysis import NewsNLPAnalyzer



In [36]:
# Step 1: Load your CSV file (adjust path as needed)
csv_path = "../data/raw/raw_analyst_ratings.csv"  # update with your path
df = pd.read_csv(csv_path)
print("Columns in dataset:", df.columns.tolist())
print(df.head(3))

print("Columns in dataset:", df.columns.tolist())
print(f"Data loaded with shape: {df.shape}")
print(f"Data loaded: {df.describe()}")

Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
   Unnamed: 0                                    headline  \
0           0     Stocks That Hit 52-Week Highs On Friday   
1           1  Stocks That Hit 52-Week Highs On Wednesday   
2           2               71 Biggest Movers From Friday   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00     A  
Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Data loaded with shape: (1407328, 6)
Data loaded:          Unnamed: 0
count  1.407328e+06
mean   7.072454e+05
std    4.081009e+05
min    0

In [39]:
# Step 2: Initialize the analyzer with DataFrame and specify the text column
analyzer = NewsNLPAnalyzer(df, text_column="headline")


In [41]:
# Step 3: Calculate sentiment scores using TextBlob and VADER
# This will analyze the headlines and return polarity scores
# TextBlob provides polarity (-1 to 1) and VADER gives compound scores (-1 to 1)
df_with_sentiment = analyzer.calculate_sentiments()
print(df_with_sentiment[['headline', 'textblob_polarity', 'vader_compound']].head())


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\Eldan/nltk_data'
    - 'd:\\Week1\\Sentiment-analysis-for-stock\\env\\nltk_data'
    - 'd:\\Week1\\Sentiment-analysis-for-stock\\env\\share\\nltk_data'
    - 'd:\\Week1\\Sentiment-analysis-for-stock\\env\\lib\\nltk_data'
    - 'C:\\Users\\Eldan\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [None]:
# Step 1: Load your CSV file (adjust path as needed)
csv_path = "../data/raw/raw_analyst_ratings.csv"  # update with your path
df = pd.read_csv(csv_path)
print("Columns in dataset:", df.columns.tolist())
print(df.head(3))

print("Columns in dataset:", df.columns.tolist())
print(f"Data loaded with shape: {df.shape}")
print(f"Data loaded: {df.describe()}")

Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
   Unnamed: 0                                    headline  \
0           0     Stocks That Hit 52-Week Highs On Friday   
1           1  Stocks That Hit 52-Week Highs On Wednesday   
2           2               71 Biggest Movers From Friday   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00     A  
Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Data loaded with shape: (1407328, 6)
Data loaded:          Unnamed: 0
count  1.407328e+06
mean   7.072454e+05
std    4.081009e+05
min    0

In [None]:
# Step 1: Load your CSV file (adjust path as needed)
csv_path = "../data/raw/raw_analyst_ratings.csv"  # update with your path
df = pd.read_csv(csv_path)
print("Columns in dataset:", df.columns.tolist())
print(df.head(3))

print("Columns in dataset:", df.columns.tolist())
print(f"Data loaded with shape: {df.shape}")
print(f"Data loaded: {df.describe()}")

Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
   Unnamed: 0                                    headline  \
0           0     Stocks That Hit 52-Week Highs On Friday   
1           1  Stocks That Hit 52-Week Highs On Wednesday   
2           2               71 Biggest Movers From Friday   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00     A  
Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Data loaded with shape: (1407328, 6)
Data loaded:          Unnamed: 0
count  1.407328e+06
mean   7.072454e+05
std    4.081009e+05
min    0

In [None]:
# Step 1: Load your CSV file (adjust path as needed)
csv_path = "../data/raw/raw_analyst_ratings.csv"  # update with your path
df = pd.read_csv(csv_path)
print("Columns in dataset:", df.columns.tolist())
print(df.head(3))

print("Columns in dataset:", df.columns.tolist())
print(f"Data loaded with shape: {df.shape}")
print(f"Data loaded: {df.describe()}")

Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
   Unnamed: 0                                    headline  \
0           0     Stocks That Hit 52-Week Highs On Friday   
1           1  Stocks That Hit 52-Week Highs On Wednesday   
2           2               71 Biggest Movers From Friday   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00     A  
Columns in dataset: ['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock']
Data loaded with shape: (1407328, 6)
Data loaded:          Unnamed: 0
count  1.407328e+06
mean   7.072454e+05
std    4.081009e+05
min    0

In [None]:

# Step 4: Extract top keywords/topics from headlines
keywords = analyzer.extract_keywords(top_n=30)
print(keywords)


In [None]:

# Step 5: (Optional) Plot sentiment comparison
fig = analyzer.plot_sentiment_comparison()
fig.show()


In [None]:

# Step 6: Visualize VADER sentiment distribution
fig2 = analyzer.plot_vader_distribution()
fig2.show()


In [None]:

# Step 7: Visualize topic modeling keywords
fig3 = analyzer.plot_topic_keywords()
fig3.show()
