# News Sentiment Analysis

## Import Libraries

In [1]:
# // TODO: TINGS
#     ✓ Find API for data collection
#     - GPU Selector
#     - Data Gathering
#     - Data Cleaning
#     - Feature generation
#     - Feature Engineering/selection
#     - Model Train
#     - Model Test
#     - Model Evaluation

In [2]:
# Common Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

# Cleaner output
from tqdm import tqdm

# Deep Learning Libraries
import torch

# Add the path to the API Scraper
## Project Path
project_path = "../"

## Add the path to API Scraper
sys.path.append(os.path.abspath(os.path.join(project_path, "lib")))

# Custom API Scraping Libraries
from scraper import get_cached_news_metadata

## Fetch Data

In [3]:
def scrape_data(n_pages: int = 10, before_date: str = "2025-12", path: str = ".") -> pd.DataFrame:
    
    articles = []

    for i in tqdm(range(n_pages), desc="Fetching News Data...", unit="news"):
        metadata = get_cached_news_metadata(page=i, before_date=before_date, path=path)

        data_list = metadata.get("data", [])

        for article in data_list:
            # extract sentiment (first entity if exists)
            entities = article.get("entities", [])
            if entities and "sentiment_score" in entities[0]:
                sentiment = entities[0]["sentiment_score"]
            else:
                sentiment = None

            # store sentiment as its own field inside article
            article["sentiment"] = sentiment

            articles.append(article)

    # final dataframe
    return pd.DataFrame(articles)

### Caching

In [4]:
# caching the dataset
before_date = "2025-12"

data_path = os.path.join(project_path,f"news_cache/{before_date}/csv/")
os.makedirs(data_path, exist_ok=True) # if the directory exist no need to make

cached_file = os.path.join(data_path, f"{before_date}_news_data.csv")

get_new_data = False

# We will cache the data so that it will load faster
if os.path.exists(cached_file) and not get_new_data:
    print("Loading cached dataset...")
    news_df = pd.read_csv(cached_file)
    print("Cached dataset loaded")

elif os.path.exists(cached_file) and get_new_data:
    print("Overwriting old data and caching new data...")
    news_df = scrape_data(n_pages=95, before_date= before_date, path= project_path)
    news_df.to_csv(cached_file, index=False)
    print("Done Overwriting old data and caching new data...")

else:
    print("Creating and caching dataset...")
    news_df = scrape_data(n_pages=95, before_date= before_date, path= project_path)
    news_df.to_csv(cached_file, index=False)
    print("Finished Caching")


Loading cached dataset...
Cached dataset loaded


## EDA

### View the Data

In [5]:
news_df.head()

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,entities,similar,sentiment
0,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0
1,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0
2,9084e5f1-75f5-4f15-aa3d-0676073b4aaf,Global week ahead: The start of a Santa Rally ...,,"STOXX 600, business news",And just like that... December is upon us. It'...,https://www.cnbc.com/2025/11/30/global-week-ah...,https://image.cnbcfm.com/api/v1/image/10823257...,en,2025-11-30T05:10:58.000000Z,cnbc.com,,"[{'symbol': 'M', 'name': ""Macy's, Inc."", 'exch...",[],0.6908
3,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0
4,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0


### Check for Missing Values

In [6]:
is_na = pd.DataFrame(news_df.isna().sum())
is_na.columns = ["Number_Missing"]
is_na["Missing_Percentage"] = (is_na["Number_Missing"] / len(news_df) * 100)
print(is_na)

                 Number_Missing  Missing_Percentage
uuid                          0            0.000000
title                         0            0.000000
description                   4            1.403509
keywords                    131           45.964912
snippet                       0            0.000000
url                           0            0.000000
image_url                     0            0.000000
language                      0            0.000000
published_at                  0            0.000000
source                        0            0.000000
relevance_score             285          100.000000
entities                      0            0.000000
similar                       0            0.000000
sentiment                     0            0.000000


In [12]:
from newspaper import Article

url = news_df["url"][1]
article = Article(url)
print(url)

https://seekingalpha.com/article/4848762-why-i-am-doubling-down-on-my-adobe-position


In [13]:
article.download()

In [14]:
article.parse()
# article.nlp()

In [15]:
print(article.summary)




In [16]:
article.text

"To say that Adobe ( ADBE ) stock has not had a good year is an understatement. The stock has tanked almost 30% year-to-date , while some of its tech peers, such as Microsoft ( MSFT ) or Google (\n\nI am an engineer and an individual who is passionate about finance/stock investment. I hold a Ph.D. in engineering and have been involved in acoustics and noise research for the past years. I am interested in capital appreciation opportunities. These include purchasing high-quality undervalued companies, and great companies that are growing at a high rate and that are reasonably priced. I also invest in dividend-paying stocks as long as these are good companies, and they are correctly priced. I hope to provide value for the community by sharing my ideas and opinions on investing and stock-picking.\n\nAnalyst’s Disclosure:I/we have a beneficial long position in the shares of ADBE either through stock ownership, options, or other derivatives. I wrote this article myself, and it expresses my o