# News Sentiment Analysis

## Import Libraries

In [1]:
# TODO: things to do
#     ✓ Find API for data collection
#     - GPU Selector
#     - Data Gathering
#     - Data Cleaning
#     - Feature generation
#     - Feature Engineering/selection
#     - Model Train
#     - Model Test
#     - Model Evaluation

In [2]:
# Common Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys

# Cleaner output
from tqdm import tqdm

# Deep Learning Libraries
import torch

# Make the project's local modules importable from this notebook.
## Project root, given relative to the notebook's working directory
project_path = "../"

## Append <project_path>/lib (resolved to an absolute path) to sys.path;
## this must run before the `from scraper import ...` line that follows.
sys.path.append(os.path.abspath(os.path.join(project_path, "lib")))

# Custom API Scraping Libraries
from scraper import get_cached_news_metadata

## Fetch Data

In [3]:
def _first_entity_sentiment(article: dict):
    """Return the ``sentiment_score`` of the article's first entity, or ``None``.

    ``None`` is returned when the article has no ``entities`` list, the list is
    empty, the first entry is not a dict, or it carries no ``sentiment_score``.
    """
    entities = article.get("entities")
    if isinstance(entities, (list, tuple)) and entities and isinstance(entities[0], dict):
        return entities[0].get("sentiment_score")
    return None


def scrape_data(n_pages: int = 10, before_date: str = "2025-12", path: str = ".") -> pd.DataFrame:
    """Collect news articles from the cached scraper into a single DataFrame.

    Calls ``get_cached_news_metadata`` once per page for pages ``0 .. n_pages - 1``
    and gathers every article found under each response's ``"data"`` key. Each
    article gets an extra ``"sentiment"`` field holding the ``sentiment_score``
    of its first entity (``None`` when there is none).

    Parameters
    ----------
    n_pages : int
        Number of pages to request, starting at page 0.
    before_date : str
        Forwarded unchanged to ``get_cached_news_metadata``.
    path : str
        Forwarded unchanged to ``get_cached_news_metadata``.

    Returns
    -------
    pd.DataFrame
        One row per article, in fetch order; empty if nothing was returned.

    Notes
    -----
    NOTE(review): paging starts at 0. If the underlying API is 1-indexed, pages
    0 and 1 could return the same articles -- confirm against the scraper and
    de-duplicate on ``uuid`` if so.
    """
    articles = []

    for page in tqdm(range(n_pages), desc="Fetching News Data...", unit="news"):
        metadata = get_cached_news_metadata(page=page, before_date=before_date, path=path)

        # A page may come back as None or without a usable "data" list;
        # treat that as "no articles" instead of crashing the whole run.
        page_articles = metadata.get("data") if isinstance(metadata, dict) else None

        for article in page_articles or []:
            # Build a new dict rather than writing into the scraper's object,
            # so the cached metadata is left untouched.
            articles.append({**article, "sentiment": _first_entity_sentiment(article)})

    return pd.DataFrame(articles)

### Caching

In [8]:
# Cache the scraped dataset as CSV so re-running the notebook does not
# re-fetch every page.
before_date = "2025-12"
n_pages = 97  # pages requested from the scraper when the cache is (re)built

data_path = os.path.join(project_path, f"news_cache/{before_date}/csv/")
os.makedirs(data_path, exist_ok=True)  # no-op when the directory already exists

cached_file = os.path.join(data_path, f"{before_date}_news_data.csv")

# Set to True to ignore an existing cache file and fetch everything again.
get_new_data = False

cache_exists = os.path.exists(cached_file)

if cache_exists and not get_new_data:
    print("Loading cached dataset...")
    news_df = pd.read_csv(cached_file)
    print("Cached dataset loaded")
else:
    # Refreshing an existing cache and creating a new one do the same work;
    # only the progress messages differ.
    if cache_exists:
        start_msg = "Overwriting old data and caching new data..."
        done_msg = "Done Overwriting old data and caching new data..."
    else:
        start_msg = "Creating and caching dataset..."
        done_msg = "Finished Caching"

    print(start_msg)
    scraped_df = scrape_data(n_pages=n_pages, before_date=before_date, path=project_path)

    if scraped_df.empty:
        # Do not write an empty file: the next run would treat it as a valid
        # cache and fail (or load nothing) when reading it back.
        news_df = scraped_df
        print("No articles fetched; cache file not written")
    else:
        scraped_df.to_csv(cached_file, index=False)
        # Reload from disk so news_df has the same dtypes as on a cache hit
        # (list-valued columns such as `entities` are stored as text in CSV).
        news_df = pd.read_csv(cached_file)
        print(done_msg)


Loading cached dataset...
Cached dataset loaded


## EDA

### View the Data

In [9]:
# Preview the first five rows; as the cell's last expression it renders as a table.
# NOTE(review): in the saved output rows 3-4 repeat the uuids of rows 0-1 --
# check for duplicate articles before using this data downstream.
news_df.head()

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,entities,similar,sentiment
0,c3d88644-419a-4fe8-b446-628190c63b65,"AI tools are 'deskilling' workers, philosophy ...",A philosophy professor warns that AI reliance ...,,"AI is helping workers move faster, but a profe...",https://www.businessinsider.com/ai-tools-are-d...,https://i.insider.com/69297c44abd5e944effbaa28...,en,2025-11-29T10:55:36.000000Z,businessinsider.com,,[],[{'uuid': '3db5ec1d-d43f-4af5-a24e-7df27a7099a...,
1,abb28cdb-3dbe-483e-8d8c-38ba9a94bdfd,I landed a job at LinkedIn by posting on the p...,By sharing his work on LinkedIn and his websit...,,This story is available exclusively to Busines...,https://www.businessinsider.com/how-software-e...,https://i.insider.com/6927676f89026fbb4d0e8612...,en,2025-11-29T10:55:36.000000Z,businessinsider.com,,[],[],
2,55e32ef0-9620-4e32-b266-600453761c0e,10 companies that bounced back after bankruptcy,"Companies like Hooters, Marvel, Converse, and ...",,Hooters is once again owned by its founders af...,https://www.businessinsider.com/companies-bank...,https://i.insider.com/6926263689026fbb4d0e6d29...,en,2025-11-29T10:55:36.000000Z,businessinsider.com,,[],[],
3,c3d88644-419a-4fe8-b446-628190c63b65,"AI tools are 'deskilling' workers, philosophy ...",A philosophy professor warns that AI reliance ...,,"AI is helping workers move faster, but a profe...",https://www.businessinsider.com/ai-tools-are-d...,https://i.insider.com/69297c44abd5e944effbaa28...,en,2025-11-29T10:55:36.000000Z,businessinsider.com,,[],[{'uuid': '3db5ec1d-d43f-4af5-a24e-7df27a7099a...,
4,abb28cdb-3dbe-483e-8d8c-38ba9a94bdfd,I landed a job at LinkedIn by posting on the p...,By sharing his work on LinkedIn and his websit...,,This story is available exclusively to Busines...,https://www.businessinsider.com/how-software-e...,https://i.insider.com/6927676f89026fbb4d0e8612...,en,2025-11-29T10:55:36.000000Z,businessinsider.com,,[],[],


### Check for Missing Values

In [11]:
# Per-column count of missing values, plus that count as a share of all rows.
missing_counts = news_df.isna().sum()
is_na = pd.DataFrame(
    {
        "Number_Missing": missing_counts,
        "Missing_Percentage": missing_counts / len(news_df) * 100,
    }
)
print(is_na)

                 Number_Missing  Missing_Percentage
uuid                          0            0.000000
title                         0            0.000000
description                  13            4.467354
keywords                    132           45.360825
snippet                       5            1.718213
url                           0            0.000000
image_url                    12            4.123711
language                      0            0.000000
published_at                  0            0.000000
source                        0            0.000000
relevance_score             291          100.000000
entities                      0            0.000000
similar                       0            0.000000
sentiment                   262           90.034364
