# News Sentiment Analysis

## Import Libraries

In [1]:
# // TODO: TINGS
#     ✓ Find API for data collection
#     - GPU Selector
#     - Data Gathering
#     - Data Cleaning
#     - Feature generation
#     - Feature Engineering/selection
#     - Model Train
#     - Model Test
#     - Model Evaluation

In [2]:
# Common Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import glob
import re

# Cleaner output
from tqdm import tqdm

# Deep Learning Libraries
import torch

# Add the path to the API Scraper
## Project Path
project_path = "../"

## Add the path to API Scraper
sys.path.append(os.path.abspath(os.path.join(project_path, "lib")))

# Custom API Scraping Libraries
from scraper import get_cached_news_metadata, extract_text_from_url

## Fetch Data

In [3]:
def scrape_data(n_request: int = 10, before_date: str = "2025-12", path: str = ".", last_idx:int = 0) -> pd.DataFrame:
    """
    #### Description:
    Collect the articles of several news-metadata pages into one dataframe and
    attach a top-level `sentiment` value to every article.

    Args:
        n_request (int):
            Number of consecutive pages to read.

        before_date (str):
            Forwarded unchanged to `get_cached_news_metadata`.

        path (str):
            Forwarded unchanged to `get_cached_news_metadata`.

        last_idx (int):
            First page number to read; pages `last_idx` .. `last_idx + n_request - 1`
            are requested.

    Returns:
        pd.DataFrame:
            One row per article. Each row carries the article's own fields plus
            `sentiment`, which is the `sentiment_score` of the article's FIRST
            entity, or None when the article has no entities or the first
            entity has no score.

    NOTE(review): the frame displayed further down in the notebook shows the
    same uuids repeated on consecutive rows, so consecutive pages may return
    overlapping articles - confirm and consider de-duplicating on `uuid`.
    """
    articles = []

    for page in tqdm(range(last_idx, last_idx + n_request), desc="Fetching News Data...", unit="news"):
        metadata = get_cached_news_metadata(page=page, before_date=before_date, path=path)

        # A missing response or a null "data" field must not abort the whole
        # run, so both are treated as "no articles on this page".
        page_articles = (metadata or {}).get("data") or []

        for article in page_articles:
            # extract sentiment (first entity if exists)
            entities = article.get("entities") or []
            if entities and "sentiment_score" in entities[0]:
                sentiment = entities[0]["sentiment_score"]
            else:
                sentiment = None

            # Work on a shallow copy so the payload handed back by the cache
            # helper is not modified as a side effect of building the frame.
            record = dict(article)
            record["sentiment"] = sentiment

            articles.append(record)

    # final dataframe
    return pd.DataFrame(articles)

### Caching

In [10]:
# Arguments for the caching function
before_date = "2025-12"

# Directory and file in which the assembled dataset is cached as CSV
data_path = os.path.join(project_path,f"news_cache/{before_date}/csv/")
os.makedirs(data_path, exist_ok=True) # if the directory exist no need to make

cached_file = os.path.join(data_path, f"{before_date}_news_data.csv")

# True  -> rebuild the dataset and (over)write the cached CSV
# False -> load the cached CSV when it exists
get_new_data = True

# Cached page files for this period, matched as page-<number>.json
json_files = os.path.join(project_path, f"news_cache/{before_date}/json/page-*.json")
list_of_files = glob.glob(json_files)

## Extract page numbers
# True  -> continue from the highest page number found on disk
# False -> process the pages that are already on disk, starting at page 0
get_new_requests = False
if get_new_requests:
    page_nums = []
    for f in list_of_files:
        match = re.search(r"page-(\d+)\.json$", os.path.basename(f))
        if match:
            page_nums.append(int(match.group(1)))

    # default=0 keeps a first run (no page file on disk yet) from raising
    # ValueError on an empty list.
    # NOTE(review): this restarts AT the highest cached page rather than the
    # page after it - confirm that re-reading that page is intended.
    last_idx = max(page_nums, default=0)
    n_requests = 101 # marketaux api limit

else:
    last_idx = 0
    n_requests = len(list_of_files) #number of pages we cached to be processed

In [12]:
# Enable `progress_apply` so the per-URL text extraction reports its progress
tqdm.pandas(desc="Extracting News from URL's", unit="news")

cache_exists = os.path.exists(cached_file)

if cache_exists and not get_new_data:
    # Reuse the dataset written by an earlier run
    print("Loading cached dataset...")
    news_df = pd.read_csv(cached_file)
    print("Cached dataset loaded")

else:
    # Overwriting an existing cache and creating the first one run the same
    # pipeline; only the progress messages differ.
    if cache_exists:
        start_msg = "Overwriting old data and caching new data..."
        done_msg = "Done Overwriting old data and caching new data..."
    else:
        start_msg = "Creating and caching dataset..."
        done_msg = "Finished Caching"

    print(start_msg)

    # Gather the article metadata, then pull the article text for every URL
    news_df = scrape_data(n_request=n_requests, before_date=before_date, path=project_path, last_idx=last_idx)
    news_df["text"] = news_df["url"].progress_apply(extract_text_from_url)

    # Persist the result so later runs can take the cached branch
    news_df.to_csv(cached_file, index=False)
    print(done_msg)

Overwriting old data and caching new data...


Fetching News Data...: 100%|██████████| 194/194 [00:00<00:00, 766.46news/s]
Extracting News from URL's: 100%|██████████| 582/582 [13:23<00:00,  1.38s/news] 

Done Overwriting old data and caching new data...





### Fetch the Text from URL

## EDA

### View the Data

In [None]:
# Preview the first 10 rows of the assembled dataset
news_df.head(10)

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,entities,similar,sentiment,text
0,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0,vzphotos/iStock Editorial via Getty Images\n\n...
1,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0,To say that Adobe ( ADBE ) stock has not had a...
2,9084e5f1-75f5-4f15-aa3d-0676073b4aaf,Global week ahead: The start of a Santa Rally ...,,"STOXX 600, business news",And just like that... December is upon us. It'...,https://www.cnbc.com/2025/11/30/global-week-ah...,https://image.cnbcfm.com/api/v1/image/10823257...,en,2025-11-30T05:10:58.000000Z,cnbc.com,,"[{'symbol': 'M', 'name': 'Macy's, Inc.', 'exch...",[],0.6908,And just like that... December is upon us. It'...
3,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0,vzphotos/iStock Editorial via Getty Images\n\n...
4,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0,To say that Adobe ( ADBE ) stock has not had a...
5,9084e5f1-75f5-4f15-aa3d-0676073b4aaf,Global week ahead: The start of a Santa Rally ...,,"STOXX 600, business news",And just like that... December is upon us. It'...,https://www.cnbc.com/2025/11/30/global-week-ah...,https://image.cnbcfm.com/api/v1/image/10823257...,en,2025-11-30T05:10:58.000000Z,cnbc.com,,"[{'symbol': 'M', 'name': 'Macy's, Inc.', 'exch...",[],0.6908,And just like that... December is upon us. It'...
6,7d36a275-f3a3-44ea-8cbc-caa0d67749c4,Global Risk Monitor: Week in Review – Nov 28,KEY ISSUES Silver surged 13% for the week and ...,,KEY ISSUES\n\nSilver surged 13% for the week a...,https://global-macro-monitor.com/2025/11/29/gl...,https://global-macro-monitor.com/wp-content/up...,en,2025-11-30T05:07:50.000000Z,global-macro-monitor.com,,"[{'symbol': 'NVDA', 'name': 'NVIDIA Corporatio...",[],-0.3612,KEY ISSUES\n\nSilver surged 13% for the week a...
7,42ba634c-b7ce-491a-91c0-e2b1424af827,"Mcap boost: 7 of top-10 firms gain ₹96,201 cr;...",Market valuations of seven top firms rose by ₹...,,The combined market valuation of seven of the ...,https://www.thehindubusinessline.com/markets/m...,https://bl-i.thgim.com/public/incoming/ji6cih/...,en,2025-11-30T05:04:20.000000Z,thehindubusinessline.com,,"[{'symbol': 'SBKFF', 'name': 'State Bank of In...",[],0.0,The combined market valuation of seven of the ...
8,47000f09-22ab-4309-9411-c0c738327c25,QQQX: Tax-Efficient Dividends From The Nasdaq-...,Discover why Nuveen NASDAQ 100 Dynamic Overwri...,,"With the rise of covered call ETFs, it can be ...",https://seekingalpha.com/article/4848757-qqqx-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T04:23:00.000000Z,seekingalpha.com,,"[{'symbol': 'QQQX', 'name': 'Nuveen Nasdaq 100...",[],0.3715,"With the rise of covered call ETFs, it can be ..."
9,927ce408-c559-4f17-b673-37b0e9e301d7,Wall Street predicts rebound in Indian markets...,Wall Street giants anticipate a rebound for In...,"Indian markets rebound, Morgan Stanley India f...",Live Events\n\nBloomberg\n\nBloomberg\n\nRBI S...,https://economictimes.indiatimes.com/markets/s...,"https://img.etimg.com/thumb/msid-125668199,wid...",en,2025-11-30T04:14:43.000000Z,economictimes.indiatimes.com,,"[{'symbol': 'C', 'name': 'Citigroup Inc.', 'ex...",[{'uuid': 'd359704e-cc4f-4e18-a564-f8e189eae75...,-0.6369,It seems like you're already an ETPrime member...


In [None]:
# Display the frame itself (the notebook output below shows it truncated with "..." rows)
news_df

Unnamed: 0,uuid,title,description,keywords,snippet,url,image_url,language,published_at,source,relevance_score,entities,similar,sentiment,text
0,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0000,vzphotos/iStock Editorial via Getty Images\n\n...
1,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0000,To say that Adobe ( ADBE ) stock has not had a...
2,9084e5f1-75f5-4f15-aa3d-0676073b4aaf,Global week ahead: The start of a Santa Rally ...,,"STOXX 600, business news",And just like that... December is upon us. It'...,https://www.cnbc.com/2025/11/30/global-week-ah...,https://image.cnbcfm.com/api/v1/image/10823257...,en,2025-11-30T05:10:58.000000Z,cnbc.com,,"[{'symbol': 'M', 'name': 'Macy's, Inc.', 'exch...",[],0.6908,And just like that... December is upon us. It'...
3,487e6a88-d3c2-4ae1-8dc2-26af6b31d688,2025: The Year Of Alphabet (GOOG),No stock has seen a bigger jump recently than ...,,vzphotos/iStock Editorial via Getty Images\n\n...,https://seekingalpha.com/article/4848680-2025-...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:30:00.000000Z,seekingalpha.com,,"[{'symbol': 'GOOGL', 'name': 'Alphabet Inc.', ...",[],0.0000,vzphotos/iStock Editorial via Getty Images\n\n...
4,92b5c2bd-d324-4ae8-b115-2cfd95a8fa98,Why I'm Doubling Down On My Adobe Position (NA...,"Adobe's revenue is highly predictable, driven ...",,To say that Adobe ( ADBE ) stock has not had a...,https://seekingalpha.com/article/4848762-why-i...,https://static.seekingalpha.com/cdn/s3/uploads...,en,2025-11-30T05:25:01.000000Z,seekingalpha.com,,"[{'symbol': 'ADBE', 'name': 'Adobe Inc.', 'exc...",[],0.0000,To say that Adobe ( ADBE ) stock has not had a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,144b7211-1e1c-48c6-bae8-5d6398cb082e,"AYURCANN REPORTS Q1 2026 SALES OF $14,638,697","Toronto, Ontario, Nov. 28, 2025 (GLOBE NEWSWIR...","AYURCANN, REPORTS, Q1, 2026, SALES, OF, $14, 6...","Toronto, Ontario, Nov. 28, 2025 (GLOBE NEWSWIR...",https://www.manilatimes.net/2025/11/29/tmt-new...,https://www.manilatimes.net/manilatimes/upload...,en,2025-11-28T22:43:30.000000Z,manilatimes.net,,"[{'symbol': 'AYUR.CN', 'name': 'Ayurcann Holdi...",[],0.3291,
281,8f88685e-5da0-4689-aff5-044f40634581,Canadians Love GICs: Can They Do Better?,Explore the world of GICs and understand their...,,Editorial Note: Guaranteed Investment Certific...,https://www.looniedoctor.ca/2025/11/28/gic-alt...,https://i0.wp.com/www.looniedoctor.ca/wp-conte...,en,2025-11-28T22:30:00.000000Z,looniedoctor.ca,,"[{'symbol': 'BMMVF', 'name': 'BMO ASSET MANAGE...",[],0.2446,Editorial Note: Guaranteed Investment Certific...
282,a074478f-93e8-4208-9623-ab48a873164a,ERES Enters Into Agreements to Sell Three Prop...,"TORONTO, Nov. 28, 2025 (GLOBE NEWSWIRE) -- Eur...","ERES, Enters, Into, Agreements, to, Sell, Thre...","TORONTO, Nov. 28, 2025 (GLOBE NEWSWIRE) -- Eur...",https://www.manilatimes.net/2025/11/29/tmt-new...,https://www.manilatimes.net/manilatimes/upload...,en,2025-11-28T22:23:42.000000Z,manilatimes.net,,"[{'symbol': 'ERE-UN.TO', 'name': 'European Res...",[],0.2732,
283,13c02a78-fa81-46e7-a4f3-d19356c8ce13,Fairfax Announces Intention to Redeem Cumulati...,"TORONTO, Nov. 28, 2025 (GLOBE NEWSWIRE) -- Fai...","Fairfax, Announces, Intention, to, Redeem, Cum...","TORONTO, Nov. 28, 2025 (GLOBE NEWSWIRE) -- Fai...",https://www.manilatimes.net/2025/11/29/tmt-new...,https://www.manilatimes.net/manilatimes/upload...,en,2025-11-28T22:23:16.000000Z,manilatimes.net,,"[{'symbol': 'FRFFF', 'name': 'Fairfax Financia...",[{'uuid': 'ce51ee95-bf01-42de-a5e4-2fd428f9ea3...,0.9081,


### Check for Missing Values

In [None]:
# Per-column count of missing values and the percentage of rows they represent
is_na = (
    news_df.isna()
    .sum()
    .to_frame(name="Number_Missing")
    .assign(Missing_Percentage=lambda counts: counts["Number_Missing"] / len(news_df) * 100)
)
print(is_na)

                 Number_Missing  Missing_Percentage
uuid                          0                 0.0
title                         0                 0.0
description                   0                 0.0
keywords                      0                 0.0
snippet                       0                 0.0
url                           0                 0.0
image_url                     0                 0.0
language                      0                 0.0
published_at                  0                 0.0
source                        0                 0.0
relevance_score             285               100.0
entities                      0                 0.0
similar                       0                 0.0
sentiment                     0                 0.0
text                          0                 0.0


## Data Cleaning

In [None]:
# Chat / social-media shorthand and its expansion, used when expand_abbreviations=True
_SLANG_DICT = {
    "brb": "be right back",
    "afk": "away from keyboard",
    "gg": "good game",
    "ggwp": "good game well played",
    "lol": "laughing out loud",
    "idk": "I do not know",
    "imo": "in my opinion",
    "lmao": "laughing my ass off",
    "lmfao": "laughing fucking my ass off",
    "sus": "suspicious",
    "rekt": "wrecked",
    "noob": "new player",
    "af": "as hell",
    "wtf": "what the fuck",
    "wth": "what the heck",
    "omg": "oh my god",
    "ty": "thank you",
    "plz": "please",
    "pls": "please",
    "u": "you",
    "r": "are",
    "thx": "thanks",
    "fr": "for real",
    "til": "today i learned",
    "asap": "as soon as possible",
    "g2g": "got to go",
    "gtg": "got to go",
    "nc": "nice",
    "fyi": "for your information",
    "ttyl": "talk to you later",
    "fb": "facebook",
    "msg": "message",
    "hifw": "how i feel when",
    "tfw": "the feeling when",
    "mfw": "my face when",
    "mrw": "my reaction when",
    "ifyp": "i feel your pain",
    "tntl": "trying not to laugh",
    "jk": "just kidding",
    "idc": "i dont care",
    "ily": "i love you",
    "imu": "i miss you",
    "zzz": "sleeping, bored, tired",
    "ftw": "for the win",
    "tbh": "to be honest",
    "ftl": "for the loss",
    "smh": "shaking my head",
    "srsly": "seriously",
    "afaik": "as far as i know",
    "dm": "direct message",
    "tldr": "too long didnt read",
    "irl": "in real life",
    "gl": "goodluck",
    "ruok": "are you ok",
    "w": "win"
}

# Everything except ASCII letters, digits and underscores. Colons are NOT kept,
# so a demojized ":red_heart:" ends up as the single token "red_heart".
_NON_WORD_RE = re.compile(r"[^a-zA-Z0-9_]")
_WHITESPACE_RE = re.compile(r"\s+")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")

# Stop-word sets that were already loaded, keyed by language name
_STOP_WORDS_CACHE = {}


def clean_text(text: str, language: str='english', tokenize: bool = False, remove_stop_words: bool = False, stem_words: bool = False, remove_url: bool = False, remove_emojis: str = "convert", expand_abbreviations: bool=False):
    """
    #### Description:
    Normalise one piece of text for further analysis. The steps run in this order:
    expand contractions, trim and lowercase, optionally drop URLs, handle emojis,
    optionally expand slang, replace every character that is not an ASCII
    letter, digit or underscore with a space, tokenize, optionally remove stop
    words, optionally stem.

    Args:
        text (str):
            The text to clean. None and NaN (e.g. an empty cell after a CSV
            round-trip) are treated as empty text.

        language (str):
            Language name passed to `stopwords.words`; only used when
            `remove_stop_words` is True. Names listed by the original author
            (verify against the installed stop-word corpus): "catalan",
            "czech", "german", "greek", "english", "spanish", "finnish",
            "french", "hungarian", "icelandic", "italian", "latvian", "dutch",
            "polish", "portuguese", "romanian", "russian", "slovak",
            "slovenian", "swedish", "tamil".

        tokenize (bool):
            True = return tokenized data
            False = return untokenized data

        remove_stop_words (bool):
            True = remove stop words
            False = do not remove stop words

        stem_words (bool):
            True = get the base words (i.e. spraying -> spray)
            False = leave the words as is

        remove_url (bool):
            True = Remove the url in the text
            False = leave the text as is

        remove_emojis (str):
            "remove" = removes the emoji in text
            "convert" = converts emoji to text (e.g. ❤️ -> :red_heart:, which
                        becomes the token "red_heart" once the colons are stripped)
            "keep" = intended to keep the emoji as is.
            NOTE(review): the final special-character pass removes every
            non-ASCII character, so "keep" (and any unrecognised value)
            currently gives the same result as "remove".

        expand_abbreviations (bool):
            True = Expand abbreviations (e.g. brb -> "be right back")
            False = Keep the abbreviations as is

    Returns:
        list of str when `tokenize` is True, otherwise the tokens joined by
        single spaces into one str.

    NOTE(review): `PorterStemmer`, `stopwords`, `sent_tokenize`,
    `word_tokenize`, `contractions` and `emoji` are not imported in the import
    cell of the part of the notebook that was reviewed - confirm they are
    imported before this function is called.
    """

    # Missing text cannot be processed by the steps below; treat it as empty
    if text is None or (isinstance(text, float) and text != text):
        return [] if tokenize else ""

    def remove_special_characters(value):
        # keep letters, numbers and underscores only, then collapse whitespace
        value = _NON_WORD_RE.sub(' ', value)
        return _WHITESPACE_RE.sub(' ', value)

    # Clean process
    text = contractions.fix(text)                        # fixing contraction

    text = text.strip().lower()                          # lowercase + trim

    if remove_url:
        text = _URL_RE.sub('', text)                     # remove url

    # Handle emojis. Only "convert" needs work here: for "remove" the
    # special-character pass below already strips the emojis.
    emoji_mode = remove_emojis.lower()
    if emoji_mode == "convert":
        text = emoji.demojize(text, language="en")       # e.g. ❤️ -> :red_heart:

    if expand_abbreviations:
        # clean first so punctuation glued to a word does not hide the slang
        text = remove_special_characters(text)
        text = " ".join(_SLANG_DICT.get(word.lower(), word) for word in text.split())

    text = remove_special_characters(text)               # Remove other special characters (converted emojis keep their underscores)

    # tokenize words, sentence by sentence
    tokens = [word for sentence in sent_tokenize(text) for word in word_tokenize(sentence)]

    if remove_stop_words:
        # The corpus is read only when needed, and only once per language
        if language not in _STOP_WORDS_CACHE:
            _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
        stop_words = _STOP_WORDS_CACHE[language]
        tokens = [token for token in tokens if token not in stop_words]

    if stem_words:
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(token) for token in tokens]   # stemming

    if tokenize:
        return tokens                                    # return as tokens
    else:
        return " ".join(tokens)                          # return as string