In [3]:
!pip install trafilatura



In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import trafilatura
import time
import re
import html
from datetime import datetime

In [5]:
url = "https://finviz.com/quote.ashx?t=NVDA"
headers = {
    "User-Agent": "Mozilla/5.0"
}

response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, "html.parser")

news_table = soup.find("table", class_="fullview-news-outer")

news = []
current_date = None

# Today's date in Finviz format (e.g., Jun-22-25)
today_finviz = datetime.today().strftime("%b-%d-%y")

for row in news_table.find_all("tr"):
    try:
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        timestamp_raw = cols[0].text.strip()
        headline_tag = cols[1].find("a")
        if not headline_tag:
            continue

        headline = headline_tag.text.strip()
        link = "https://finviz.com" + headline_tag["href"].strip()

        source_tag = cols[1].find("span", class_="nn")
        source = source_tag.text.strip("()") if source_tag else ""

        # Handle "Today" case
        if "Today" in timestamp_raw:
            time_part = timestamp_raw.replace("Today", "").strip()
            current_date = today_finviz
        elif "-" in timestamp_raw:  # New date row, e.g., Jun-21-25 07:19AM
            parts = timestamp_raw.split(" ")
            current_date = parts[0]
            time_part = parts[1] if len(parts) > 1 else ""
        else:  # Just time
            time_part = timestamp_raw

        # Combine date and time into full timestamp
        if current_date and time_part:
            try:
                dt_obj = datetime.strptime(f"{current_date} {time_part}", "%b-%d-%y %I:%M%p")
                full_timestamp = dt_obj.strftime("%Y-%m-%d %I:%M%p")
            except Exception as e:
                full_timestamp = f"{current_date} {time_part}"  # fallback
        else:
            full_timestamp = timestamp_raw

        news.append([full_timestamp, headline, source, link])
    except Exception as e:
        print("Skipping row due to error:", e)

df = pd.DataFrame(news, columns=["timestamp", "headline", "source", "url"])

In [6]:
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# Extract date from timestamp
df['date'] = df['timestamp'].dt.date

In [8]:
df

Unnamed: 0,timestamp,headline,source,url,date
0,2025-07-13 05:30:00,"Should You Invest $10,000 in Nvidia Stock Righ...",,https://finviz.com/news/102184/should-you-inve...,2025-07-13
1,2025-07-13 05:15:00,This Artificial Intelligence (AI) Stock Has Bi...,,https://finviz.com/news/102181/this-artificial...,2025-07-13
2,2025-07-13 05:04:00,The Median Retirement Savings for American Hou...,,https://finviz.com/news/102180/the-median-reti...,2025-07-13
3,2025-07-13 04:25:00,3 Artificial Intelligence (AI) Stocks That Are...,,https://finviz.com/news/102173/3-artificial-in...,2025-07-13
4,2025-07-13 03:51:00,Nvidia CEO Tops Buffett in Billionaire Index A...,,https://finviz.comhttps://finance.yahoo.com/ne...,2025-07-13
...,...,...,...,...,...
95,2025-07-11 07:04:00,Dimon Calls Out Market's Tariff Complacency. W...,,https://finviz.comhttps://www.barrons.com/arti...,2025-07-11
96,2025-07-11 07:00:00,Nvidia Is the First $4 Trillion Company. Here'...,,https://finviz.com/news/101092/nvidia-is-the-f...,2025-07-11
97,2025-07-11 06:20:00,Should SPDR S&P 500 ETF (SPY) Be on Your Inves...,,https://finviz.com/news/101081/should-spdr-sp-...,2025-07-11
98,2025-07-11 06:11:00,Bitcoin reaches new record high ahead of US Ho...,,https://finviz.comhttps://finance.yahoo.com/m/...,2025-07-11


In [9]:
# New list to store texts
article_texts = []

for url in df['url']:
    try:
        downloaded = trafilatura.fetch_url(url)
        if downloaded:
            article = trafilatura.extract(downloaded)
        else:
            article = ""
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        article = ""
    
    article_texts.append(article)
    time.sleep(1)  # be nice to websites



df['stock'] = 'nvidia'
df['article_text']= article_texts

In [10]:
df

Unnamed: 0,timestamp,headline,source,url,date,stock,article_text
0,2025-07-13 05:30:00,"Should You Invest $10,000 in Nvidia Stock Righ...",,https://finviz.com/news/102184/should-you-inve...,2025-07-13,nvidia,Key Points\nNvidia (NASDAQ: NVDA) has been an ...
1,2025-07-13 05:15:00,This Artificial Intelligence (AI) Stock Has Bi...,,https://finviz.com/news/102181/this-artificial...,2025-07-13,nvidia,|\n|||||\n|\nCoreWeave could be the biggest IP...
2,2025-07-13 05:04:00,The Median Retirement Savings for American Hou...,,https://finviz.com/news/102180/the-median-reti...,2025-07-13,nvidia,|\n|||||\n|\nAmericans aren't saving enough fo...
3,2025-07-13 04:25:00,3 Artificial Intelligence (AI) Stocks That Are...,,https://finviz.com/news/102173/3-artificial-in...,2025-07-13,nvidia,|\n|||||\n|\nGoogle parent Alphabet looks like...
4,2025-07-13 03:51:00,Nvidia CEO Tops Buffett in Billionaire Index A...,,https://finviz.comhttps://finance.yahoo.com/ne...,2025-07-13,nvidia,
...,...,...,...,...,...,...,...
95,2025-07-11 07:04:00,Dimon Calls Out Market's Tariff Complacency. W...,,https://finviz.comhttps://www.barrons.com/arti...,2025-07-11,nvidia,
96,2025-07-11 07:00:00,Nvidia Is the First $4 Trillion Company. Here'...,,https://finviz.com/news/101092/nvidia-is-the-f...,2025-07-11,nvidia,Key Points\nNvidia's market cap just hit $4 tr...
97,2025-07-11 06:20:00,Should SPDR S&P 500 ETF (SPY) Be on Your Inves...,,https://finviz.com/news/101081/should-spdr-sp-...,2025-07-11,nvidia,Looking for broad exposure to the Large Cap Bl...
98,2025-07-11 06:11:00,Bitcoin reaches new record high ahead of US Ho...,,https://finviz.comhttps://finance.yahoo.com/m/...,2025-07-11,nvidia,


In [11]:
def clean_article_text(text: str) -> str:
    if pd.isna(text) or text.strip() == "":
        return ""

    # Unescape HTML characters
    text = html.unescape(text)

    # Remove pipes and formatting symbols
    text = re.sub(r'[|]+', ' ', text)  # Convert multiple | to space

    # Remove HTML tags
    text = re.sub(r'<.*?>', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', ' ', text)

    # Remove escape sequences and non-ascii characters
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Replace special quotes/dashes with standard characters
    text = text.replace('“', '"').replace('”', '"').replace("’", "'").replace('–', '-').replace('—', '-')

    # Remove bullet points and odd formatting marks
    text = re.sub(r'[\*\•\·\▪\◆\▶\-]', ' ', text)

    # Collapse all whitespace (newlines, tabs, multiple spaces)
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Apply to your DataFrame
df['article_text_clean'] = df['article_text'].apply(clean_article_text)

In [12]:
# Install dependencies (if not already installed)
!pip install transformers



In [13]:
df

Unnamed: 0,timestamp,headline,source,url,date,stock,article_text,article_text_clean
0,2025-07-13 05:30:00,"Should You Invest $10,000 in Nvidia Stock Righ...",,https://finviz.com/news/102184/should-you-inve...,2025-07-13,nvidia,Key Points\nNvidia (NASDAQ: NVDA) has been an ...,Key Points Nvidia (NASDAQ: NVDA) has been an e...
1,2025-07-13 05:15:00,This Artificial Intelligence (AI) Stock Has Bi...,,https://finviz.com/news/102181/this-artificial...,2025-07-13,nvidia,|\n|||||\n|\nCoreWeave could be the biggest IP...,CoreWeave could be the biggest IPO of 2025. Af...
2,2025-07-13 05:04:00,The Median Retirement Savings for American Hou...,,https://finviz.com/news/102180/the-median-reti...,2025-07-13,nvidia,|\n|||||\n|\nAmericans aren't saving enough fo...,Americans aren't saving enough for retirement....
3,2025-07-13 04:25:00,3 Artificial Intelligence (AI) Stocks That Are...,,https://finviz.com/news/102173/3-artificial-in...,2025-07-13,nvidia,|\n|||||\n|\nGoogle parent Alphabet looks like...,Google parent Alphabet looks like an absolute ...
4,2025-07-13 03:51:00,Nvidia CEO Tops Buffett in Billionaire Index A...,,https://finviz.comhttps://finance.yahoo.com/ne...,2025-07-13,nvidia,,
...,...,...,...,...,...,...,...,...
95,2025-07-11 07:04:00,Dimon Calls Out Market's Tariff Complacency. W...,,https://finviz.comhttps://www.barrons.com/arti...,2025-07-11,nvidia,,
96,2025-07-11 07:00:00,Nvidia Is the First $4 Trillion Company. Here'...,,https://finviz.com/news/101092/nvidia-is-the-f...,2025-07-11,nvidia,Key Points\nNvidia's market cap just hit $4 tr...,Key Points Nvidia's market cap just hit $4 tri...
97,2025-07-11 06:20:00,Should SPDR S&P 500 ETF (SPY) Be on Your Inves...,,https://finviz.com/news/101081/should-spdr-sp-...,2025-07-11,nvidia,Looking for broad exposure to the Large Cap Bl...,Looking for broad exposure to the Large Cap Bl...
98,2025-07-11 06:11:00,Bitcoin reaches new record high ahead of US Ho...,,https://finviz.comhttps://finance.yahoo.com/m/...,2025-07-11,nvidia,,


In [14]:
import os
def save_or_append_csv(df_new, path):
    """
    Appends df_new to an existing CSV file if it exists, otherwise creates a new one.
    Removes duplicates based on all columns.
    """
    if os.path.exists(path):
        df_existing = pd.read_csv(path)
        df_combined = pd.concat([df_existing, df_new], ignore_index=True)
        df_combined.drop_duplicates(inplace=True)
        df_combined.to_csv(path, index=False)
        print(f"Appended and saved to: {path}")
    else:
        df_new.to_csv(path, index=False)
        print(f"Created new file: {path}")

In [15]:
path_raw = "/kaggle/working/finviz_raw_articles.csv"
save_or_append_csv(df, path_raw)

Created new file: /kaggle/working/finviz_raw_articles.csv


In [16]:
df = pd.read_csv("/kaggle/working/finviz_raw_articles.csv")

In [17]:
# Step 1: Import required libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import torch

# Step 2: Load FinBERT model & tokenizer
model_name = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Step 3: Create sentiment analysis pipeline
finbert_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, truncation=True)

# Step 4: Define a function to apply FinBERT to text
def get_finbert_sentiment(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return "NEUTRAL"  # default for empty or NaN
    result = finbert_pipeline(text[:512])[0]  # truncate to 512 tokens
    return result['label']

# Step 5: Apply to your DataFrame
df['sentiment_finbert'] = df['article_text_clean'].apply(get_finbert_sentiment)

2025-07-13 10:36:40.712084: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752403000.963184      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752403001.036941      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Device set to use cpu
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

In [18]:
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,timestamp,headline,source,url,date,stock,article_text,article_text_clean,sentiment_finbert
0,2025-07-13 05:30:00,"Should You Invest $10,000 in Nvidia Stock Righ...",,https://finviz.com/news/102184/should-you-inve...,2025-07-13,nvidia,Key Points\nNvidia (NASDAQ: NVDA) has been an ...,Key Points Nvidia (NASDAQ: NVDA) has been an e...,Positive
1,2025-07-13 05:15:00,This Artificial Intelligence (AI) Stock Has Bi...,,https://finviz.com/news/102181/this-artificial...,2025-07-13,nvidia,|\n|||||\n|\nCoreWeave could be the biggest IP...,CoreWeave could be the biggest IPO of 2025. Af...,Negative
2,2025-07-13 05:04:00,The Median Retirement Savings for American Hou...,,https://finviz.com/news/102180/the-median-reti...,2025-07-13,nvidia,|\n|||||\n|\nAmericans aren't saving enough fo...,Americans aren't saving enough for retirement....,Positive
3,2025-07-13 04:25:00,3 Artificial Intelligence (AI) Stocks That Are...,,https://finviz.com/news/102173/3-artificial-in...,2025-07-13,nvidia,|\n|||||\n|\nGoogle parent Alphabet looks like...,Google parent Alphabet looks like an absolute ...,Positive
4,2025-07-13 03:51:00,Nvidia CEO Tops Buffett in Billionaire Index A...,,https://finviz.comhttps://finance.yahoo.com/ne...,2025-07-13,nvidia,,,NEUTRAL
...,...,...,...,...,...,...,...,...,...
95,2025-07-11 07:04:00,Dimon Calls Out Market's Tariff Complacency. W...,,https://finviz.comhttps://www.barrons.com/arti...,2025-07-11,nvidia,,,NEUTRAL
96,2025-07-11 07:00:00,Nvidia Is the First $4 Trillion Company. Here'...,,https://finviz.com/news/101092/nvidia-is-the-f...,2025-07-11,nvidia,Key Points\nNvidia's market cap just hit $4 tr...,Key Points Nvidia's market cap just hit $4 tri...,Positive
97,2025-07-11 06:20:00,Should SPDR S&P 500 ETF (SPY) Be on Your Inves...,,https://finviz.com/news/101081/should-spdr-sp-...,2025-07-11,nvidia,Looking for broad exposure to the Large Cap Bl...,Looking for broad exposure to the Large Cap Bl...,Neutral
98,2025-07-11 06:11:00,Bitcoin reaches new record high ahead of US Ho...,,https://finviz.comhttps://finance.yahoo.com/m/...,2025-07-11,nvidia,,,NEUTRAL


In [19]:
sentiment_map = {
    'Positive': 1,
    'POSITIVE':1,
    'NEUTRAL': 0,
    'Neutral': 0,
    'NEGATIVE': -1,
    'Negative': -1
}

df['sentiment_score'] = df['sentiment_finbert'].map(sentiment_map)

In [20]:
df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,timestamp,headline,source,url,date,stock,article_text,article_text_clean,sentiment_finbert,sentiment_score
0,2025-07-13 05:30:00,"Should You Invest $10,000 in Nvidia Stock Righ...",,https://finviz.com/news/102184/should-you-inve...,2025-07-13,nvidia,Key Points\nNvidia (NASDAQ: NVDA) has been an ...,Key Points Nvidia (NASDAQ: NVDA) has been an e...,Positive,1
1,2025-07-13 05:15:00,This Artificial Intelligence (AI) Stock Has Bi...,,https://finviz.com/news/102181/this-artificial...,2025-07-13,nvidia,|\n|||||\n|\nCoreWeave could be the biggest IP...,CoreWeave could be the biggest IPO of 2025. Af...,Negative,-1
2,2025-07-13 05:04:00,The Median Retirement Savings for American Hou...,,https://finviz.com/news/102180/the-median-reti...,2025-07-13,nvidia,|\n|||||\n|\nAmericans aren't saving enough fo...,Americans aren't saving enough for retirement....,Positive,1
3,2025-07-13 04:25:00,3 Artificial Intelligence (AI) Stocks That Are...,,https://finviz.com/news/102173/3-artificial-in...,2025-07-13,nvidia,|\n|||||\n|\nGoogle parent Alphabet looks like...,Google parent Alphabet looks like an absolute ...,Positive,1
4,2025-07-13 03:51:00,Nvidia CEO Tops Buffett in Billionaire Index A...,,https://finviz.comhttps://finance.yahoo.com/ne...,2025-07-13,nvidia,,,NEUTRAL,0


In [21]:
# Group by date and stock, then calculate mean sentiment score per day
daily_sentiment = df.groupby(['date', 'stock'])['sentiment_score'].mean().reset_index()

# Rename column for clarity
daily_sentiment.rename(columns={'sentiment_score': 'daily_sentiment_score'}, inplace=True)

# Display
daily_sentiment

Unnamed: 0,date,stock,daily_sentiment_score
0,2025-07-11,nvidia,0.058824
1,2025-07-12,nvidia,0.48
2,2025-07-13,nvidia,0.571429


In [None]:
#daily sentiment has been positive as per the Data from 2nd to 3rd , 3rd to 4th and 4th to 5th

In [22]:
!pip install alpha_vantage

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting alpha_vantage
  Downloading alpha_vantage-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading alpha_vantage-3.0.0-py3-none-any.whl (35 kB)
Installing collected packages: alpha_vantage
Successfully installed alpha_vantage-3.0.0


In [27]:
from alpha_vantage.timeseries import TimeSeries
import pandas as pd
from datetime import datetime

api_key = 'I5MZAM9S71PAA6YL'  # Replace with your real key

ts = TimeSeries(key=api_key, output_format='pandas')
stock_price, meta = ts.get_daily(symbol='NVDA', outputsize='compact')

# Clean and sort
stock_price = stock_price.sort_index()  # Oldest to newest
stock_price = stock_price[stock_price.index <= pd.Timestamp.today()]  # Exclude today

In [28]:
stock_price.reset_index(inplace= True)

In [29]:
stock_price

Unnamed: 0,date,1. open,2. high,3. low,4. close,5. volume
0,2025-02-18,141.270,143.44,137.9250,139.40,219176627.0
1,2025-02-19,139.510,141.36,137.2200,139.23,167536006.0
2,2025-02-20,140.030,140.66,136.7901,140.11,143903583.0
3,2025-02-21,140.040,141.46,134.0300,134.43,228217585.0
4,2025-02-24,136.560,138.59,130.0800,130.28,251381137.0
...,...,...,...,...,...,...
95,2025-07-07,158.200,159.31,157.3420,158.24,140138975.0
96,2025-07-08,159.330,160.22,158.3900,160.00,138133025.0
97,2025-07-09,161.220,164.42,161.1600,162.88,183656443.0
98,2025-07-10,164.320,164.50,161.6100,164.10,167704075.0


In [30]:
daily_sentiment

Unnamed: 0,date,stock,daily_sentiment_score
0,2025-07-11,nvidia,0.058824
1,2025-07-12,nvidia,0.48
2,2025-07-13,nvidia,0.571429


In [58]:
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

# Shift sentiment backward — so sentiment of 2025-07-04 will be applied to 2025-07-03's price
daily_sentiment['date'] = daily_sentiment['date'] - pd.Timedelta(days=1)

# Now merge
merged_df = pd.merge(stock_price, daily_sentiment[['date', 'daily_sentiment_score']], on='date', how='left')

# Fill missing sentiment (if needed)
merged_df['daily_sentiment_score'] = merged_df['daily_sentiment_score'].fillna(0.0)

In [59]:
merged_df

Unnamed: 0,date,1. open,2. high,3. low,4. close,5. volume,daily_sentiment_score
0,2025-02-18,141.270,143.44,137.9250,139.40,219176627.0,0.000000
1,2025-02-19,139.510,141.36,137.2200,139.23,167536006.0,0.000000
2,2025-02-20,140.030,140.66,136.7901,140.11,143903583.0,0.000000
3,2025-02-21,140.040,141.46,134.0300,134.43,228217585.0,0.000000
4,2025-02-24,136.560,138.59,130.0800,130.28,251381137.0,0.000000
...,...,...,...,...,...,...,...
95,2025-07-07,158.200,159.31,157.3420,158.24,140138975.0,0.000000
96,2025-07-08,159.330,160.22,158.3900,160.00,138133025.0,0.000000
97,2025-07-09,161.220,164.42,161.1600,162.88,183656443.0,0.000000
98,2025-07-10,164.320,164.50,161.6100,164.10,167704075.0,0.116279
