In [23]:
%pip install selenium

import nltk
import requests
import os
from dotenv import load_dotenv
# Fetch the VADER lexicon used by the sentiment analyzer instantiated below.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Analyzer instance created at cell level.
sia = SentimentIntensityAnalyzer()
# Load variables from a .env file (a later cell reads "model_path" via os.getenv).
# Kept as the last expression: its return value is what this cell displays.
load_dotenv()

Note: you may need to restart the kernel to use updated packages.


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aadi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import sys
import os
sys.path.append(os.path.abspath(".."))  # add parent directory to sys.path


# --- Import the scraping helpers from the sibling WEB_SCRAPED package ---
# scra.py is expected to provide scrape_bbc_headlines; collect.py is expected to provide process_html_files and save_to_csv.
from WEB_SCRAPED.scra import scrape_bbc_headlines
from WEB_SCRAPED.collect import process_html_files, save_to_csv

if __name__ == "__main__":
    # Scraper configuration: where raw HTML is stored, which BBC sections to
    # fetch, and where the combined CSV is written.
    output_folder = "../WEB_SCRAPED/data"
    base_url = "https://www.bbc.com/"
    sections = ['business', 'news']
    output_file = "../WEB_SCRAPED/news_data.csv"

    # Step 1: Run the scraping scripts as functions
    scrape_bbc_headlines(base_url, sections, output_folder)
    df_scraped = process_html_files(output_folder)
    save_to_csv(df_scraped, output_file)

    # Step 2: Load news data from the generated CSV
    try:
        df = pd.read_csv(output_file)
    except FileNotFoundError:
        print(f"Error: The file '{output_file}' was not found. Please ensure the scraping scripts ran successfully.")
        # Re-raise so the cell stops with a traceback; calling exit() inside a
        # notebook would shut the kernel down instead.
        raise

    # Step 3: Sentiment Analysis
    sia = SentimentIntensityAnalyzer()

    def analyze_sentiment(news_list):
        """Score each text with VADER.

        Args:
            news_list: Iterable of strings to score.

        Returns:
            A list of dicts, one per input and in the same order, with keys
            "news" (the input text) and "sentiment_score" (the VADER compound
            score).
        """
        results = []
        for news in news_list:
            sentiment = sia.polarity_scores(news)
            results.append({
                "news": news,
                "sentiment_score": sentiment["compound"]
            })
        return results

    # Drop rows without a headline *before* scoring and keep that filtered
    # frame for display, so that position i in sentiment_results always
    # corresponds to row i of df_valid. (Indexing the unfiltered df by result
    # position pairs scores with the wrong rows as soon as one headline is NaN.)
    df_valid = df.dropna(subset=['Headline']).reset_index(drop=True)
    headlines_to_analyze = df_valid['Headline'].tolist()
    sentiment_results = analyze_sentiment(headlines_to_analyze)

    # Step 4: Display results including the sentiment
    for index, item in enumerate(sentiment_results):
        row = df_valid.iloc[index]
        headline = row['Headline']
        description = row['Description']
        time = row['Time']
        category = row['Category']
        sentiment_score = item['sentiment_score']

        print(f"## {headline}\n"
              f"Description: {description}\n"
              f"Time: {time}\n"
              f"Category: {category}\n"
              f"→> Sentiment Score: {sentiment_score}\n")

13 items found in business

25 items found in news

Successfully saved 38 records to ../WEB_SCRAPED/news_data.csv
## Is Piastri now favourite for drivers' title?
Description: BBC Sport F1 correspondent Andrew Benson answers your questions following the Bahrain Grand Prix.
Time: 3 hrs ago
Category: Formula 1
→> Sentiment Score: 0.0

## Watch from launch to landing: All-female crew travels into space
Description: The flight, carrying celebrities like Katy Perry and Gayle King, reached a height of about 60 miles (96.6km) above Earth.
Time: 18 hrs ago
Category: US & Canada
→> Sentiment Score: 0.0

## Trump blames Zelensky for starting war after massive Russian attack
Description: The US president said that you "don't start a war against someone 20 times your size" and hope people give you missiles.
Time: 6 mins ago
Category: World
→> Sentiment Score: -0.8658

## Couple arrested for breeding exotic cats in Spain
Description: The couple is suspected of selling exotic cats online, including p

In [32]:
# INSTALL ALL DEPENDENCIES
!pip install -q torch transformers sentencepiece accelerate llama-cpp-python huggingface-hub

# IMPORT LIBRARIES
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import torch

# Report whether the Metal (MPS) backend is usable and which device PyTorch
# would actually use; fall back to CPU in the report when MPS is unavailable.
mps_available = torch.backends.mps.is_available()
print(f"MPS (Apple GPU) available: {mps_available}")
print(f"PyTorch device: {torch.device('mps' if mps_available else 'cpu')}")

# The model file location is read from the environment variable "model_path".
model_path = os.getenv("model_path")
if not model_path:
    # Fail early with a clear message instead of handing None to Llama().
    raise RuntimeError(
        "Environment variable 'model_path' is not set; point it at the model file to load."
    )

llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window size
    n_threads=8,      # CPU threads to use
    n_gpu_layers=5,   # Layers to offload to Metal GPU (adjust based on RAM)
    n_batch=512       # Maximum number of prompt tokens processed per batch
)


def filter_news(news_list, model=None):
    """Return the articles the LLM flags as likely to influence stock markets.

    Each article (truncated to its first 500 characters) is sent to the chat
    model with a yes/no style question. An article is kept when the reply
    contains the standalone word "yes" (case-insensitive).

    Args:
        news_list: Iterable of article strings.
        model: Optional object exposing ``create_chat_completion``; defaults
            to the notebook-level ``llm``.

    Returns:
        A list with the subset of ``news_list`` judged relevant, in input order.
    """
    import re  # local import: keeps this cell independent of other cells' imports

    chat_model = llm if model is None else model
    # Word-boundary match so replies such as "No, yesterday ..." are not
    # mistaken for a "yes".
    yes_pattern = re.compile(r"\byes\b", re.IGNORECASE)
    relevant_articles = []

    for article in news_list:
        response = chat_model.create_chat_completion(
            messages=[{
                "role": "user",
                "content": f"Is the following news article likely to influence stock markets?If yes,state yes and explain how: {article[:500]}"
            }],
            max_tokens=10,
            temperature=0.1  # Low temperature for yes/no answers
        )

        # Guard against a missing/None message body.
        answer = response['choices'][0]['message']['content'] or ""
        if yes_pattern.search(answer):
            relevant_articles.append(article)

    return relevant_articles


print("\nFILTERING SAMPLE NEWS ARTICLES...")
# Use the first 20 analysed headlines. The list produced by the sentiment
# cell is named `sentiment_results`; `sentiment_result` is never defined in
# this notebook and raises NameError on a fresh kernel.
news_list = [item["news"] for item in sentiment_results[0:20]]
relevant = filter_news(news_list)
print(news_list)

print("\nRELEVANT ARTICLES:")
for i, article in enumerate(relevant, 1):
    print(f"{i}. {article}")

MPS (Apple GPU) available: True
PyTorch device: mps


llama_model_load_from_file_impl: using device Metal (Apple M2) - 4588 MiB free
llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/aadi/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.1-GGUF/snapshots/731a9fc8f06f5f5e2db8a0cf9d256197eb6e05d1/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:        


FILTERING SAMPLE NEWS ARTICLES...


llama_perf_context_print:        load time =    7175.54 ms
llama_perf_context_print: prompt eval time =    7175.33 ms /    50 tokens (  143.51 ms per token,     6.97 tokens per second)
llama_perf_context_print:        eval time =   56701.73 ms /     9 runs   ( 6300.19 ms per token,     0.16 tokens per second)
llama_perf_context_print:       total time =   63892.61 ms /    59 tokens
Llama.generate: 26 prefix-match hit, remaining 16 prompt tokens to eval
llama_perf_context_print:        load time =    7175.54 ms
llama_perf_context_print: prompt eval time =    7215.18 ms /    16 tokens (  450.95 ms per token,     2.22 tokens per second)
llama_perf_context_print:        eval time =   58596.92 ms /     9 runs   ( 6510.77 ms per token,     0.15 tokens per second)
llama_perf_context_print:       total time =   65833.47 ms /    25 tokens
Llama.generate: 26 prefix-match hit, remaining 19 prompt tokens to eval
llama_perf_context_print:        load time =    7175.54 ms
llama_perf_context_print: p

KeyboardInterrupt: 

In [None]:
def get_llm_relevance(news_text, model=None, max_tokens=128):
    """Ask the LLM whether an article is likely to influence stock markets.

    Args:
        news_text: The article text to assess.
        model: Optional object exposing ``create_chat_completion``; defaults
            to the notebook-level ``llm``.
        max_tokens: Upper bound on the length of the generated reply.

    Returns:
        The model's reply with surrounding whitespace removed (an empty
        string if the reply has no content).
    """
    chat_model = llm if model is None else model
    prompt = f"Is the following news article likely to influence stock markets?If yes,state yes and explain how.\n\nArticle:{news_text}"
    # llama_cpp.Llama has no `.invoke`; use the chat-completion API, as the
    # filter_news cell does.
    response = chat_model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    content = response["choices"][0]["message"]["content"] or ""
    return content.strip()

In [None]:
def filter(news_list):
    """Keep only the articles whose LLM relevance reply contains "yes".

    Note: this name shadows Python's built-in ``filter`` for the rest of the
    notebook session.

    Args:
        news_list: Iterable of article strings.

    Returns:
        A list of the articles, in input order, for which the lower-cased
        reply from ``get_llm_relevance`` contains the substring "yes".
    """
    return [
        candidate
        for candidate in news_list
        if "yes" in get_llm_relevance(candidate).lower()
    ]

In [None]:
import pandas as pd
from datetime import datetime

In [None]:
# Run the relevance filter on the first ten analysed headlines. The list built
# by the sentiment cell is `sentiment_results`; `sentiment_result` is never
# defined in this notebook.
news_list = [item["news"] for item in sentiment_results[0:10]]
imp=filter(news_list)
print(imp)

In [None]:
import pandas as pd
from datetime import datetime
import os
from statistics import mode

def process_and_save(input_csv="news_data.csv", output_csv="sentiment.csv"):
    """
    Process news data and save sentiment analysis results.

    Every non-null headline is scored with ``analyze_sentiment`` and sent to
    ``get_llm_relevance``. Headlines whose reply contains "yes" are appended
    to ``output_csv`` together with their own score, a label and the reply.

    Args:
        input_csv: Path to input CSV containing news data
        output_csv: Path to output CSV for sentiment results

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist.
        ValueError: If the input has no 'Headline' column.
    """
    # Load input data
    try:
        df = pd.read_csv(input_csv)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Input file {input_csv} not found") from exc

    # Ensure required columns exist
    if 'Headline' not in df.columns:
        raise ValueError("Input CSV must contain 'Headline' column")

    # Score every headline; each result carries its own text ("news") and
    # score ("sentiment_score"), so article and score cannot drift apart.
    news_list = df['Headline'].dropna().tolist()
    sentiment_results = analyze_sentiment(news_list)

    # Prepare output data
    data = []
    for result in sentiment_results:
        news = result["news"]
        score = result["sentiment_score"]

        # One LLM call per article: the reply is both the relevance verdict
        # and the stored explanation.
        relevance_explanation = get_llm_relevance(news)
        if "yes" not in relevance_explanation.lower():
            continue

        data.append({
            "timestamp": datetime.now().isoformat(),
            "article": news,
            "sentiment_score": score,
            # Scores at or below 0.2 (including a neutral 0.0) are labelled "Negative".
            "label": "Positive" if score > 0.2 else "Negative",
            "explanation": relevance_explanation
        })

    # Save results. Explicit columns keep the header well-formed even when no
    # article was relevant.
    columns = ["timestamp", "article", "sentiment_score", "label", "explanation"]
    output_df = pd.DataFrame(data, columns=columns)
    # Write the header when the file is new or exists but is still empty.
    write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

    try:
        output_df.to_csv(output_csv, mode='a', index=False, header=write_header)
        print(f"Successfully saved results to {output_csv}")
    except Exception as e:
        print(f"Failed to save results: {e}")
        raise

# Example: process the CSV written by the scraping step.
if __name__ == "__main__":
    process_and_save("../WEB_SCRAPED/news_data.csv")

In [None]:
# The scraping cell saves its CSV under ../WEB_SCRAPED/, so pass that path
# explicitly; the default "news_data.csv" is resolved against the notebook's
# working directory.
process_and_save(input_csv="../WEB_SCRAPED/news_data.csv")