In [None]:
%pip install selenium

import nltk
import requests
import os
from dotenv import load_dotenv
# Fetch the VADER lexicon resource before the analyzer is constructed below.
nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer

# Analyzer instance bound to the notebook-level name `sia`.
sia = SentimentIntensityAnalyzer()
# Load variables from a .env file into the process environment
# (presumably where `model_path`, read later via os.getenv, is configured — confirm).
load_dotenv()

In [None]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import sys
import os
sys.path.append(os.path.abspath(".."))  # add parent directory to sys.path


# --- Import the scraping helpers from the WEB_SCRAPED package ---
# scra.py must provide scrape_bbc_headlines; collect.py must provide process_html_files and save_to_csv
from WEB_SCRAPED.scra import scrape_bbc_headlines
from WEB_SCRAPED.collect import process_html_files, save_to_csv

# Pipeline cell: run the scrapers, reload the CSV they produce, score every
# article with VADER and print a per-article report.
if __name__ == "__main__":
    output_folder = "../WEB_SCRAPED/data"
    base_url = "https://www.bbc.com/"
    sections = ['business', 'news']
    output_file = "../WEB_SCRAPED/news_data.csv"

    # Step 1: Run the scraping scripts as functions
    scrape_bbc_headlines(base_url, sections, output_folder)
    df_scraped = process_html_files(output_folder)
    save_to_csv(df_scraped, output_file)

    # Step 2: Load news data from the generated CSV
    try:
        df = pd.read_csv(output_file)
    except FileNotFoundError as err:
        # Raise instead of calling exit(): inside a notebook, exit() shuts the
        # kernel down and discards all state rather than reporting the problem.
        raise FileNotFoundError(
            f"Error: The file '{output_file}' was not found. Please ensure the scraping scripts ran successfully."
        ) from err

    # Step 3: Sentiment Analysis
    sia = SentimentIntensityAnalyzer()

    def analyze_sentiment(news_list):
        """Score each text in ``news_list`` with the VADER analyzer ``sia``.

        Args:
            news_list: Iterable of strings to score.

        Returns:
            A list with one dict per input, in input order, holding the
            original text under "news" and its VADER compound score under
            "sentiment_score".
        """
        return [
            {"news": news, "sentiment_score": sia.polarity_scores(news)["compound"]}
            for news in news_list
        ]

    # Combine headline and description for more context; missing values become
    # empty strings so the concatenation never yields NaN.
    headlines_to_analyze = (df['Headline'].fillna('') + ". " + df['Description'].fillna('')).tolist()
    sentiment_results = analyze_sentiment(headlines_to_analyze)

    # Step 4: Display results including the sentiment.
    # sentiment_results holds exactly one entry per DataFrame row (it was built
    # from the rows above), so rows and results are walked in lockstep; no
    # bounds check is needed.
    report_rows = df[['Headline', 'Description', 'Time', 'Category']].itertuples(index=False, name=None)
    for (headline, description, published_at, category), item in zip(report_rows, sentiment_results):
        sentiment_score = item['sentiment_score']

        print(f"## {headline}\n"
              f"Description: {description}\n"
              f"Time: {published_at}\n"
              f"Category: {category}\n"
              f"→> Sentiment Score: {sentiment_score}\n")

In [None]:
# INSTALL ALL DEPENDENCIES
!pip install -q torch transformers sentencepiece accelerate llama-cpp-python huggingface-hub

# IMPORT LIBRARIES
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import torch

# Report whether the Apple Metal (MPS) backend is usable, and show the device
# that matches that answer instead of unconditionally printing "mps".
mps_available = torch.backends.mps.is_available()
print(f"MPS (Apple GPU) available: {mps_available}")
print(f"PyTorch device: {torch.device('mps' if mps_available else 'cpu')}")

# Path to the local model file, taken from the `model_path` environment variable.
model_path = os.getenv("model_path")
if not model_path:
    # Fail here with a clear message rather than handing None to Llama.
    raise ValueError("Environment variable 'model_path' is not set; it must point to the model file")

llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window size
    n_threads=8,      # CPU threads to use
    n_gpu_layers=5,   # Layers to offload to Metal GPU (adjust based on RAM)
    n_batch=512,      # Batch size used when processing the prompt
)


def filter_news(news_list):
    """Return the articles the LLM judges likely to influence stock markets.

    Each article (only its first 500 characters) is sent to the global ``llm``
    with a yes/no question. An article is kept when the reply contains "yes"
    as a whole word, in any letter case, so words such as "yesterday" or
    "eyes" do not count as a positive answer.

    Args:
        news_list: Iterable of article strings.

    Returns:
        A list of the kept articles, in their original order.
    """
    import re  # local import so the function does not rely on another cell

    relevant_articles = []

    for article in news_list:
        response = llm.create_chat_completion(
            messages=[{
                "role": "user",
                "content": f"Is the following news article likely to influence stock markets?If yes,state yes and explain how: {article[:500]}"
            }],
            # The reply is capped at 10 tokens, so only its opening words are generated.
            max_tokens=10,
            temperature=0.1  # Low temperature for yes/no answers
        )

        # Guard against a missing/None content field before matching.
        reply = response['choices'][0]['message']['content'] or ""
        if re.search(r"\byes\b", reply, flags=re.IGNORECASE):
            relevant_articles.append(article)

    return relevant_articles


# Run the LLM relevance filter over every article that was sentiment-scored.
print("\nFILTERING SAMPLE NEWS ARTICLES...")
news_list = [scored["news"] for scored in sentiment_results]
relevant = filter_news(news_list)
print(news_list)

# Numbered listing of the articles the model flagged.
print("\nRELEVANT ARTICLES:")
for rank, kept in enumerate(relevant, start=1):
    print(f"{rank}. {kept}")

# Persist the flagged articles as a single-column CSV.
relevant_output_file = "../notebooks/rsentiments.csv"
df_relevant = pd.DataFrame({"Relevant_Headline": relevant})
df_relevant.to_csv(relevant_output_file, index=False)

In [None]:
def filter(news_list):
    """Keep the articles whose relevance verdict contains "yes".

    Every article is passed to ``get_llm_relevance`` and retained when the
    lower-cased reply contains the substring "yes".

    Args:
        news_list: Iterable of article strings.

    Returns:
        A list of the retained articles, in input order.
    """
    # NOTE(review): this name shadows Python's built-in filter(); it is kept
    # because other cells call it. get_llm_relevance is not defined in this
    # part of the notebook and must be available before this runs.
    return [
        candidate
        for candidate in news_list
        if "yes" in get_llm_relevance(candidate).lower()
    ]

In [None]:
import pandas as pd
from datetime import datetime

In [None]:
# Try the relevance filter on the first ten scored articles.
# The list produced by the sentiment step is named `sentiment_results`
# (plural); the singular spelling raised NameError.
news_list = [item["news"] for item in sentiment_results[:10]]
imp = filter(news_list)
print(imp)

In [None]:
import pandas as pd
from datetime import datetime
import os
from statistics import mode

def process_and_save(input_csv="news_data.csv", output_csv="sentiment.csv"):
    """
    Process news data and save sentiment analysis results.

    Headlines are scored with ``analyze_sentiment``, narrowed to the relevant
    ones with ``filter``, and each relevant headline is appended to the output
    CSV with its own score, a label and the explanation returned by
    ``get_llm_relevance``.

    Args:
        input_csv: Path to input CSV containing news data (needs a 'Headline' column)
        output_csv: Path to output CSV for sentiment results. Rows are appended;
            the header is written only when the file is missing or empty.

    Raises:
        FileNotFoundError: If ``input_csv`` does not exist
        ValueError: If the input CSV has no 'Headline' column
    """
    # Load input data
    try:
        df = pd.read_csv(input_csv)
    except FileNotFoundError:
        raise FileNotFoundError(f"Input file {input_csv} not found")

    # Ensure required columns exist
    if 'Headline' not in df.columns:
        raise ValueError("Input CSV must contain 'Headline' column")

    # Process sentiment and filter relevant articles
    news_list = df['Headline'].dropna().tolist()
    sentiment_results = analyze_sentiment(news_list)
    relevant_articles = filter(news_list)

    # analyze_sentiment yields one {"news", "sentiment_score"} record per input.
    # Index the scores by article text so each relevant article is paired with
    # its OWN score; zipping the filtered list against the unfiltered results
    # would shift scores onto the wrong articles.
    score_by_article = {item["news"]: item["sentiment_score"] for item in sentiment_results}

    # Prepare output data
    data = []
    for news in relevant_articles:
        score = score_by_article[news]
        relevance_explanation = get_llm_relevance(news)

        data.append({
            "timestamp": datetime.now().isoformat(),
            "article": news,
            "sentiment_score": score,
            # Anything not above 0.2 (neutral scores included) is labelled Negative.
            "label": "Positive" if score > 0.2 else "Negative",
            "explanation": relevance_explanation
        })

    # Save results. Explicit columns keep the header well-formed even when no
    # article was relevant.
    output_columns = ["timestamp", "article", "sentiment_score", "label", "explanation"]
    output_df = pd.DataFrame(data, columns=output_columns)
    # An existing but empty file still needs its header row.
    write_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

    try:
        output_df.to_csv(output_csv, mode='a', index=False, header=write_header)
        print(f"Successfully saved results to {output_csv}")
    except Exception as e:
        print(f"Failed to save results: {e}")
        raise

# Example usage: run the pipeline on the CSV written by the scraping step.
if __name__ == "__main__":
    process_and_save("../WEB_SCRAPED/news_data.csv")

In [None]:
# NOTE(review): this call uses the defaults (input_csv="news_data.csv", output_csv="sentiment.csv"),
# not the "../WEB_SCRAPED/news_data.csv" path used elsewhere in this notebook — confirm this is intended.
# Each call appends rows to the output CSV.
process_and_save()