In [5]:
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.models import load_model
from transformers import TFBertModel
import tensorflow_addons as tfa
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from newsapi import NewsApiClient
from datetime import date, timedelta
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.cell.cell import Cell
from openpyxl.styles import Font, Color, PatternFill
from openpyxl.utils import get_column_letter
from openpyxl.drawing.image import Image
import os
import re


def get_article_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Modify this part of the code based on the HTML structure of the article
        paragraphs = soup.find_all('p')

        # Concatenate the text of the article paragraphs
        article_text = '\n'.join([paragraph.text.strip() for paragraph in paragraphs])

        return article_text
    except requests.exceptions.HTTPError as e:
        if e.response.status_code == 404:
            print(f"Article not found: {url}")
        else:
            print(f"HTTP Error: {e.response.status_code} - {e.response.reason}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


def save_text_to_file(text, file_name):
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(text)


def create_excel_with_links(df, file_path):
    wb = Workbook()
    ws = wb.active

    # Add the DataFrame data to the Excel file
    for row in dataframe_to_rows(df, index=False, header=True):
        new_row = []
        for cell in row:
            if isinstance(cell, tf.Tensor):
                new_row.append(float(cell.numpy()))
            else:
                new_row.append(cell)
        ws.append(new_row)

    # Make the links clickable in the "Link" column
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=3, max_col=3):
        for cell in row:
            cell.hyperlink = cell.value
            cell.style = "Hyperlink"

    # Adjust column widths
    for column in ws.columns:
        max_length = 0
        column = [cell for cell in column]
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(cell.value)
            except:
                pass
        adjusted_width = (max_length + 2) * 1.2
        ws.column_dimensions[get_column_letter(column[0].column)].width = adjusted_width

    # Save the Excel file
    wb.save(file_path)


def search_articles(keyword, num_articles=10):
    # Initialize the API client with your API key
    newsapi = NewsApiClient(api_key='736bb81752694ad390f723084ed3279d')
    # Define the period to search (from 30 days before the current date to the current date)
    end_date = date.today()
    start_date = end_date - timedelta(days=30)

    # Retrieve the specified number of articles with the specified keyword
    articles = newsapi.get_everything(q=keyword, language='en', from_param=start_date, to=end_date, page_size=num_articles)

    total_articles = min(articles['totalResults'], len(articles['articles']))
    print(f"Total articles found: {total_articles}")

    # Extract article information
    article_data = []
    sentiment_labels = []
    counter = 0
    for i, article in enumerate(articles['articles']):
        title = article['title']
        url = article['url']
        published_date = article['publishedAt']
        source = article['source']['name']

        # Check if the article source is in the allowed websites list
        if any(allowed_website in url for allowed_website in allowed_websites):
            article_text = get_article_text(url)
            if article_text is not None:
                counter += 1
                print(f"Processing article {counter} of {total_articles}")
                article_data.append({'Date': published_date, 'Title': title, 'Link': url, 'Source': source})

                # Perform sentiment analysis on the article text
                sentiment_label, sentiment_probability = perform_sentiment_analysis(article_text)

                # Add sentiment analysis results to the article data
                article_data[-1]['Label'] = sentiment_label
                article_data[-1]['Probability'] = sentiment_probability

                sentiment_labels.append(sentiment_label)

                # Save the article text in a .txt file
                folder_name = keyword
                os.makedirs(folder_name, exist_ok=True)
                file_name = os.path.join(folder_name, f"article_{counter}.txt")
                save_text_to_file(article_text, file_name)

    # Create a DataFrame from the article data
    #df = pd.DataFrame(article_data)
    df = pd.DataFrame(article_data)
    df.to_pickle("NVIDIA_American.pkl")

    # Convert probabilities to percentages
    df['Probability'] = df['Probability'].apply(lambda x: f"{x * 100:.2f}%")

    # Display the DataFrame with sentiment analysis results
    print(df)


    # Save the DataFrame to an Excel file with clickable links
    excel_file = f"{keyword}_articles.xlsx"
    folder_name = keyword
    os.makedirs(folder_name, exist_ok=True)
    excel_filepath = os.path.join(folder_name, excel_file)
    create_excel_with_links(df, excel_filepath)


def perform_sentiment_analysis(text):
    # Preprocess the text
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
        return_token_type_ids=True
    )
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Make a prediction with the loaded model
    outputs = loaded_model([input_ids, token_type_ids, attention_mask])

    # Extract the predicted probabilities from the output tensors
    probs = outputs[0]

    # Convert probabilities to class labels (positive or negative)
    class_labels = ['Negative', 'Positive']
    predicted_class_index = np.argmax(probs)
    predicted_class_label = class_labels[predicted_class_index]

    # Get the predicted probability for the predicted class
    predicted_class_probability = probs[predicted_class_index]

    return predicted_class_label, predicted_class_probability


# Load the BERT model
bert_type = 'bert-base-cased'
bert = TFBertModel.from_pretrained(bert_type)
tokenizer = BertTokenizer.from_pretrained(bert_type)

# Load the sentiment analysis model including the TFBertModel object in custom_objects
def custom_objects():
    return {"F1Score": tfa.metrics.F1Score, "TFBertModel": TFBertModel}


loaded_model = tf.keras.models.load_model('modele_bert.h5', custom_objects=custom_objects())

# Specify the number of articles to scrape
num_articles = 20

# Define the list of allowed websites
allowed_websites = [
    'bloomberg.com', 'cnbc.com', 'reuters.com', 'wsj.com', 'finance.yahoo.com',
    'marketwatch.com', 'forbes.com', 'ft.com', "barrons.com", 'investing.com'
]


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [6]:
search_articles("NVIDIA", 100)

Total articles found: 100
Processing article 1 of 100
Processing article 2 of 100
Processing article 3 of 100
Processing article 4 of 100
Article not found: https://finance.yahoo.com/news/1-solid-ai-stock-thats-111500631.html/
Article not found: https://finance.yahoo.com/news/move-over-nvidia-billionaires-selling-083300873.html/
HTTP Error: 502 - Next Hop Connection Failed
Article not found: https://finance.yahoo.com/news/once-decade-investment-opportunity-1-123000805.html/
Article not found: https://finance.yahoo.com/news/billionaires-warren-buffett-david-tepper-090600540.html/
                   Date                                              Title  \
0  2024-09-09T18:45:00Z  NVIDIA Wipes Out $400B in Value: Buy, Hold, or...   
1  2024-09-06T10:09:29Z  BofA’s Hartnett says US semiconductor stocks t...   
2  2024-09-16T21:14:15Z  NVIDIA Corporation (NVDA) a Good Big Tech Stoc...   
3  2024-09-10T13:04:57Z  Exclusive-Data center operator Switch weighs I...   

                       

In [7]:
df = pd.read_pickle("NVIDIA_American.pkl")

In [8]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Probability
0,2024-09-09T18:45:00Z,"NVIDIA Wipes Out $400B in Value: Buy, Hold, or...",https://finance.yahoo.com/news/nvidia-wipes-40...,Yahoo Entertainment,Negative,"tf.Tensor(0.97275037, shape=(), dtype=float32)"
1,2024-09-06T10:09:29Z,BofA’s Hartnett says US semiconductor stocks t...,https://finance.yahoo.com/news/bofa-hartnett-s...,Yahoo Entertainment,Negative,"tf.Tensor(0.73628885, shape=(), dtype=float32)"
2,2024-09-16T21:14:15Z,NVIDIA Corporation (NVDA) a Good Big Tech Stoc...,https://finance.yahoo.com/news/nvidia-corporat...,Yahoo Entertainment,Positive,"tf.Tensor(0.9384008, shape=(), dtype=float32)"
3,2024-09-10T13:04:57Z,Exclusive-Data center operator Switch weighs I...,https://finance.yahoo.com/news/exclusive-data-...,Yahoo Entertainment,Positive,"tf.Tensor(0.88540053, shape=(), dtype=float32)"


In [9]:
# Convert the EagerTensors to Python floats
df["Probability"] = df["Probability"].apply(lambda x: float(x))

In [10]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Probability
0,2024-09-09T18:45:00Z,"NVIDIA Wipes Out $400B in Value: Buy, Hold, or...",https://finance.yahoo.com/news/nvidia-wipes-40...,Yahoo Entertainment,Negative,0.97275
1,2024-09-06T10:09:29Z,BofA’s Hartnett says US semiconductor stocks t...,https://finance.yahoo.com/news/bofa-hartnett-s...,Yahoo Entertainment,Negative,0.736289
2,2024-09-16T21:14:15Z,NVIDIA Corporation (NVDA) a Good Big Tech Stoc...,https://finance.yahoo.com/news/nvidia-corporat...,Yahoo Entertainment,Positive,0.938401
3,2024-09-10T13:04:57Z,Exclusive-Data center operator Switch weighs I...,https://finance.yahoo.com/news/exclusive-data-...,Yahoo Entertainment,Positive,0.885401


In [11]:
df.rename(columns={'Probability': 'Categorical_Accuracy'}, inplace=True)

In [12]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Categorical_Accuracy
0,2024-09-09T18:45:00Z,"NVIDIA Wipes Out $400B in Value: Buy, Hold, or...",https://finance.yahoo.com/news/nvidia-wipes-40...,Yahoo Entertainment,Negative,0.97275
1,2024-09-06T10:09:29Z,BofA’s Hartnett says US semiconductor stocks t...,https://finance.yahoo.com/news/bofa-hartnett-s...,Yahoo Entertainment,Negative,0.736289
2,2024-09-16T21:14:15Z,NVIDIA Corporation (NVDA) a Good Big Tech Stoc...,https://finance.yahoo.com/news/nvidia-corporat...,Yahoo Entertainment,Positive,0.938401
3,2024-09-10T13:04:57Z,Exclusive-Data center operator Switch weighs I...,https://finance.yahoo.com/news/exclusive-data-...,Yahoo Entertainment,Positive,0.885401


In [13]:
df['Categorical_Accuracy'] = df['Categorical_Accuracy'] * 100

In [14]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Categorical_Accuracy
0,2024-09-09T18:45:00Z,"NVIDIA Wipes Out $400B in Value: Buy, Hold, or...",https://finance.yahoo.com/news/nvidia-wipes-40...,Yahoo Entertainment,Negative,97.275037
1,2024-09-06T10:09:29Z,BofA’s Hartnett says US semiconductor stocks t...,https://finance.yahoo.com/news/bofa-hartnett-s...,Yahoo Entertainment,Negative,73.628885
2,2024-09-16T21:14:15Z,NVIDIA Corporation (NVDA) a Good Big Tech Stoc...,https://finance.yahoo.com/news/nvidia-corporat...,Yahoo Entertainment,Positive,93.84008
3,2024-09-10T13:04:57Z,Exclusive-Data center operator Switch weighs I...,https://finance.yahoo.com/news/exclusive-data-...,Yahoo Entertainment,Positive,88.540053


In [15]:
df.to_pickle("Final_NVIDIA_American.pkl")