In [2]:
pip install googletrans==4.0.0-rc1


Collecting httpx==0.13.3 (from googletrans==4.0.0-rc1)
  Obtaining dependency information for httpx==0.13.3 from https://files.pythonhosted.org/packages/54/b4/698b284c6aed4d7c2b4fe3ba5df1fcf6093612423797e76fbb24890dd22f/httpx-0.13.3-py3-none-any.whl.metadata
  Downloading httpx-0.13.3-py3-none-any.whl.metadata (25 kB)
Collecting httpcore==0.9.* (from httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for httpcore==0.9.* from https://files.pythonhosted.org/packages/dd/d5/e4ff9318693ac6101a2095e580908b591838c6f33df8d3ee8dd953ba96a8/httpcore-0.9.1-py3-none-any.whl.metadata
  Downloading httpcore-0.9.1-py3-none-any.whl.metadata (4.6 kB)
Collecting h11<0.10,>=0.8 (from httpcore==0.9.*->httpx==0.13.3->googletrans==4.0.0-rc1)
  Obtaining dependency information for h11<0.10,>=0.8 from https://files.pythonhosted.org/packages/5a/fd/3dad730b0f95e78aeeb742f96fa7bbecbdd56a58e405d3da440d5bfb90c6/h11-0.9.0-py2.py3-none-any.whl.metadata
  Downloading h11-0.9.0-py2.py3-none-any.w

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
python-telegram-bot 21.0.1 requires httpx~=0.27, but you have httpx 0.13.3 which is incompatible.


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from newsapi import NewsApiClient
from datetime import date, timedelta
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import os
from googletrans import Translator
from openpyxl.utils import get_column_letter
import tensorflow as tf
from transformers import BertTokenizer
from tensorflow.keras.models import load_model
from transformers import TFBertModel
import tensorflow_addons as tfa
import numpy as np

def translate_to_english(text):
    translator = Translator()
    max_chunk_size = 450  # Use a value slightly less than 500 to stay within the limit

    # Split the text into chunks of maximum allowed length
    chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]

    # Translate each chunk and join the translations
    translated_chunks = [translator.translate(chunk, src='fr', dest='en').text for chunk in chunks]
    translated_text = " ".join(translated_chunks)

    return translated_text


def get_articles_from_api(api_key, keyword, sources, from_date, to_date):
    base_url = "https://newsapi.org/v2/everything"
    params = {
        "q": keyword,
        "language": "fr",
        "sources": ",".join(sources),
        "from": from_date.isoformat(),
        "to": to_date.isoformat(),
        "pageSize": 100,  # Set a larger page size to get more articles in a single request
        "apiKey": api_key
    }

    try:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()

        return data["articles"]

    except requests.exceptions.RequestException as e:
        print("Error during API request:", e)
        return []

def get_article_text(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # List of potential tags for article content
        possible_tags = ["p", "div", "article", "section"]

        # Find the first tag that contains text
        for tag in possible_tags:
            tag_elements = soup.find_all(tag)
            if tag_elements:
                article_text = " ".join([element.get_text().strip() for element in tag_elements])
                return article_text

        # If no text found, return None
        return None

    except requests.exceptions.RequestException as e:
        print("Error during article request:", e)
        return None

def save_text_to_file(text, file_name):
    with open(file_name, "w", encoding="utf-8") as file:
        file.write(text)

def create_excel_with_links(df, file_path):
    wb = Workbook()
    ws = wb.active

    # Add column headers
    headers = ["Date", "Title", "Link", "Source", "Label", "Probability"]
    ws.append(headers)

    # Add the DataFrame data to the Excel file
    for row in dataframe_to_rows(df, index=False, header=True):
        new_row = []
        for cell in row:
            if isinstance(cell, tf.Tensor):
                new_row.append(float(cell.numpy()))
            else:
                new_row.append(cell)
        ws.append(new_row)

    # Make the links clickable in the "Link" column
    for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=3, max_col=3):
        for cell in row:
            cell.hyperlink = cell.value
            cell.style = "Hyperlink"
            
        # Adjust column widths
    for column in ws.columns:
        max_length = 0
        column = [cell for cell in column]
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(cell.value)
            except:
                pass
        adjusted_width = (max_length + 2) * 1.2
        ws.column_dimensions[get_column_letter(column[0].column)].width = adjusted_width

    # Save the Excel file
    wb.save(file_path)

    
def perform_sentiment_analysis(text):
    # Preprocess the text
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=256,
        padding='max_length',
        truncation=True,
        return_tensors='tf',
        return_token_type_ids=True
    )
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    # Make a prediction with the loaded model
    outputs = loaded_model([input_ids, token_type_ids, attention_mask])

    # Extract the predicted probabilities from the output tensors
    probs = outputs[0]

    # Convert probabilities to class labels (positive or negative)
    class_labels = ['Negative', 'Positive']
    predicted_class_index = np.argmax(probs)
    predicted_class_label = class_labels[predicted_class_index]

    # Get the predicted probability for the predicted class
    predicted_class_probability = probs[predicted_class_index]

    return predicted_class_label, predicted_class_probability

def search_articles(keyword, number_articles=5):
    # Your NewsAPI API key
    api_key = "4a3afce11f134327a142694aa36649bd"

    # Date range for search (30 days before the current date until the current date)
    from_date = date.today() - timedelta(days=30)
    to_date = date.today()

    # Sources to search for articles
    sources_francais = [
        'le-monde', 'le-figaro', 'usinenouvelle', 'challenges', 'tradingsat', 'capital', 'liberation', 'le-parisien', '20-minutes',
        'bfmtv', 'france24', 'les-echos', 'courrier-international', 'la-croix'
    ]

    articles = get_articles_from_api(api_key, keyword, sources_francais, from_date, to_date)

    # Determine the number of articles to process
    num_articles_to_process = min(number_articles, len(articles))

    # Extract information from articles, translate to English, and save text to .txt files
    #folder_name = keyword
    folder_name = f"{keyword}_French"
    os.makedirs(folder_name, exist_ok=True)
    for i, article in enumerate(articles[:num_articles_to_process], 1):
        title = article["title"]
        url = article["url"]
        article_text = get_article_text(url)
        if article_text is not None:
            translated_text = translate_to_english(article_text)  # Translate to English
            file_name = os.path.join(folder_name, f"article_Francais_{i}.txt")
            save_text_to_file(translated_text, file_name)  # Save the translated text

    # Create a DataFrame and save the articles to an Excel file
    article_data = []
    for article in articles[:num_articles_to_process]:
        title = article["title"]
        url = article["url"]
        source = article["source"]["name"]  # Ajout: Récupérer la source de l'article

        article_text = get_article_text(url)
        if article_text is not None:
            translated_text = translate_to_english(article_text)  # Translate to English
            label, probability = perform_sentiment_analysis(translated_text)  # Analyse des sentiments

            # Ajout: Ajouter les données au DataFrame
            article_info = {
                "Date": article["publishedAt"],
                "Title": title,
                "Link": url,
                "Source": source,
                "Label": label,
                "Probability": probability * 100  # Convertir la probabilité en pourcentage
            }
            article_data.append(article_info)

            # Sauvegarder le texte traduit dans un fichier .txt (comme dans le code actuel)
            file_name = os.path.join(folder_name, f"article_francais_{i}.txt")
            save_text_to_file(translated_text, file_name)

    # Create DataFrame
    df = pd.DataFrame(article_data)
    df.to_pickle("NVIDIA_French.pkl")
    excel_file = f"{keyword}_articles_Francais.xlsx"
    excel_filepath = os.path.join(folder_name, excel_file)
    create_excel_with_links(df, excel_filepath)



    
# Load the BERT model
bert_type = 'bert-base-cased'
bert = TFBertModel.from_pretrained(bert_type)
tokenizer = BertTokenizer.from_pretrained(bert_type)

# Load the sentiment analysis model including the TFBertModel object in custom_objects
def custom_objects():
    return {"F1Score": tfa.metrics.F1Score, "TFBertModel": TFBertModel}


loaded_model = tf.keras.models.load_model('modele_bert.h5', custom_objects=custom_objects())


    


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected

In [2]:
# Test la fonction avec l'exemple que vous avez fourni
keyword = "NVIDIA"
search_articles(keyword, number_articles=100)  # Fetch and process 10 articles

In [3]:
df = pd.read_pickle("NVIDIA_French.pkl")

In [4]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Probability
0,2024-09-23T08:25:39Z,« Le déclin d’Intel est la conséquence de troi...,https://www.lemonde.fr/economie/article/2024/0...,Le Monde,Negative,"tf.Tensor(75.792984, shape=(), dtype=float32)"
1,2024-09-13T12:00:12Z,« La géopolitique s’invite avec fracas dans un...,https://www.lemonde.fr/idees/article/2024/09/1...,Le Monde,Negative,"tf.Tensor(75.792984, shape=(), dtype=float32)"
2,2024-09-17T07:15:35Z,Intel repousse ses projets d’usine en Allemagn...,https://www.liberation.fr/economie/economie-nu...,Libération,Negative,"tf.Tensor(81.864044, shape=(), dtype=float32)"


In [5]:
df["Probability"] = df["Probability"].apply(lambda x: float(x))


In [6]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Probability
0,2024-09-23T08:25:39Z,« Le déclin d’Intel est la conséquence de troi...,https://www.lemonde.fr/economie/article/2024/0...,Le Monde,Negative,75.792984
1,2024-09-13T12:00:12Z,« La géopolitique s’invite avec fracas dans un...,https://www.lemonde.fr/idees/article/2024/09/1...,Le Monde,Negative,75.792984
2,2024-09-17T07:15:35Z,Intel repousse ses projets d’usine en Allemagn...,https://www.liberation.fr/economie/economie-nu...,Libération,Negative,81.864044


In [7]:
df.rename(columns={'Probability': 'Categorical_Accuracy'}, inplace=True)

In [8]:
df

Unnamed: 0,Date,Title,Link,Source,Label,Categorical_Accuracy
0,2024-09-23T08:25:39Z,« Le déclin d’Intel est la conséquence de troi...,https://www.lemonde.fr/economie/article/2024/0...,Le Monde,Negative,75.792984
1,2024-09-13T12:00:12Z,« La géopolitique s’invite avec fracas dans un...,https://www.lemonde.fr/idees/article/2024/09/1...,Le Monde,Negative,75.792984
2,2024-09-17T07:15:35Z,Intel repousse ses projets d’usine en Allemagn...,https://www.liberation.fr/economie/economie-nu...,Libération,Negative,81.864044


In [9]:
df.to_pickle("Final_NVIDIA_French.pkl")