In [1]:
import ast
import os
import re

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import streamlit as st
from eventregistry import *  # kept ahead of the imports below so name-shadowing order is unchanged
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns

# Collecte des données

In [2]:
# API key configuration.
# The key is read from the EVENTREGISTRY_API_KEY environment variable so the
# secret never lives in the notebook (or in its version history). Export the
# variable before starting the kernel; when it is unset, api_key is None.
# Any key that was previously committed in plain text should be treated as
# leaked and rotated.
api_key = os.environ.get("EVENTREGISTRY_API_KEY")

@st.cache_data
def get_articles(keywords=("LLM", "model"), lang="eng", max_items=-1):
    """Fetch articles from Event Registry and return them as a DataFrame.

    Parameters
    ----------
    keywords : sequence of str
        Keywords that must ALL appear in an article (combined with AND).
        Defaults to ("LLM", "model").
    lang : str
        Article language code passed to the query. Defaults to "eng".
    max_items : int
        Upper bound on the number of articles to download; -1 (the default)
        fetches every match.

    Returns
    -------
    pandas.DataFrame
        One row per article with the columns lang, url, sentiment, date,
        relevance, title, location, isDuplicate and sim. Fields missing from
        an article are None. The columns are present even when the query
        returns no article.

    Raises
    ------
    RuntimeError
        If the module-level ``api_key`` is empty or None.
    """
    if not api_key:
        raise RuntimeError(
            "Event Registry API key is not configured: set `api_key` before calling get_articles()."
        )

    er = EventRegistry(apiKey=api_key)

    columns_to_include = [
        'lang', 'url', 'sentiment', 'date', 'relevance', 'title', 'location', 'isDuplicate', 'sim'
    ]

    query = QueryArticlesIter(keywords=QueryItems.AND(list(keywords)), lang=lang)

    # Only `location` has to be switched on: it is the only requested extra that
    # ends up in columns_to_include. Concepts, categories, links and videos were
    # downloaded and then discarded, so they are no longer requested.
    return_info = ReturnInfo(articleInfo=ArticleInfoFlags(location=True))

    records = [
        {col: art.get(col) for col in columns_to_include}
        for art in query.execQuery(er, returnInfo=return_info, maxItems=max_items)
    ]

    # Passing `columns` keeps the schema stable even for an empty result set,
    # so downstream cells can still select df['date'], df['location'], ...
    return pd.DataFrame(records, columns=columns_to_include)

# Load the articles into a DataFrame
df_articles = get_articles()

2024-11-23 02:28:16.511 
  command:

    streamlit run /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-11-23 02:28:16.511 No runtime found, using MemoryCacheStorageManager


# Exploration des données 

In [3]:
# Preview the first rows of the collected articles
df_articles.head()

  lang                                                url  sentiment  \
0  eng  https://sloanreview.mit.edu/article/a-practica...   0.309804   
1  eng  https://unit42.paloaltonetworks.com/privilege-...  -0.011765   
2  eng  https://www.expresscomputer.in/artificial-inte...   0.113725   
3  eng  https://www.globaltrademag.com/how-autonomous-...   0.270588   
4  eng  https://aws.amazon.com/blogs/security/securing...   0.576471   

         date  relevance                                              title  \
0  2024-11-04        100  A Practical Guide to Gaining Value From LLMs |...   
1  2024-11-12         85  ModeLeak: Privilege Escalation to LLM Model Ex...   
2  2024-11-11         76  From MLOps to LLMOps - Evolution of the LLM Ec...   
3  2024-11-14         67  How Autonomous Mobile Robots (AMRs) are Revolu...   
4  2024-10-29         63  Securing generative AI: An introduction to the...   

  location  isDuplicate       sim  
0     None        False  0.658824  
1     None        Fa

In [4]:
# Column dtypes and non-null counts
df_articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2451 entries, 0 to 2450
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   lang         2451 non-null   object 
 1   url          2451 non-null   object 
 2   sentiment    2451 non-null   float64
 3   date         2451 non-null   object 
 4   relevance    2451 non-null   int64  
 5   title        2451 non-null   object 
 6   location     214 non-null    object 
 7   isDuplicate  2451 non-null   bool   
 8   sim          2451 non-null   float64
dtypes: bool(1), float64(2), int64(1), object(5)
memory usage: 155.7+ KB
None


In [5]:
# Summary statistics of the numeric columns
df_articles.describe()

         sentiment    relevance          sim
count  2451.000000  2451.000000  2451.000000
mean      0.246985     5.298654     0.366775
std       0.186570     7.206669     0.367939
min      -0.694118     1.000000     0.000000
25%       0.129412     1.000000     0.000000
50%       0.254902     3.000000     0.462745
75%       0.364706     6.000000     0.717647
max       0.937255   100.000000     0.992157


In [6]:
# Five articles with the highest relevance score, best first
df_articles.sort_values('relevance', ascending=False).iloc[:5]

  lang                                                url  sentiment  \
0  eng  https://sloanreview.mit.edu/article/a-practica...   0.309804   
1  eng  https://unit42.paloaltonetworks.com/privilege-...  -0.011765   
2  eng  https://www.expresscomputer.in/artificial-inte...   0.113725   
3  eng  https://www.globaltrademag.com/how-autonomous-...   0.270588   
4  eng  https://aws.amazon.com/blogs/security/securing...   0.576471   

         date  relevance                                              title  \
0  2024-11-04        100  A Practical Guide to Gaining Value From LLMs |...   
1  2024-11-12         85  ModeLeak: Privilege Escalation to LLM Model Ex...   
2  2024-11-11         76  From MLOps to LLMOps - Evolution of the LLM Ec...   
3  2024-11-14         67  How Autonomous Mobile Robots (AMRs) are Revolu...   
4  2024-10-29         63  Securing generative AI: An introduction to the...   

  location  isDuplicate       sim  
0     None        False  0.658824  
1     None        Fa

In [7]:
# Distribution of the article sentiment scores, bucketed into 20 bins
fig = px.histogram(
    data_frame=df_articles,
    x='sentiment',
    labels={'sentiment': 'Score de sentiment'},
    title="Distribution des sentiments des articles",
    nbins=20,
)
fig.show()

In [8]:
# Parse the date column into datetimes; values that cannot be parsed become NaT
df_articles['date'] = pd.to_datetime(df_articles['date'], errors='coerce')

# Line chart of the mean sentiment score per date
fig = px.line(
    data_frame=df_articles.groupby('date', as_index=False)['sentiment'].mean(),
    x='date',
    y='sentiment',
    labels={'date': 'Date', 'sentiment': 'sentiment moyen'},
    title="sentiment moyen des articles au fil du temps",
)
fig.show()

# Prétraitement des données

In [9]:
# Date conversion and country extraction.
# The date column is converted again here so this cell does not rely on the
# earlier plotting cell having been run.
df_articles['date'] = pd.to_datetime(df_articles['date'])

def extract_country_from_object(location_entry):
    """Return the English country name of a location entry, or None.

    Parameters
    ----------
    location_entry : dict, str or other
        Either a location mapping, the string representation of such a mapping
        (for example after a CSV round trip), or anything else (None, NaN, ...).

    Returns
    -------
    The value stored under ``country -> label -> eng`` when it can be found,
    otherwise None. Missing or None-valued intermediate keys yield None
    instead of raising.
    """
    if isinstance(location_entry, str):
        # Prefer real parsing over pattern matching: it copes with labels in
        # several languages and with names quoted with double quotes
        # (e.g. "Cote d'Ivoire").
        try:
            parsed = ast.literal_eval(location_entry)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None

        if isinstance(parsed, dict):
            location_entry = parsed
        else:
            # Fallback for strings that are not a complete literal
            # (e.g. truncated text): keep the original regex extraction.
            match = re.search(r"'country': \{.*?'label': \{'eng': '(.*?)'\}", location_entry)
            return match.group(1) if match else None

    if not isinstance(location_entry, dict):
        return None

    country = location_entry.get('country')
    # NOTE(review): assumes a location whose own type is 'country' carries the
    # country name in its own label and has no nested 'country' entry --
    # confirm against the Event Registry data model.
    if not isinstance(country, dict) and location_entry.get('type') == 'country':
        country = location_entry
    if not isinstance(country, dict):
        return None

    label = country.get('label')
    if not isinstance(label, dict):
        return None
    return label.get('eng')

# Derive the country column from `location`, then drop `location`.
# The guard makes the cell safe to re-run: once `location` has been removed,
# the derivation is skipped instead of raising a KeyError.
if 'location' in df_articles.columns:
    df_articles = (
        df_articles
        .assign(country=df_articles['location'].apply(extract_country_from_object))
        .drop(columns=['location'])
    )

# Keep only the articles for which a country could be determined
df_articles = df_articles.dropna(subset=['country'])

# Independent copy used by the visualisation cells
df_articles_llm = df_articles.copy()

# Carte interactive :

In [10]:
# Per-country aggregates: number of articles and mean sentiment score
country_stats = (
    df_articles_llm
    .groupby('country')
    .agg(
        num_articles=('country', 'size'),
        avg_sentiment=('sentiment', 'mean'),
    )
    .reset_index()
)

# Bubble map: marker size encodes the article count, colour the mean sentiment
fig = px.scatter_geo(
    data_frame=country_stats,
    locations="country",
    locationmode="country names",
    hover_name="country",
    size="num_articles",
    color="avg_sentiment",
    color_continuous_scale=px.colors.sequential.YlGn,
    projection="natural earth",
    title="Carte des Articles par Pays",
)

# Layout tweaks: larger title, tight margins, no frame around the globe
fig.update_layout(
    title_font_size=20,
    margin=dict(r=0, t=50, l=0, b=0),
    geo={"showframe": False, "showcoastlines": True, "projection_type": "natural earth"},
)

# Render the interactive map
fig.show()

# Graphique temporel :

In [11]:
# --- Per-date statistics: article count and mean sentiment ---
time_stats = df_articles_llm.groupby('date').agg(
    num_articles=('sentiment', 'size'),
    avg_sentiment=('sentiment', 'mean')
).reset_index().sort_values('date')

# --- Dual-axis time chart ---
fig = go.Figure()

# Bars: number of articles per date (left axis)
fig.add_trace(go.Bar(
    x=time_stats['date'],
    y=time_stats['num_articles'],
    name='Nombre d\'articles',
    marker_color='skyblue',
    yaxis='y',  # canonical id of the primary y axis
    opacity=0.7  # semi-transparent so the bars do not dominate the line
))

# Line: mean sentiment per date (right axis)
fig.add_trace(go.Scatter(
    x=time_stats['date'],
    y=time_stats['avg_sentiment'],
    name='Sentiment Moyen',
    mode='lines+markers',
    marker=dict(color='firebrick', size=8),
    line=dict(width=2, color='firebrick'),
    yaxis='y2'
))

# Axes and layout.
# Axis titles use the nested title=dict(text=..., font=...) form: the older
# `titlefont` shortcut is deprecated and is rejected by recent Plotly releases.
fig.update_layout(
    title=dict(
        text="Évolution du Nombre d'Articles et du Sentiment Moyen",
        x=0.5,  # centre the title
        font=dict(size=18)
    ),
    xaxis=dict(
        title='Date',
        tickangle=45,  # tilt the date labels
        showgrid=True
    ),
    yaxis=dict(
        title=dict(text='Nombre d\'articles', font=dict(color='skyblue')),
        tickfont=dict(color='skyblue'),
        side='left'
    ),
    yaxis2=dict(
        title=dict(text='Sentiment Moyen', font=dict(color='firebrick')),
        tickfont=dict(color='firebrick'),
        overlaying='y',  # draw on top of the primary y axis
        side='right'
    ),
    legend=dict(
        x=0.5,
        y=-0.2,  # legend below the plot area
        xanchor='center',
        orientation='h',
        font=dict(size=12)
    ),
    hovermode='x unified',  # one tooltip per date covering both traces
    margin=dict(r=20, t=70, l=50, b=70),  # bottom margin leaves room for the legend
    width=1000,
    height=600
)

# Render the chart
fig.show()

# Camembert :

In [12]:
# --- Article count per country, as a two-column frame ---
country_counts = (
    df_articles_llm['country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='num_articles')
)

# --- Donut chart of the share of articles per country ---
fig = px.pie(
    data_frame=country_counts,
    names='country',
    values='num_articles',
    color_discrete_sequence=px.colors.qualitative.Set3,  # qualitative palette
    hole=0.4,  # hollow centre turns the pie into a donut
    title="Répartition des Articles par Pays",
)

# Show percentage and country name inside each slice
fig.update_traces(textinfo='percent+label', textposition='inside')

# Render the chart
fig.show()