<a href="https://colab.research.google.com/github/Diego-Hernandez-Jimenez/brand-monitoring-tool/blob/main/app_brand_monitoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive and switch to the project folder. Later cells open
# `examples/...` and `downloads/...` with relative paths, so they resolve against this directory.
# `userdata` is used further down to read the API keys.
from google.colab import drive, userdata
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/Colab_Notebooks/Brand monitoring"

Mounted at /content/drive
/content/drive/MyDrive/Colab_Notebooks/Brand monitoring


In [None]:
# python version
!python --version

In [3]:
!uv pip install gradio --quiet
!uv pip install bertopic --quiet
!uv pip install groq --quiet

In [None]:
!uv --version

In [None]:
%%writefile requirements.txt
altair==5.5.0
bertopic==0.17.0
gradio==5.31.0
gradio-client==1.10.1
groq==0.25.0
hdbscan==0.8.40
nltk==3.9.1
numpy==2.0.2
pandas==2.2.2
requests==2.32.3
safetensors==0.5.3
scikit-learn==1.6.1
sentence-transformers==4.1.0
textblob==0.19.0
umap-learn==0.5.7

In [4]:
from os import environ

# Language code sent to NewsAPI (`language=`) and compared against Diffbot's
# `humanLanguage` field to keep only articles in this language.
LANG = 'en'

# Credentials come from the Colab secret store (`userdata`) instead of being hardcoded.
NEWS_API_KEY = userdata.get('news_api_key')
DIFFBOT_API_KEY = userdata.get('diffbot_key')
# The Groq client is created with no arguments (`Groq()`), so the key is
# presumably picked up from this environment variable.
environ['GROQ_API_KEY'] = userdata.get('GROQ_API_KEY')

# Comma-separated domains passed to NewsAPI's `excludeDomains` parameter.
EXCLUDED_SOURCES = 'stacksocial.com,bringatrailer.com,frequentmiler.com,slickdeals.net,dealcatcher.com,pypi.org'
# NOTE(review): the two limits below are not referenced in the visible part of the notebook;
# presumably they bound the UI controls for the search window and the article sample size.
MAX_DAYS_AGO = 30
MAX_ARTICLES = 100

RPS_LIMIT = 5 # Requests Per Second (RPS) limit with diffbot plan

# SentenceTransformer model used to embed article summaries for topic modelling.
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# NOTE(review): presumably the Groq-hosted model used to generate topic titles — confirm against its caller.
GROQ_MODEL = 'gemma2-9b-it'

# User-facing messages shown through Gradio notifications (`gradio.Error` / `gradio.Info`).
NEWSAPI_ERROR_MSG = 'Something went wrong with the article search. Please try again in a moment.'
DIFFBOT_DISCLAIMER_MSG = "Your request is being processed. The analysis may take a few minutes, up to 30 minutes, based on your selected settings. We appreciate your patience!"
TOPIC_ANALYSIS_ERROR_MSG = 'Unable to complete topic analysis. Please consider increasing the sample size.'
TOPIC_ANALYSIS_DISCLAIMER_MSG = 'The analysis is underway. Changes will appear on the page shortly.'

## Data extraction

### Data extraction I: News API

In [5]:
# %%writefile src/newsapiextraction.py

from os import environ
from json import load as json_load
from datetime import datetime, timedelta
import requests
import pandas as pd
# from constants import LANG, EXCLUDED_SOURCES, NEWSAPI_ERROR_MSG
from gradio import Error as gradioError, Accordion as gradioAccordion

# NEWS_API_KEY = environ.get('NEWS_API_KEY')

def news_api_extraction(search_query: str, n_days_ago: int, search_filter: str | list[str]) -> tuple[list[dict], int]:
  """Searches NewsAPI's `everything` endpoint for articles matching a query within a date range.

  The query string is sent through `requests`' `params` argument so that spaces,
  quotes, `&` and other special characters are URL-encoded correctly, and the API
  key travels in the `X-Api-Key` header so it never appears in URLs or error messages.

  Args:
      search_query (str): The keywords or phrases to search for.
      n_days_ago (int): How many days back from today the search window starts.
          Callers are expected to cap this (the app uses a 30-day maximum); it is not enforced here.
      search_filter (str | list[str]): Where to search in the article.
          Can be 'title', 'description' or a list of these options
          (e.g., 'title,description' or ['title', 'description']).
          An empty value or any other type falls back to 'title,description'.

  Returns:
      tuple[list[dict], int]: A tuple containing:
          - A list of dictionaries, one per article, with the keys 'title', 'url',
            'markdown_title', 'publish_date' (formatted as '%d %b %Y') and 'source'.
          - The `totalResults` value reported by NewsAPI (may exceed the number
            of articles actually returned in this response).

  Raises:
      gradioError: If the request cannot be completed (network error, timeout)
          or NewsAPI answers with a non-200 status code.
  """

  now = datetime.today()
  to_date = now.strftime('%Y-%m-%d')
  from_date = (now - timedelta(days=n_days_ago)).strftime('%Y-%m-%d')

  if isinstance(search_filter, str):
    search_in = search_filter
  elif isinstance(search_filter, list):
    search_in = ','.join(search_filter)
  else:
    search_in = ''
  # Nothing selected (or an unexpected type): search in both fields.
  search_in = search_in or 'title,description'

  query_params = {
      'q': search_query,
      'language': LANG,
      'searchIn': search_in,
      'from': from_date,
      'to': to_date,
      'excludeDomains': EXCLUDED_SOURCES,
      'sortBy': 'relevancy',
  }

  try:
    newsapi_response = requests.get(
        'https://newsapi.org/v2/everything',
        params=query_params,
        headers={'X-Api-Key': NEWS_API_KEY},
        timeout=30,  # never hang the UI on an unresponsive server
    )
  except requests.RequestException as e:
    print('Error:', type(e).__name__)
    raise gradioError(NEWSAPI_ERROR_MSG, duration=10) from e

  if newsapi_response.status_code != 200:
    print('Error:', newsapi_response.status_code)
    raise gradioError(NEWSAPI_ERROR_MSG, duration=10)

  newsapi_dict = newsapi_response.json()
  articles_retrieved = newsapi_dict['totalResults']
  contents = [
      {
          'title': article['title'],
          'url': article['url'],
          'markdown_title': f'[{article["title"]}]({article["url"]})',
          # Only the date part is needed; slicing keeps parsing robust if the
          # timestamp carries fractional seconds.
          'publish_date': datetime.strptime(article['publishedAt'][:10], '%Y-%m-%d').strftime('%d %b %Y'),
          'source': article['source']['name']
      }
      for article in newsapi_dict['articles']
  ]

  return contents, articles_retrieved




def generate_preliminary_search_results(search_query: str, n_days_ago: int, search_filter: str | list[str], state: dict) -> tuple[pd.DataFrame, int, int, gradioAccordion, dict]:
  """Generates preliminary news search results for display in a Gradio application.

  Fetches articles for the query (or loads the pre-saved results for the
  'Tesla (example)' query), sorts them chronologically and prepares the table
  and counters shown in the UI. A search that returns no articles yields an
  empty table instead of failing.

  Args:
      search_query (str): The keyword or phrase to search for in news articles.
      n_days_ago (int): The number of days back from today to search for articles.
      search_filter (str | list[str]): Specifies where to search within the articles
          (e.g., 'title', 'description' or a combination).
      state (dict): The current state of the Gradio application; it is updated
          in place with 'search_query' and 'newsapi_results'.

  Returns:
      tuple[pd.DataFrame, int, int, gradioAccordion, dict]: The following values, in this
      order (returned as a list, which Gradio maps onto the output components):
          - df_display (pd.DataFrame): 'Title' (markdown link), 'Date' and 'Source' columns.
          - n_articles_newsapi (int): The total number of articles reported by NewsAPI.
          - n_sources_newsapi (int): The number of unique news sources found.
          - gradioAccordion: An Accordion component set to be visible and open.
          - state (dict): The updated state dictionary.
  """

  if search_query == 'Tesla (example)':
    with open('examples/newsapi_example_tesla30.json', 'r', encoding='utf-8') as f:
      newsapi_results = json_load(f)
    # Total reported by NewsAPI when the example was saved.
    n_articles_newsapi = 97
  else:
    newsapi_results, n_articles_newsapi = news_api_extraction(search_query, n_days_ago, search_filter)

  if newsapi_results:
    df_newsapi = pd.DataFrame(newsapi_results).sort_values(
        by='publish_date', ascending=True, key=lambda x: pd.to_datetime(x, format='%d %b %Y')
    )
  else:
    # An empty result list would produce a column-less frame and make the
    # sort/selection below fail, so build an empty frame with the expected columns.
    df_newsapi = pd.DataFrame(columns=['title', 'url', 'markdown_title', 'publish_date', 'source'])

  n_sources_newsapi = df_newsapi['source'].nunique()

  state['search_query'] = search_query
  state['newsapi_results'] = newsapi_results

  df_display = df_newsapi[['markdown_title', 'publish_date', 'source']] \
  .rename(columns={'markdown_title': 'Title', 'publish_date': 'Date', 'source': 'Source'})

  return [
      df_display,
      n_articles_newsapi,
      n_sources_newsapi,
      gradioAccordion(visible=True, open=True),
      state
  ]

### Data extraction II: Diffbot

In [6]:
# %%writefile src/diffbotextraction.py

from os import environ
import requests
from json import load as json_load, dump as json_dump
from time import sleep
# from constants import DIFFBOT_DISCLAIMER_MSG, RPS_LIMIT, LANG
import numpy as np
import pandas as pd
from gradio import Info as gradioInfo, Column as gradioColumn, Dropdown as gradioDropdown

# DIFFBOT_API_KEY = environ.get('DIFFBOT_API_KEY')

def diffbot_extraction(newsapi_results: list[dict], max_articles: int) -> tuple[list[str], list[dict], list[str]]:
  """Extracts content, summaries, and sentiment from articles using the Diffbot Article API.

  Iterates over the NewsAPI results, requests each article from Diffbot and keeps
  those that were parsed successfully and are written in `LANG`. Extraction is
  best-effort: an article that fails at any stage is reported on stdout and skipped.
  Requests are spaced at least `1 / RPS_LIMIT` seconds apart to respect the plan's
  requests-per-second limit.

  Args:
      newsapi_results (list[dict]): Articles from NewsAPI; each dictionary must contain
          'title', 'url', 'markdown_title', 'source', and 'publish_date'.
      max_articles (int): The maximum number of articles to extract content from.

  Returns:
      tuple[list[str], list[dict], list[str]]: A tuple containing:
          - documents (list[str]): The full extracted text of each article.
          - list_metadata (list[dict]): One dictionary per extracted article with the keys
            'title', 'url', 'markdown_title', 'publish_date', 'source',
            'sentiment', and 'summary'.
          - categories (list[str]): A flat list of all category names identified
            across the extracted articles.
  """
  from time import monotonic  # local import: only the throttle below needs it

  def _report_skip(reason: str, title: str) -> None:
    """Logs a skipped article on stdout."""
    print(f'Article skipped ({reason}):')
    print(title)
    print('\n')

  gradioInfo(DIFFBOT_DISCLAIMER_MSG, duration=10)
  documents = []
  list_metadata = []
  categories = []

  min_interval = 1.0 / RPS_LIMIT  # minimum number of seconds between two request starts
  last_request_at = None

  for article in newsapi_results:
    if len(documents) >= max_articles:
      break

    title = article['title']
    url = article['url']

    # Throttle: wait only for whatever is left of the minimum interval.
    if last_request_at is not None:
      remaining = min_interval - (monotonic() - last_request_at)
      if remaining > 0:
        sleep(remaining)
    last_request_at = monotonic()

    print(url)
    try:
      # `params` URL-encodes the article URL; interpolating it raw would let the
      # article's own `?`/`&` characters corrupt the Diffbot query string.
      diffbot_response = requests.get(
          'https://api.diffbot.com/v3/article',
          params={
              'url': url,
              'token': DIFFBOT_API_KEY,
              'naturalLanguage': 'categories,sentiment,summary',
              'summaryNumSentences': 4,
              'discussion': 'false',
          },
          headers={'accept': 'application/json'},
          timeout=60,
      )
    except requests.RequestException:
      # A single unreachable article must not abort the whole extraction.
      _report_skip('request error', title)
      continue

    if diffbot_response.status_code != 200:
      _report_skip('request error', title)
      continue

    try:
      diffbot_results = diffbot_response.json()
    except ValueError:
      _report_skip('parsing error', title)
      continue

    # Diffbot reports page-level failures inside a 200 response through 'errorCode'.
    if diffbot_results.get('errorCode', 200) != 200:
      _report_skip('parsing error', title)
      continue

    try:
      diffbot_object = diffbot_results['objects'][0]
      if diffbot_object['humanLanguage'] != LANG:
        continue
      content = diffbot_object['text']
      metadata = {
          'title': title,
          'url': url,
          'markdown_title': article['markdown_title'],
          'publish_date': article['publish_date'],
          'source': article['source'],
          'sentiment': diffbot_object['sentiment'],
          'summary': diffbot_object['naturalLanguage']['summary']
      }
      doc_categories = [category['name'] for category in (diffbot_object.get('categories') or [])]
    except Exception:
      # Missing or unexpected fields: keep the best-effort behaviour, but without
      # a bare `except` that would also swallow KeyboardInterrupt/SystemExit.
      _report_skip('unknown error during extraction', title)
      continue

    categories.extend(doc_categories)
    documents.append(content)
    list_metadata.append(metadata)
    print('Content successfully extracted:')
    print(title)
    print('\n')

  return documents, list_metadata, categories




def style_title_with_tooltip(row: pd.Series) -> str:
  """Renders an article title as an HTML block coloured by sentiment, with the score as tooltip.

  The title and tooltip come from external sources, so both are HTML-escaped
  before being placed in the markup; a title containing `<`, `&` or quotes can
  no longer break the cell or inject markup.

  Args:
      row (pd.Series): A row (or any mapping) providing 'sentiment_class' (str),
                        'sentiment' (numeric score) and 'title' (str).

  Returns:
      str: An HTML `<div>` whose background is light green for 'positive',
      light coral for 'negative' and light gray for anything else.
  """
  from html import escape  # local import: keeps the function self-contained

  background_by_class = {'positive': 'lightgreen', 'negative': 'lightcoral'}
  bgcolor = background_by_class.get(row['sentiment_class'], 'lightgray')

  tooltip = escape(f'Sentiment: {row["sentiment"]}')
  title = escape(str(row['title']))

  return f"""
  <div title="{tooltip}" style="
      background-color: {bgcolor};
      display: table;
      width: 100%;
      padding: 10px;
  ">
      {title}
  </div>
    """



def _download_path(search_query: str, kind: str) -> str:
  """Builds the path of the downloadable JSON file ('documents' or 'metadata') for a search query."""
  from re import sub  # local import: `sub` is not part of this cell's import block

  # The query is free text typed by the user. Collapse everything that is not a word
  # character, dash, dot or space (path separators included) so the file always lands
  # inside `downloads/`. Queries made only of safe characters keep their original file name.
  safe_name = sub(r'[^\w\-. ]+', '_', str(search_query)) or 'query'
  return f'downloads/{safe_name}_{kind}.json'


def generate_final_results(max_articles: int, state: dict) -> tuple[pd.DataFrame, int, int, float, pd.DataFrame, pd.DataFrame, gradioColumn, dict]:
  """Generates and processes final news search results for display in a Gradio application.

  Steps:
  1. Loads the example data or extracts article content and metadata through Diffbot,
      saving the extracted documents and metadata under `downloads/` for optional download.
  2. Calculates the top categories of the extracted articles.
  3. Computes descriptive statistics: number of articles, unique sources and average sentiment.
  4. Updates the application's state with the processed data.
  5. Formats a DataFrame for the front end with sentiment-styled titles.

  Args:
      max_articles (int): The maximum number of articles for which to extract
          detailed content and metadata.
      state (dict): The current state of the Gradio application,
          containing 'search_query' and 'newsapi_results'.

  Returns:
      tuple[pd.DataFrame, int, int, float, pd.DataFrame, pd.DataFrame, gradioColumn, dict]: The
      following values, in this order (returned as a list):
          - df_display (pd.DataFrame): 'Title' (HTML styled by sentiment), 'Date' and 'Source'.
          - n_articles (int): The number of articles for which content was extracted.
          - n_sources (int): The number of unique news sources among them.
          - avg_sentiment (float): The average sentiment score, rounded to 3 decimals.
          - source_counts (pd.DataFrame): The top 5 news sources by article count.
          - top_categories (pd.DataFrame): The top 5 categories by percentage.
          - gradioColumn: A Column component set to be visible.
          - state (dict): The updated state, including 'documents', 'list_metadata',
            'df_metadata', 'list_categories', and 'random_seed'.

  Raises:
      gradio.Error: If Diffbot could not extract any article for the search.
  """
  from os import makedirs  # local import: only needed to prepare the downloads folder

  target = state['search_query']
  if target == 'Tesla (example)':
    # load example data
    with open('examples/diffbot_example_documents_tesla.json', 'r', encoding='utf-8') as f:
      documents = json_load(f)

    with open('examples/diffbot_example_metadata_tesla.json', 'r', encoding='utf-8') as f:
      list_metadata = json_load(f)

    with open('examples/diffbot_example_categories_tesla.txt', 'r', encoding='utf-8') as f:
      list_categories = f.read().splitlines()

  else:
    # extract articles and generate summaries (included in metadata)
    documents, list_metadata, list_categories = diffbot_extraction(state['newsapi_results'], max_articles)

    if not list_metadata:
      # Without this guard the statistics below fail with a KeyError on the
      # missing 'publish_date' column, which tells the user nothing.
      from gradio import Error as gradioError
      raise gradioError('No article content could be extracted for this search. Please try a different query.', duration=10)

    # save for optional download
    makedirs('downloads', exist_ok=True)
    with open(_download_path(target, 'documents'), 'w', encoding='utf-8') as f:
      json_dump(documents, f)
    with open(_download_path(target, 'metadata'), 'w', encoding='utf-8') as f:
      json_dump(list_metadata, f)

  # get top categories
  top_categories = pd.DataFrame(list_categories, columns=['Category']) \
  .value_counts(ascending=False, normalize=True) \
  .mul(100) \
  .round(3) \
  .reset_index() \
  .head(5)

  # get some descriptive statistics
  df_metadata = pd.DataFrame(list_metadata).sort_values(
      by='publish_date', ascending=True, key=lambda x: pd.to_datetime(x, format='%d %b %Y')
  )
  df_metadata['sentiment_class'] = df_metadata['sentiment'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
  n_articles = len(df_metadata)
  n_sources = df_metadata['source'].nunique()
  avg_sentiment = df_metadata['sentiment'].mean().round(3)
  source_counts = df_metadata['source'] \
  .value_counts() \
  .reset_index() \
  .sort_values(by='count', ascending=False) \
  .head(5)

  # update app state
  state['documents'] = documents
  state['list_metadata'] = list_metadata
  state['df_metadata'] = df_metadata
  state['list_categories'] = list_categories
  # a fresh seed per analysis; stored so the topic model can reuse it
  state['random_seed'] = np.random.randint(0,100)

  # adapt dataframe for front end
  df_display = df_metadata.copy()
  df_display['title'] = df_display.apply(style_title_with_tooltip, axis=1)
  df_display = df_display[['title', 'publish_date', 'source']] \
  .rename(columns={'title': 'Title', 'publish_date': 'Date', 'source': 'Source'})

  return [
      df_display,
      n_articles,
      n_sources,
      avg_sentiment,
      source_counts,
      top_categories,
      gradioColumn(visible=True),
      state
  ]


def download_docs(search_query: str) -> str:
  """Returns the path of the JSON file holding the extracted documents for `search_query`.

  The 'Tesla (example)' query maps to the bundled example file; any other query
  maps to the file written by `generate_final_results` under `downloads/`.
  """
  if search_query == 'Tesla (example)':
    return 'examples/diffbot_example_documents_tesla.json'
  return _download_path(search_query, 'documents')


def download_metadata(search_query: str) -> str:
  """Returns the path of the JSON file holding the extracted metadata for `search_query`.

  The 'Tesla (example)' query maps to the bundled example file; any other query
  maps to the file written by `generate_final_results` under `downloads/`.
  """
  if search_query == 'Tesla (example)':
    return 'examples/diffbot_example_metadata_tesla.json'
  return _download_path(search_query, 'metadata')


## Sentiment analysis

In [7]:
# %%writefile src/sentimentanalysis.py

import pandas as pd
import altair as alt

def generate_sentiment_results(state: dict) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, pd.DataFrame, str]:
  """Generates sentiment-related tables and summaries for display in a Gradio application.

  From the article metadata stored in the application state it computes:
  1. Average sentiment and number of articles per source (10 most frequent sources).
  2. The percentage of articles per sentiment class (negative, neutral, positive);
      a class with no articles is reported as 0.
  3. The articles with the lowest and highest sentiment score, with their summaries.

  Args:
      state (dict): The current state of the Gradio application; must contain
                    'df_metadata', a pandas DataFrame with the columns 'title',
                    'publish_date', 'source', 'sentiment', 'summary' and 'sentiment_class'.

  Returns:
      tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, pd.DataFrame, str]: A tuple containing:
          - sentiment_by_source (pd.DataFrame): Top 10 sources by article count, with
            their average sentiment and frequency.
          - pct_sentiment (pd.DataFrame): Percentage distribution of sentiment classes.
          - min_article (pd.DataFrame): Transposed details of the most negative article.
          - min_summary (str): The summary of the most negative article.
          - max_article (pd.DataFrame): Transposed details of the most positive article.
          - max_summary (str): The summary of the most positive article.
  """

  df_metadata = state['df_metadata']

  # NOTE(review): the leading space in ' Frequency' and the spelling 'ocurrences' below
  # look like typos; they are kept because components outside this function may
  # reference these column names.
  sentiment_by_source = (
      df_metadata
      .groupby('source', as_index=False)['sentiment']
      .agg(['mean', 'count'])
      .round(3)
      .sort_values(by='count', ascending=False)
      .head(10)
      .rename(columns={
          'source': 'Source',
          'mean': 'Average sentiment',
          'count':' Frequency'
      })
  )

  # sentiment distribution; classes that do not occur get 0% instead of NaN
  pct_sentiment = (
      df_metadata
      .value_counts('sentiment_class', normalize=True)
      .mul(100)
      .reindex(['negative', 'neutral', 'positive'], fill_value=0)
      .reset_index()
      .rename(columns={
          'sentiment_class': 'Sentiment',
          'proportion':'% of ocurrences'
      })
  )

  # most positive article and most negative article
  max_sentiment = df_metadata['sentiment'].max()
  min_sentiment = df_metadata['sentiment'].min()

  max_min_articles = (
      df_metadata.loc[
          (df_metadata['sentiment'] == max_sentiment) | (df_metadata['sentiment'] == min_sentiment),
          ['title', 'publish_date', 'source', 'sentiment', 'summary']
      ]
      .sort_values(by='sentiment')
      .rename(columns={'title': 'Title', 'publish_date': 'Date', 'source': 'Source', 'sentiment': 'Sentiment'})
  )
  max_min_summaries = max_min_articles['summary'].values

  # After sorting, the first row is the most negative article and the last row the
  # most positive one. Several articles can tie for an extreme, and a single article
  # is both extremes at once, so the summaries are taken from the same ends as the
  # rows (first / last) rather than from fixed positions 0 and 1.
  min_article = max_min_articles.drop(columns='summary').head(1).T.reset_index()
  min_article.columns = ['Variable', 'Worst article']
  min_summary = max_min_summaries[0]

  max_article = max_min_articles.drop(columns='summary').tail(1).T.reset_index()
  max_article.columns = ['Variable', 'Best article']
  max_summary = max_min_summaries[-1]

  return sentiment_by_source, pct_sentiment, min_article, min_summary, max_article, max_summary


def plot_sentiment_over_time(state: dict) -> alt.Chart:
  """Builds an Altair chart of the average sentiment per publish date.

  Sentiment is aggregated per publish date (mean, standard deviation and article
  count), each date is classified as positive, neutral or negative from its mean,
  and the result is drawn as thick coloured rules with error bars layered on top.

  Args:
      state (dict): The current state of the Gradio application; must contain
                    'df_metadata', a pandas DataFrame with the columns
                    'publish_date' (formatted as '%d %b %Y') and 'sentiment'.

  Returns:
      alt.Chart: A layered chart (rules + error bars). The rule layer is
                  interactive (zoom/pan) and shows the date, average sentiment,
                  class, article count and standard deviation in its tooltip.
  """

  df_sentiment_by_date = state['df_metadata'] \
  .groupby('publish_date', as_index=False) \
  .agg(
      avg_sentiment=('sentiment', 'mean'),
      std_sentiment=('sentiment', 'std'),
      article_count=('sentiment', 'count')
  ) \
  .assign(
      sentiment_class=lambda x: x['avg_sentiment'].apply(
          lambda y: 'positive' if y > 0 else 'negative' if y < 0 else 'neutral'
      )
  )

  # A mean of exactly 0 would draw nothing, so neutral dates are plotted at a
  # small positive value; the tooltip still reports the real average.
  df_sentiment_by_date['avg_sentiment_smooth'] = df_sentiment_by_date['avg_sentiment'].copy()
  df_sentiment_by_date.loc[df_sentiment_by_date['sentiment_class'] == 'neutral', 'avg_sentiment_smooth'] = 0.02

  df_sentiment_by_date['publish_date'] = pd.to_datetime(df_sentiment_by_date['publish_date'], format='%d %b %Y')

  # Altair's scale domain is given as epoch milliseconds
  min_date_timestamp = df_sentiment_by_date['publish_date'].min().timestamp() * 1000
  max_date_timestamp = df_sentiment_by_date['publish_date'].max().timestamp() * 1000

  # Pad the x-axis by 5% of the covered range on both sides. When every article
  # shares one publish date the range is zero, which would collapse the domain
  # to a single point, so fall back to a one-day margin.
  date_range = max_date_timestamp - min_date_timestamp
  one_day_ms = 24 * 60 * 60 * 1000
  margin = date_range * 0.05 if date_range > 0 else one_day_ms
  adjusted_min_date_timestamp = min_date_timestamp - margin
  adjusted_max_date_timestamp = max_date_timestamp + margin

  # Define color scale based on sentiment_class
  color_scale = alt.Scale(domain=['positive', 'neutral', 'negative'],
                          range=['green', 'gray', 'red'])

  # Rule layer: one thick vertical rule per publish date
  chart = alt.Chart(df_sentiment_by_date) \
  .mark_rule(strokeWidth=20) \
  .encode(
      x=alt.X('publish_date',
              axis=alt.Axis(format="%b %d %Y", labelAngle=-45),
              title='Publish Date',
              scale=alt.Scale(domainMin=adjusted_min_date_timestamp, domainMax=adjusted_max_date_timestamp)
              ),
      y=alt.Y('avg_sentiment_smooth', title='Sentiment'),
      color=alt.Color('sentiment_class',
                      scale=color_scale,
                      legend=alt.Legend(title="Sentiment",
                                        direction='horizontal',
                                        orient='none',
                                        titleAnchor='middle',
                                        legendX=375, legendY=-40)),
      tooltip=['publish_date', 'avg_sentiment', 'sentiment_class', 'article_count', 'std_sentiment']
  ) \
  .properties(
      width=900,
      height=350
  ) \
  .interactive()

  # Error-bar layer: the bar length comes from the precomputed `std_sentiment` column
  error_bars = alt.Chart(df_sentiment_by_date).mark_errorbar(extent='stderr').encode(
      x='publish_date',
      y=alt.Y('avg_sentiment_smooth', title=''),
      yError='std_sentiment',
      color=alt.Color('sentiment_class', scale=color_scale, legend=None),
      tooltip=['std_sentiment']
  )
  final_chart = chart + error_bars

  return final_chart

## Topic analysis

In [13]:
# %%writefile src/topicanalysis.py

from re import findall
from textblob import TextBlob
from textblob.tokenizers import BaseTokenizer

import numpy as np
import pandas as pd
import altair as alt
from plotly.graph_objects import Figure as plotlyFigure

from nltk.downloader import download
download('punkt_tab')
download('wordnet')
download('averaged_perceptron_tagger_eng')

# from constants import EMBEDDING_MODEL, TOPIC_ANALYSIS_DISCLAIMER_MSG, TOPIC_ANALYSIS_ERROR_MSG, GROQ_MODEL
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import MaximalMarginalRelevance
from bertopic.vectorizers import ClassTfidfTransformer
from gradio import Info as gradioInfo, Error as gradioError, Textbox as gradioTextbox

from groq import Groq

class CustomTokenizer(BaseTokenizer):
  """TextBlob tokenizer that splits text with the regex used by scikit-learn vectorizers."""

  def tokenize(self, text: str) -> list[str]:
      """Return, in order, every run of two or more word characters found in `text`."""

      word_pattern = r"(?u)\b\w\w+\b"
      return findall(word_pattern, text)

def custom_tokenizer(text) -> list:
  """Tokenizes `text` and normalizes each token according to its part-of-speech tag.

  Plural nouns (tag 'NNS') are singularized unless they are listed as exceptions,
  verbs (tags starting with 'V') are lemmatized, and every other token is kept unchanged.
  """

  # words tagged as plural that must not be singularized
  keep_as_is = {"police", "data", "fish", "sheep", "species", "news", "media"}

  normalized = []
  for word, tag in TextBlob(text, tokenizer=CustomTokenizer()).tags:
    if tag == 'NNS' and word not in keep_as_is:
      normalized.append(word.singularize())
    elif tag.startswith('V'):
      normalized.append(word.lemmatize(pos='v'))
    else:
      normalized.append(word)

  return normalized



def initialize_bertopic_components(n_docs: int, random_seed: int) -> dict:
  """Creates the models that make up the BERTopic pipeline.

  Args:
      n_docs (int): The number of documents to be processed. HDBSCAN's minimum
                    cluster size is a tenth of it, with a floor of 2.
      random_seed (int): Seed passed to UMAP as `random_state` for reproducibility.

  Returns:
      dict: The initialized components, keyed by role:
            'embedding_model' (SentenceTransformer),
            'dimensionality_reduction_model' (UMAP),
            'clustering_model' (HDBSCAN),
            'vectorizer_model' (CountVectorizer),
            'ctfidf_model' (ClassTfidfTransformer), and
            'representation_model' (MaximalMarginalRelevance).
  """

  # smallest group of documents HDBSCAN may call a cluster
  smallest_cluster = max(int(n_docs / 10), 2)

  components = {}

  # sentence embeddings
  components['embedding_model'] = SentenceTransformer(EMBEDDING_MODEL)

  # dimensionality reduction
  components['dimensionality_reduction_model'] = UMAP(
      n_neighbors=5,
      n_components=5,
      min_dist=0.0,
      metric='cosine',
      random_state=random_seed
  )

  # clustering
  components['clustering_model'] = HDBSCAN(
      min_cluster_size=smallest_cluster,
      metric='euclidean',
      cluster_selection_method='eom',
      prediction_data=True
  )

  # bag-of-words over unigrams, tokenized with the custom tokenizer
  components['vectorizer_model'] = CountVectorizer(
      stop_words='english',
      ngram_range=(1, 1),
      tokenizer=custom_tokenizer,
      max_df=.95,
      min_df=0.05
  )

  # c-TF-IDF weighting
  components['ctfidf_model'] = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)

  # keyword diversification
  components['representation_model'] = MaximalMarginalRelevance(diversity=0.3)

  return components



def run_topic_analysis(state: dict) -> dict:
  """Performs BERTopic topic analysis on the extracted article summaries.

  For the 'Tesla (example)' query a saved topic model and its precomputed summary
  embeddings are loaded. Otherwise the components from
  `initialize_bertopic_components` are assembled into a new BERTopic model, the
  summaries are embedded and the model is fitted on them. In both cases short
  keyword labels are generated and set as the model's custom topic labels.
  A Gradio info message is shown when the analysis starts.

  Args:
      state (dict): The current state of the Gradio application. It should contain:
                    - 'search_query' (str): The current search query.
                    - 'documents' (list): The extracted documents (only their count is used).
                    - 'list_metadata' (list[dict]): Article metadata; each dictionary
                      must have a 'summary' key.
                    - 'random_seed' (int): A seed for reproducibility.

  Returns:
      dict: The updated state dictionary, including:
            - 'topic_model' (BERTopic): The trained (or loaded) BERTopic model.
            - 'encoded_docs' (np.ndarray): The embeddings of the summaries.

  Raises:
      gradioError: If the topic model fitting process fails.
  """

  gradioInfo(TOPIC_ANALYSIS_DISCLAIMER_MSG, duration=5)

  if state['search_query'] == 'Tesla (example)':
    # load topic model
    topic_model = BERTopic.load('examples/topic_model_example_tesla', embedding_model=EMBEDDING_MODEL)
    encoded_docs = np.load('examples/example_encoded_summaries_tesla.npy')
  else:
    n_docs = len(state['documents'])
    random_seed = state['random_seed']
    params = initialize_bertopic_components(n_docs, random_seed)

    topic_model = BERTopic(
        # Document embeddings are precomputed below, but the MMR representation
        # model still has to embed candidate keywords; give BERTopic the same
        # embedding model so that step can run and stays in the same vector space.
        embedding_model=params['embedding_model'],
        umap_model=params['dimensionality_reduction_model'],
        hdbscan_model=params['clustering_model'],
        vectorizer_model=params['vectorizer_model'],
        ctfidf_model=params['ctfidf_model'],
        representation_model=params['representation_model'],
        verbose=False
    )

    summaries = [doc['summary'] for doc in state['list_metadata']]
    encoded_docs = params['embedding_model'].encode(summaries, show_progress_bar=False)
    try:
      topic_model.fit(summaries, encoded_docs)
    except Exception as e:
      # Keep the cause in the logs and in the exception chain; the user only
      # sees the friendly message.
      print(f'Cannot fit topic model: {type(e).__name__}: {e}')
      raise gradioError(TOPIC_ANALYSIS_ERROR_MSG, duration=15) from e

  topic_labels = topic_model.generate_topic_labels(
      nr_words=3,
      topic_prefix=False,
      word_length=20,
      separator='-'
  )
  topic_model.set_topic_labels(topic_labels)
  state['topic_model'] = topic_model
  state['encoded_docs'] = encoded_docs

  return state



def promtp_generator(keywords: str, representative_docs: str) -> str:
  """Build the prompt that asks the LLM for a short description of one topic.

  Works as a minimal prompt template (in the spirit of Langchain's
  ChatPromptTemplate): the fixed instructions are the skeleton and the two
  arguments fill its placeholders.

  Args:
      keywords (str): Text inserted right after "keywords:" in the prompt.
      representative_docs (str): Text inserted on the line that follows the
                                 introduction of the representative documents.

  Returns:
      str: The complete prompt.
  """

  intro = f"You are given a topic described by the following keywords: {keywords}. "
  docs_section = f"Here are some representative documents related to this topic:\n{representative_docs}\n"
  # The blank line and the two-space indent before the instruction are part of the prompt text
  instruction = "\n  Based on the keywords and documents, provide a concise topic description and nothing else. Keep it under ten words"

  return intro + docs_section + instruction



def get_topic_titles(topic_model_df: pd.DataFrame, topic_model: BERTopic) -> list[str]:
  """Generates concise titles for topics using a language model (Groq API).

  At most four topics are titled and the outlier topic (-1) is never titled.
  Topics are taken in the order their rows appear in `topic_model_df`. For each
  one, the keywords of its 'CustomName' label and its representative documents
  are inserted into a prompt (see `promtp_generator`) and sent to the Groq chat
  completion endpoint.

  Args:
      topic_model_df (pd.DataFrame): A DataFrame containing topic information,
                                      expected to have at least 'Topic' (int)
                                      and 'CustomName' (str) columns.
      topic_model (BERTopic): The trained BERTopic model instance, used to
                              retrieve representative documents for each topic.

  Returns:
      list[str]: The generated topic titles, one per titled topic, with line
                 breaks removed and surrounding whitespace stripped.
  """

  max_topics_to_label = 4
  # presumably the client reads GROQ_API_KEY from the environment — TODO confirm
  groq_client = Groq()

  # Pair every topic id with its own label, so the label lookup does not depend
  # on the row position, and leave out the outlier topic (-1).
  labelled_topics = [
      (topic_id, raw_label)
      for topic_id, raw_label in zip(topic_model_df['Topic'].tolist(), topic_model_df['CustomName'].tolist())
      if topic_id != -1
  ]

  topic_titles = []
  # Slice AFTER filtering: the cap of four then holds whether or not topic -1 exists
  for topic_id, raw_label in labelled_topics[:max_topics_to_label]:
    # Labels join their keywords with '-'; turn them into a comma-separated list
    pretty_topic_keywords = raw_label.replace('-', ', ')
    pretty_representative_docs = '\n'.join('-' + doc for doc in topic_model.get_representative_docs(topic=topic_id))
    llm_response = groq_client.chat.completions.create(
      messages=[
        {
          "role": "user",
          "content": promtp_generator(pretty_topic_keywords, pretty_representative_docs),
        }
      ],
      model=GROQ_MODEL,
      temperature=0.3
    )
    # clean title
    topic_title = llm_response.choices[0].message.content.replace('\n', '').strip()
    topic_titles.append(topic_title)

  return topic_titles

def plot_topic_word_scores(topic_scores: dict[int, list[tuple[str, float]]], max_topics_to_display: int = 4, num_terms_to_display: int = 5) -> alt.Chart:
  """Generates an Altair horizontal bar chart visualizing the top words and their scores for multiple topics.

  Topic -1 is left out, the remaining topic IDs are sorted and limited to
  `max_topics_to_display`, and one bar chart is built per topic with its first
  `num_terms_to_display` (word, score) pairs. The individual charts are
  concatenated horizontally and share the same x-axis domain.

  Args:
      topic_scores (dict[int, list[tuple[str, float]]]): A dictionary where keys are
          topic IDs (integers) and values are lists of tuples. Each tuple contains
          a word (string) and its associated score (float) for that topic.
      max_topics_to_display (int, optional): The maximum number of topics to display
          in the concatenated chart. Defaults to 4.
      num_terms_to_display (int, optional): The number of terms (words) to display
          for each topic, taken from the start of each list. Defaults to 5.

  Returns:
      alt.Chart: An Altair horizontal concatenated bar chart object, showing the
                  words and their scores for the selected topics.
                  The chart includes a main title and individual topic titles.
  """

  # Palette based on Altair's 'tableau10' scheme.
  colors = [
      '#4C78A8', '#F58518', '#E4572E', '#72B7B2', '#54A24B',
      '#EECA3B', '#B279A2', '#FF9DA7', '#9D755D', '#BAB0AC'
  ]
  # When topic -1 exists, the first palette color is skipped for the displayed topics
  # (presumably so colors line up with charts where topic -1 takes the first color — TODO confirm)
  offset = 1 if -1 in topic_scores else 0

  # Drop topic -1, sort the IDs for a consistent order and keep the first topics only
  displayed_topic_ids = sorted(topic_id for topic_id in topic_scores if topic_id != -1)[:max_topics_to_display]
  top_terms = {topic_id: topic_scores[topic_id][:num_terms_to_display] for topic_id in displayed_topic_ids}

  # Largest displayed score, used to give every topic chart the same x-axis domain
  max_score = max((score for terms in top_terms.values() for _, score in terms), default=0)
  # Add a small buffer to the max score; fall back to 0.4 when there is no positive score
  x_axis_domain_max = max_score * 1.1 if max_score > 0 else 0.4

  charts = []
  for i, topic_id in enumerate(displayed_topic_ids):
    df = pd.DataFrame(top_terms[topic_id], columns=['word', 'score'])
    df['topic'] = f'Topic {topic_id}'

    # Parenthesize before the modulo: `i + offset % len(colors)` would only wrap
    # `offset` and index past the end of the palette for large `i`
    color_for_topic = colors[(i + offset) % len(colors)]

    chart = alt.Chart(df) \
    .mark_bar(cornerRadius=3) \
    .encode(
        x=alt.X('score', title=None, scale=alt.Scale(domain=(0, x_axis_domain_max)), axis=alt.Axis(labelFontSize=12)),
        y=alt.Y('word', sort='-x', title=None, axis=alt.Axis(labels=True, ticks=False, labelFontSize=14)),
        color=alt.value(color_for_topic),
        tooltip=['word', 'score']
    ) \
    .properties(title=alt.Title(f'Topic {topic_id}', fontSize=16),
                width=150,
                height=125)

    charts.append(chart)

  # NOTE(review): when no topic qualifies (empty input, max_topics_to_display == 0 or
  # only topic -1 present) `charts` is empty and no fallback chart is produced here.

  # Concatenate all individual topic charts horizontally
  final_chart = alt \
  .hconcat(*charts, spacing=10) \
  .resolve_scale(x='shared') \
  .properties(title=alt.Title("Topic Word Scores", anchor="middle", fontSize=26, dy=-10)) \
  .configure_axis(grid=True, gridColor='#E0E0E0') \
  .configure_view(stroke=None) # Remove border around the entire plot

  return final_chart


def plot_topics_scatterplot(topic_model: BERTopic,
                            encoded_docs: np.ndarray,
                            summaries: list[str],
                            topic_label_map: dict,
                            random_seed: int) -> alt.Chart:
  """Plots documents in a 2D scatter plot, colored by their assigned topic.

  This function takes document embeddings, reduces their dimensionality to 2D using UMAP,
  and then creates an Altair scatter plot. Each point represents a document, and its
  color indicates its topic. Topic centroids (mean 2D position of the documents of each
  topic) are also plotted with labels, except for topic -1.

  Args:
      topic_model (BERTopic): The trained BERTopic model, used to get topic assignments.
      encoded_docs (np.ndarray): A NumPy array of document embeddings.
      summaries (list[str]): A list of document summaries, used for tooltips.
      topic_label_map (dict): A dictionary mapping topic IDs to their human-readable labels.
          The part of each label after the first ': ' is used as the centroid label.
      random_seed (int): A seed for reproducibility in UMAP.

  Returns:
      alt.Chart: An Altair chart object representing the 2D scatter plot of documents,
                  colored by topic, with topic centroid labels. The chart is interactive.
  """

  topic_per_doc = topic_model.topics_
  df = pd.DataFrame({'topic': topic_per_doc, 'summary': summaries})

  # Reduce dimensionality of embeddings
  umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine', random_state=random_seed).fit(encoded_docs)
  embeddings_2d = umap_model.embedding_

  # Combine data
  df['x'] = embeddings_2d[:, 0]
  df['y'] = embeddings_2d[:, 1]

  # Add topic labels for visualization (legend)
  df['topic_label'] = df['topic'].map(topic_label_map)

  centroids = df.groupby('topic')[['x', 'y']].mean().reset_index()
  # Add topic labels for visualization (centroids)
  # Same as labels in legend, but without the prefix "Topic j:".
  # Split on the first ': ' only, so a label that contains ': ' itself is not truncated
  centroids['topic_label'] = centroids['topic'].map(topic_label_map).str.split(': ', n=1).str[1]

  # Create the Altair plot
  base = alt.Chart(df) \
  .encode(
      x=alt.X('x',
              title='Dimension 1',
              scale=alt.Scale(domain=[df['x'].min() - 0.5, df['x'].max() + 0.5]) # Pad the domain around the data
              ),
      y=alt.Y('y',
              title='Dimension 2',
              scale=alt.Scale(domain=[df['y'].min() - 0.5, df['y'].max() + 0.5]) # Pad the domain around the data
              ),
      color=alt.Color('topic_label', title='Topic'), # Use topic_label for the legend, with title 'Topic'
      tooltip=['topic_label', 'summary']  # Include topic label in the tooltip
  ) \
  .properties(
      title='Document Representation in 2-D space',
      width=700,
      height=400
  ) \
  .interactive()

  points = base.mark_circle(size=100, stroke='black', strokeWidth=2, opacity=.9) # Large points with a black border

  # Add labels for the centroids, exclude label for topic -1
  labels = alt.Chart(centroids) \
  .mark_text(
      align='left',
      baseline='middle',
      dx=7,  # Nudge labels to the right
      dy=-8,  # Nudge labels slightly up
      fontWeight='bold'
  ) \
  .encode(
      x='x',
      y='y',
      text=alt.condition(  # Use a condition to control the text
          alt.datum.topic != -1,  # If topic is not -1...
          'topic_label',         # ...display the topic_label
          alt.value('')       # ...otherwise, display an empty string
      ),
      color=alt.value('black')
  ) \
  .interactive()

  chart = points + labels
  topic_scatter_chart = chart.configure_title(
      fontSize=18,
      font='Arial',
      color='black'
  )

  return topic_scatter_chart



def generate_topic_results(state: dict) -> tuple[plotlyFigure, alt.Chart, gradioTextbox, gradioTextbox, gradioTextbox, gradioTextbox]:
  """Builds the charts and topic-title textboxes for the topic analysis results.

  Reads the fitted topic model from `state`, then produces a bar chart with the
  top words per topic, a 2D scatter plot of the documents colored by topic, and
  up to four short topic titles. For the 'Tesla (example)' query the titles are
  read from a bundled text file; otherwise they are generated with
  `get_topic_titles`.

  Args:
      state (dict): A dictionary representing the current state of the Gradio application.
                    It is expected to contain:
                    - 'topic_model' (BERTopic): The trained BERTopic model.
                    - 'random_seed' (int): The random seed used for reproducibility.
                    - 'list_metadata' (list[dict]): A list of dictionaries with article metadata,
                      each containing a 'summary' key.
                    - 'encoded_docs' (np.ndarray): The encoded document embeddings.
                    - 'search_query' (str): The current search query, used to detect the example.

  Returns:
      tuple: Six elements, in this order:
          - The topic word-score bar chart returned by `plot_topic_word_scores`
            (an Altair chart, even though the signature annotates it as a Plotly figure).
          - alt.Chart: The scatter plot of documents in 2D space, colored by topic.
          - Four gradioTextbox components. The first ones show the topic titles
            (labelled 'Topic 0', 'Topic 1', ...); when fewer than four titles
            exist, the remaining textboxes are invisible.
  """

  fitted_model = state['topic_model']
  seed = state['random_seed']
  doc_summaries = [article['summary'] for article in state['list_metadata']]
  doc_embeddings = state['encoded_docs']

  topic_info = fitted_model.get_topic_info()

  # Legend labels: "Topic <id>: <label>", with a "(noise)" marker on topic -1
  legend_labels = {}
  for topic_id, custom_name in zip(topic_info['Topic'], topic_info['CustomName']):
    noise_suffix = ' (noise)' if topic_id == -1 else ''
    legend_labels[topic_id] = f'Topic {topic_id}: {custom_name}{noise_suffix}'

  # barchart
  topic_barchart = plot_topic_word_scores(fitted_model.get_topics(), max_topics_to_display=4, num_terms_to_display=5)

  # scatterplot
  topic_scatterplot = plot_topics_scatterplot(fitted_model, doc_embeddings, doc_summaries, legend_labels, seed)

  # topic titles
  if state['search_query'] != 'Tesla (example)':
    topic_titles = get_topic_titles(topic_info, fitted_model)
  else:
    with open('examples/example_topic_titles_tesla.txt', 'r') as titles_file:
      topic_titles = titles_file.read().splitlines()

  # One visible textbox per title, padded with hidden textboxes up to four
  title_boxes = [
      gradioTextbox(label=f'Topic {position}', value=title, visible=True)
      for position, title in enumerate(topic_titles)
  ]
  padding_boxes = [gradioTextbox(visible=False) for _ in range(4 - len(topic_titles))]
  first_box, second_box, third_box, fourth_box = (title_boxes + padding_boxes)[:4]

  return topic_barchart, topic_scatterplot, first_box, second_box, third_box, fourth_box

Overwriting src/topicanalysis.py


## Gradio app

In [14]:
# %%writefile app_brand_monitoring.py

# from constants import MAX_ARTICLES, MAX_DAYS_AGO
# from newsapiextraction import generate_preliminary_search_results
# from diffbotextraction import generate_final_results, download_docs, download_metadata
# from sentimentanalysis import generate_sentiment_results, plot_sentiment_over_time
# from topicanalysis import run_topic_analysis, generate_topic_results
import gradio as gr


def load_top_brands() -> list[str]:
  """Return the brand choices: first 50 lines of the brands file plus the example entry, sorted.

  Returns:
      list[str]: The first 50 lines of 'examples/top_brands.txt' together with
                 'Tesla (example)', in ascending string order.
  """

  with open('examples/top_brands.txt') as brands_file:
    brand_names = brands_file.read().splitlines()[:50]

  return sorted(brand_names + ['Tesla (example)'])


def allow_topic_analysis(n_articles_label: str) -> gr.Button:
  """Unlock the topic analysis button only when at least 6 articles are available.

  Args:
      n_articles_label (str): Number of articles; it is converted with `int()`.

  Returns:
      gr.Button: An interactive "run" button when the count is 6 or more,
                 otherwise a non-interactive button explaining the sample is too small.
  """

  has_enough_articles = int(n_articles_label) >= 6
  if not has_enough_articles:
    return gr.Button(value='Not enough sample to run topic analysis 🔒', interactive=False)

  return gr.Button(value='Run topic analysis 🔓', interactive=True)

# Build the Gradio interface: three tabs (search, sentiment analysis, topic analysis)
# followed by the event wiring that connects buttons to the analysis functions.
custom_theme = gr.Theme.load('gradio_theme/JohnSmith9982small_and_pretty.json')
with gr.Blocks(theme=custom_theme) as demo:
  # State dictionary passed into and returned by the event handlers below
  app_state = gr.State({})
  top_brands = load_top_brands()

  gr.Markdown('# Brand Monitoring Tool')

  # ---- Tab 1: search and content extraction ----
  with gr.Tab('Search page') as search_page_tab:
    with gr.Row():
      gr.Markdown('Stay updated with the latest news about the most important brands. Select a brand, search for related articles, and analyze the content to gain insights into sentiment, article sources, and more.')
    with gr.Row():
      # Left column: search controls and preliminary results
      with gr.Column():
        with gr.Row():
          search_query = gr.Dropdown(
              label='Brand selection',
              choices=top_brands,
              interactive=True,
              value='Tesla (example)',
              scale=2
          )
          search_filter = gr.CheckboxGroup(
              label='Search matches in ...',
              choices=['title', 'description'],
              value=['title', 'description'],
              interactive=True,
              scale=1
          )
        n_days_ago = gr.Number(
            # Label derived from the constant so it cannot drift from the real bound
            label=f'Get articles from last n days (max = {MAX_DAYS_AGO})',
            minimum=1,
            maximum=MAX_DAYS_AGO, # upper bound limited to MAX_DAYS_AGO
            step=1,
            value=MAX_DAYS_AGO,
            interactive=True
        )
        max_articles = gr.Number(
            label=f'Maximum number of articles to analyze (max = {MAX_ARTICLES})',
            minimum=1,
            maximum=MAX_ARTICLES, # upper bound limited to MAX_ARTICLES
            step=5,
            value=MAX_ARTICLES,
            interactive=True
        )
        search_button = gr.Button('Search 🔍')
        with gr.Accordion('Preliminary search results', open=False) as newsapi_accordion:
          with gr.Row():
            number_of_preliminary_articles = gr.Label(label='# of retrieved articles')
            number_of_preliminary_sources = gr.Label(label='# of different sources')
          preliminary_results = gr.DataFrame(
              label='Retrieved articles (This list might not be definitive)',
              headers=['Title', 'Date', 'Source'],
              col_count=3,
              column_widths=['60%', '20%', '20%'],
              show_row_numbers=False,
              wrap=True,
              datatype='markdown'
          )
          # Hidden until a search has been run (see the search_button event below)
          extract_button = gr.Button('Extract content from articles', icon=None, visible=False)

      # Right column: extraction results; created hidden and updated by the extract event
      with gr.Column(visible=False) as extraction_results:
        with gr.Row():
          number_of_articles = gr.Label(label='# of extracted articles')
          number_of_sources = gr.Label(label='# of different sources')
          average_sentiment = gr.Label(label='Avg sentiment (-1 to 1)')

        sources_plot = gr.BarPlot(
            x='source',
            y='count',
            title='Most frequent sources',
            x_title='Source',
            y_title='Number of articles',
            x_label_angle=20
        )

        categories_plot = gr.BarPlot(
            x='Category',
            y='proportion',
            title='Most frequent categories',
            x_title='Category',
            y_title='% of ocurrences',
            x_label_angle=20
        )

        extracted_articles = gr.DataFrame(
            label='Extracted articles',
            headers=['Title', 'Date', 'Source'],
            show_search='filter',
            col_count=3,
            column_widths=['60%', '20%', '20%'],
            show_row_numbers=False,
            wrap=True,
            datatype='html',
            interactive=False
        )
        with gr.Row() as downloads:
          download_docs_button = gr.DownloadButton(
              label='Download documents',
              inputs=search_query,
              value=download_docs
          )
          download_metadata_button = gr.DownloadButton(
              label='Download metadata',
              inputs=search_query,
              value=download_metadata
          )

  # ---- Tab 2: sentiment analysis ----
  with gr.Tab('Sentiment analysis') as sentiment_analysis_tab:
    gr.Markdown('Dive deeper into the sentiment behind brand news. Explore the best and worst articles based on sentiment, view a distribution of overall sentiment, and track how sentiment evolves over time.')
    with gr.Row():
      # Empty Markdown components on both sides of the button act as spacers
      gr.Markdown('')
      sentiment_results_button = gr.Button('Run sentiment analysis 🔒', interactive=False, size='md')
      gr.Markdown('')
    with gr.Column():
      with gr.Row():
        sentiment_by_source = gr.DataFrame(
            headers=['Source', 'Average sentiment', 'Frequency']
        )
        sentiment_distribution_plot = gr.BarPlot(
            x='Sentiment',
            y='% of ocurrences',
            title='Sentiment distribution',
            scale=1,
            height=400
        )
      with gr.Row():
        with gr.Column():
          worst_article_df = gr.DataFrame(
              headers=['Variable', 'Worst article'],
              column_widths=['20%', '80%']
          )
          worst_article_summary = gr.Textbox(label='Summary')
        with gr.Column():
          best_article_df = gr.DataFrame(
              headers=['Variable', 'Best article'],
              column_widths=['20%', '80%']
          )
          best_article_summary = gr.Textbox(label='Summary')

      sentiment_evolution_plot = gr.Plot(label='Sentiment evolution over time')

  # ---- Tab 3: topic analysis ----
  with gr.Tab('Topic analysis') as topic_analysis_tab:
    gr.Markdown('Uncover the key themes driving conversations around your brand. Articles are automatically clustered into distinct topics with AI-generated names, helping you identify the main areas of focus and trends.')
    with gr.Row():
      gr.Markdown('')
      topic_results_button = gr.Button('Run topic analysis 🔒', interactive=False, size='md')
      gr.Markdown('')
    gr.Markdown('### Top most relevant themes')
    with gr.Column():
      with gr.Row():
        # Hidden placeholders; generate_topic_results returns the textboxes to show
        topic0 = gr.Textbox(label='Topic 0', visible=False)
        topic1 = gr.Textbox(label='Topic 1', visible=False)
        topic2 = gr.Textbox(label='Topic 2', visible=False)
        topic3 = gr.Textbox(label='Topic 3', visible=False)
      topic_analysis_barchart = gr.Plot(label='Top words per topic')
    topic_analysis_scatter = gr.Plot(label='Documents in 2-D space')

  # Events #

  # main page events
  # Search, then reveal and unlock the extraction button
  search_button.click(
      fn=generate_preliminary_search_results,
      inputs=[search_query, n_days_ago, search_filter, app_state],
      outputs=[
          preliminary_results,
          number_of_preliminary_articles,
          number_of_preliminary_sources,
          newsapi_accordion,
          app_state
      ]
  ).then(
      lambda _: gr.update(value='Extract content from articles 🔓', visible=True, interactive=True),
      extract_button,
      extract_button
  )

  # Extract, then lock the extraction button, unlock sentiment analysis and decide
  # whether topic analysis is allowed from the number of extracted articles
  extract_button.click(
      fn=generate_final_results,
      inputs=[max_articles, app_state],
      outputs=[
          extracted_articles,
          number_of_articles,
          number_of_sources,
          average_sentiment,
          sources_plot,
          categories_plot,
          extraction_results,
          app_state
      ],
      show_progress='full',
      trigger_mode='once'
  ).then(
      lambda _: gr.update(value='Extract content from articles 🔒', interactive=False),
      extract_button,
      extract_button
  ).then(
      lambda _: gr.update(value='Run sentiment analysis 🔓', interactive=True),
      sentiment_results_button,
      sentiment_results_button
  ).then(
      allow_topic_analysis,
      number_of_articles,
      topic_results_button
  )

  # sentiment analysis
  sentiment_results_button.click(
      fn=generate_sentiment_results,
      inputs=[app_state],
      outputs=[
          sentiment_by_source,
          sentiment_distribution_plot,
          worst_article_df,
          worst_article_summary,
          best_article_df,
          best_article_summary
      ]
  ).then(
      plot_sentiment_over_time,
      inputs=[app_state],
      outputs=[sentiment_evolution_plot]
  ).then(
      lambda _: gr.update(value='Run sentiment analysis 🔒', interactive=False),
      sentiment_results_button,
      sentiment_results_button
  )

  # topic analysis
  topic_results_button.click(
      fn=run_topic_analysis,
      inputs=[app_state],
      outputs=[app_state]
  ).then(
      fn=generate_topic_results,
      inputs=[app_state],
      outputs=[
          topic_analysis_barchart,
          topic_analysis_scatter,
          topic0,
          topic1,
          topic2,
          topic3
      ]
  ).then(
      # The button is disabled here, so show the locked icon like the other buttons do
      lambda _: gr.update(value='Run topic analysis 🔒', interactive=False),
      topic_results_button,
      topic_results_button
  )


demo.queue().launch(share=True, debug=True, allowed_paths=['./downloads'])

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://91d2336be8094138d8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://91d2336be8094138d8.gradio.live


