In [1]:
import numpy as np
import pandas as pd
from deltalake import DeltaTable
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Selecting sample cases
- First, let's manually select a few sample news events to use for the consolidation task.

##### Selected news events
- 67294e199902a3465058b1ba - Judge declines to block Musk’s $1 million voter giveaways
- 6724ba5a9aa896701328e02c - Jennifer Lopez Endorses Kamala Harris for President, Blasts Trump
- 6724c9c69aa896701328e087 - NFT Developers Plead Guilty to $400,000 'Rug Pull’, Laundering and Wire Fraud

In [3]:
# Article IDs of the three hand-picked sample news events.
case1_id, case2_id, case3_id = (
  '67294e199902a3465058b1ba',  # Judge declines to block Musk's $1 million voter giveaways
  '6724ba5a9aa896701328e02c',  # Jennifer Lopez endorses Kamala Harris
  '6724c9c69aa896701328e087',  # NFT developers plead guilty to 'rug pull'
)

## A simple approach (Trial 1)

- Generate an embedding of each article's content and use a vector database to search for similar articles.

- Embedding model: `all-mpnet-base-v2`
- Vector database: `ChromaDB`

In [4]:
# Trial 1 setup: project-local wrappers for the vector store and the embedding model.
from scripts.db_1 import Database
from scripts.embeddings_trial1 import Embeddings

# 'articles' is presumably the collection name — TODO confirm against scripts/db_1.
db = Database('articles')
# Presumably loads the `all-mpnet-base-v2` model named above — verify in scripts/embeddings_trial1.
embed_model = Embeddings()

In [21]:
# Trial 1 ingestion: embed every article's full content and store the vector
# under the article's index label in `df` (used as the document ID).
for article_id, article in tqdm(df.iterrows(), desc="Adding articles to Database", total=len(df)):
  embedding = embed_model.generate_embeddings(article_content=article['content'])
  if embedding is None:
    # Name the failing article so a long ingestion run can be diagnosed.
    raise ValueError(f"Embedding is None for article {article_id!r}")
  # NOTE(review): the ID is passed as both the first and the third argument.
  # The headline-matching cell passes the article content third, so the third
  # slot is presumably the document text — confirm this is intentional.
  db.insert_document_by_embedding([article_id], [embedding], [article_id])

Adding articles to Database: 100%|██████████| 2080/2080 [23:52<00:00,  1.45it/s]


In [22]:
# Shape of the last embedding produced by the ingestion loop.
embedding.shape

(768,)

- Case 1

In [10]:
# Case 1: fetch the stored record for the first sample article, then query the
# vector store with that record's embedding, passing 30 as the second argument
# (presumably the number of results — the output below lists 30 IDs).
case1 = db.get_document([case1_id])

search_results = db.search_nearest_documents(case1['embeddings'], 30)


In [11]:
# Inspect the raw record returned for case 1.
case1

{'ids': ['67294e199902a3465058b1ba'],
 'embeddings': array([[ 2.64897533e-02,  1.25086278e-01, -2.42463537e-02,
          8.36910158e-02, -1.15052313e-02, -7.18016468e-04,
         -6.85784668e-02,  6.14656415e-03, -2.08123270e-02,
         -4.92500290e-02,  3.46640646e-02, -2.28095539e-02,
          1.57090388e-02, -1.13801667e-02, -1.34485296e-03,
         -1.19611677e-02, -9.04761720e-03, -5.18046096e-02,
          5.97322620e-02, -6.35935813e-02, -4.12254594e-02,
         -3.02789751e-02, -7.14915060e-03,  1.41456593e-02,
          4.54695225e-02,  2.22552381e-02,  1.79047789e-02,
         -1.28367255e-02, -3.22326459e-02, -1.61981564e-02,
          1.23754377e-02,  2.16254368e-02, -4.42019068e-02,
          1.24177756e-03,  2.11034717e-06, -5.15646338e-02,
          2.82377340e-02,  6.97486941e-03,  3.14783938e-02,
          1.44725954e-02, -1.20117068e-02,  6.50139302e-02,
         -4.64815088e-02,  1.03424378e-02, -3.69942212e-03,
         -5.05360663e-02,  6.81113126e-03,  3.86

In [12]:
# Inspect the raw nearest-neighbour search result for case 1.
search_results

{'ids': [['67294e199902a3465058b1ba',
   '672917219902a3465058b0ba',
   '6729446c9902a3465058b198',
   '6729083f9902a3465058b069',
   '672912229902a3465058b098',
   '6729136e9902a3465058b0a1',
   '672912619902a3465058b09b',
   '672946ef9902a3465058b1a4',
   '67292daa9902a3465058b12e',
   '67293a219902a3465058b16b',
   '672936bb9902a3465058b155',
   '67292ecd9902a3465058b133',
   '67294bcb9902a3465058b1b2',
   '67293b969902a3465058b170',
   '67293e959902a3465058b17e',
   '6728f3ff9902a3465058b00b',
   '672953fd9902a3465058b1d0',
   '67291ab79902a3465058b0d5',
   '67291b739902a3465058b0df',
   '672918349902a3465058b0c3',
   '6724d5fd9aa896701328e0c5',
   '6724d8139aa896701328e0d2',
   '6728a2b89902a3465058aee2',
   '672902cd9902a3465058b04f',
   '67279a629902a3465058acde',
   '6728b0e49902a3465058af0b',
   '672722cf9902a3465058abbe',
   '6728d7e59902a3465058af81',
   '67272d4d9902a3465058abf0',
   '672508929aa896701328e1e7']],
 'embeddings': [array([[ 0.02648975,  0.12508628, -0.02424635

In [13]:
# Embedding of the first hit for the first (only) query.
# NOTE(review): the first hit appears to be the query article itself — its ID
# and leading values match `case1` in the outputs above.
search_results['embeddings'][0][0]

array([ 2.64897533e-02,  1.25086278e-01, -2.42463537e-02,  8.36910158e-02,
       -1.15052313e-02, -7.18016468e-04, -6.85784668e-02,  6.14656415e-03,
       -2.08123270e-02, -4.92500290e-02,  3.46640646e-02, -2.28095539e-02,
        1.57090388e-02, -1.13801667e-02, -1.34485296e-03, -1.19611677e-02,
       -9.04761720e-03, -5.18046096e-02,  5.97322620e-02, -6.35935813e-02,
       -4.12254594e-02, -3.02789751e-02, -7.14915060e-03,  1.41456593e-02,
        4.54695225e-02,  2.22552381e-02,  1.79047789e-02, -1.28367255e-02,
       -3.22326459e-02, -1.61981564e-02,  1.23754377e-02,  2.16254368e-02,
       -4.42019068e-02,  1.24177756e-03,  2.11034717e-06, -5.15646338e-02,
        2.82377340e-02,  6.97486941e-03,  3.14783938e-02,  1.44725954e-02,
       -1.20117068e-02,  6.50139302e-02, -4.64815088e-02,  1.03424378e-02,
       -3.69942212e-03, -5.05360663e-02,  6.81113126e-03,  3.86694111e-02,
       -2.73975600e-02, -4.49131727e-02,  1.90080609e-02,  5.22376113e-02,
        3.27154100e-02, -

In [16]:
def articles_similarity_summary(case_id, n, flag_value=0.5):
  """Print a table of the articles returned as nearest to a reference article.

  Relies on the notebook-level globals `db` (vector store wrapper), `df`
  (article table indexed by article ID) and `embed_model` (embedding wrapper).

  Args:
    case_id: ID of the reference article, looked up via `db.get_document`.
    n: Passed to `db.search_nearest_documents` as the number of results.
    flag_value: Rows whose similarity is strictly greater than this value
      are printed in green.

  Raises:
    KeyError: If a returned ID is not present in the index of `df`.
  """
  case = db.get_document([case_id])
  query_embedding = case['embeddings']
  search_results = db.search_nearest_documents(query_embedding, n)

  print(f"\033[1m{'ID':<30}| {'Title':<90}| {'Source':<30}| {'Published Date':<30}| {'Similarity':<30}\033[0m")
  print(f"{'-'*30}+-{'-'*90}+-{'-'*30}+-{'-'*30}+-{'-'*30}")

  # Results are nested per query; index 0 is the single query issued above.
  neighbours = zip(search_results['ids'][0], search_results['embeddings'][0])
  for result_id, embedding in neighbours:
    # One label-based lookup per row instead of three boolean-mask scans of df.
    article = df.loc[[result_id]]
    doc_title = article['title'].values[0]
    if len(doc_title) > 70:
      # Keep long headlines inside the 90-character title column.
      doc_title = doc_title[:70] + '...'
    doc_source = article['source'].values[0]
    doc_date = article['publication_date'].values[0]

    # Convert the similarity result to a plain float once and reuse it.
    score = embed_model.model.similarity(query_embedding, embedding).item()

    row = f'{result_id:<30}| {doc_title:<90}| {doc_source:<30}| {doc_date:<30}| {score:<30.6f}'
    # Green ANSI colour marks rows above the flag threshold.
    print(f'\033[92m{row}\033[0m' if score > flag_value else row)

In [19]:
# Case 1: similarity table for the 30 nearest articles.
articles_similarity_summary(case_id=case1_id, n=30)

[1mID                            | Title                                                                                     | Source                        | Published Date                | Similarity                    [0m
------------------------------+-------------------------------------------------------------------------------------------+-------------------------------+-------------------------------+-------------------------------
[92m67294e199902a3465058b1ba      | Judge declines to block Musk’s $1 million voter giveaways                                 | The Verge                     | 2024-11-04T00:00:00.000000000 | 1.000000                      [0m
[92m672917219902a3465058b0ba      | Elon Musk’s PAC admits $1 million voter giveaways aren’t ‘random’                         | The Verge                     | 2024-11-04T00:00:00.000000000 | 0.925632                      [0m
[92m6729446c9902a3465058b198      | Judge denies Philadelphia DA's request to block Elon Musk's $

In [20]:
# Case 2: similarity table for the 120 nearest articles.
articles_similarity_summary(case_id=case2_id, n=120)

[1mID                            | Title                                                                                     | Source                        | Published Date                | Similarity                    [0m
------------------------------+-------------------------------------------------------------------------------------------+-------------------------------+-------------------------------+-------------------------------
[92m6724ba5a9aa896701328e02c      | Jennifer Lopez Endorses Kamala Harris for President, Blasts Trump                         | TMZ                           | 2024-11-01T00:00:00.000000000 | 1.000000                      [0m
[92m6724d0049aa896701328e0a2      | Jennifer Lopez Endorses Kamala Harris for President, Blasts Trump                         | TMZ                           | 2024-11-01T00:00:00.000000000 | 0.996461                      [0m
[92m6724faf89aa896701328e18d      | Jennifer Lopez goes after Trump comic's Puerto Rican dig as s

In [21]:
# Case 3: similarity table for the 50 nearest articles.
articles_similarity_summary(case_id=case3_id, n=50)

[1mID                            | Title                                                                                     | Source                        | Published Date                | Similarity                    [0m
------------------------------+-------------------------------------------------------------------------------------------+-------------------------------+-------------------------------+-------------------------------
[92m6724c9c69aa896701328e087      | NFT Developers Plead Guilty to $400,000 'Rug Pull’, Laundering and Wir...                 | Decrypt                       | 2024-11-01T12:28:54.000000000 | 1.000000                      [0m
[92m6724f7d29aa896701328e180      | Ethereum Gaming Token IMX Plunges After Immutable Reveals SEC Threat                      | Decrypt                       | 2024-11-01T15:46:08.000000000 | 0.722982                      [0m
[92m6724f8129aa896701328e182      | Ethereum Gaming Token IMX Plunges After Immutable Reveals SEC

In [22]:
# List the columns available on the articles DataFrame.
df.columns

Index(['title', 'author', 'publication_date', 'source', 'url', 'summary',
       'content', 'tags', 'categories', 'images'],
      dtype='object')

In [25]:
# Clear collection
# doc_ids = df.index.to_list()
# db.delete_document(doc_ids)

## Headline matching

In [4]:
# Headline-matching setup: re-create the vector-store and embedding wrappers.
from scripts.db_1 import Database
from scripts.embeddings_trial1 import Embeddings

# NOTE(review): same 'articles' name as in Trial 1. If this maps to a persistent
# collection, the Trial 1 content embeddings must be cleared first — confirm.
db = Database('articles')
# Same Embeddings wrapper as Trial 1; only the text being embedded changes below.
embed_model = Embeddings()

In [5]:
# Headline-matching ingestion: embed each article's title (not its content) and
# store the content as the document text alongside title/source/date metadata.
for article_id, article in tqdm(df.iterrows(), desc="Adding articles to Database", total=len(df)):
  metadata = {
    'title': article['title'],
    'source': article['source'],
    # Stored as a string rather than the raw publication_date value.
    'date': str(article['publication_date'])
  }
  embedding = embed_model.generate_embeddings(article_content=article['title'])
  if embedding is None:
    # Name the failing article so a long ingestion run can be diagnosed.
    raise ValueError(f"Embedding is None for article {article_id!r}")
  db.insert_document_by_embedding([article_id], [embedding], [article['content']], [metadata])

Adding articles to Database:   2%|▏         | 49/2080 [00:04<03:14, 10.46it/s]

Adding articles to Database: 100%|██████████| 2080/2080 [03:24<00:00, 10.17it/s]


In [26]:
%%time
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px

doc = db.get_document(case2_id)
documents = db.search_nearest_documents(doc['embeddings'], 500)
embeddings = np.array(documents['embeddings'])
metadatas = documents["metadatas"]
distances = documents['distances']

embeddings = embeddings[0, :, :]

# Step 2: Apply PCA
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embeddings)
tsne = TSNE(n_components=2, perplexity=100, n_jobs=5, random_state=42)
reduced_embeddings = tsne.fit_transform(reduced_embeddings)

# Step 3: Prepare data for Plotly
# Combine PCA results and metadata (if available) into a DataFrame
df = pd.DataFrame(reduced_embeddings, columns=["TSNE1", "TSNE2"])
if metadatas:
    metadata_df = pd.DataFrame(metadatas[0])
    df = pd.concat([df, metadata_df], axis=1)

# Step 4: Visualize using Plotly
fig = px.scatter(
    df,
    x="TSNE1",
    y="TSNE2",
    color=metadata_df["source"] if "source" in metadata_df else None,
    hover_name='title',
    hover_data=metadata_df,
    title="TSNE Visualization of Embeddings"
)
fig.show()

CPU times: user 3.95 s, sys: 11.7 ms, total: 3.97 s
Wall time: 3.77 s


In [27]:
%%time
# Step 4: Visualize using Plotly
fig = px.scatter(
    df,
    x="TSNE1",
    y="TSNE2",
    color=distances[0],
    hover_name='title',
    hover_data=metadata_df,
    title="TSNE Visualization of Embeddings"
)
fig.show()

CPU times: user 57.9 ms, sys: 0 ns, total: 57.9 ms
Wall time: 62.7 ms


In [14]:
# Preview the per-result metadata frame built in the t-SNE cell.
# NOTE(review): this cell's execution count (14) predates the t-SNE cell (26),
# so the stored output may come from an earlier run — re-run to confirm.
metadata_df

Unnamed: 0,date,source,title
0,2024-11-01 00:00:00,TMZ,Jennifer Lopez Endorses Kamala Harris for Pres...
1,2024-11-01 00:00:00,TMZ,Jennifer Lopez Endorses Kamala Harris for Pres...
2,NaT,Daily Mail,Jennifer Lopez goes after Trump comic's Puerto...
3,NaT,Daily Mail,Jennifer Lopez goes after Trump comic's Puerto...
4,NaT,Daily Mail,Jennifer Lopez goes after Trump comic's Puerto...
...,...,...,...
95,NaT,Popsugar,"As a Latina Daughter of Immigrants, I'm Voting..."
96,NaT,FOX News,"In battle against Trump, Harris crisscrosses b..."
97,NaT,FOX News,Trump election ad airs during NBC's NASCAR rac...
98,NaT,FOX News,Cruz: Harris made mistake by tapping Walz inst...
