In [None]:
import warnings
warnings.filterwarnings('ignore')

from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os


In [None]:
# Create the course helper once and ask it for both API keys, so no credentials
# are written into the notebook itself. (How Utils resolves the keys is defined
# in DLAIUtils, which is not shown here.)
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()


In [None]:
# Download the news dataset archive (quiet mode, progress bar only) and save it
# as all-the-news-3.zip in the current working directory.
!wget -q --show-progress -O all-the-news-3.zip "https://www.dropbox.com/scl/fi/wruzj2bwyg743d0jzd7ku/all-the-news-3.zip?rlkey=rgwtwpeznbdadpv3f01sznwxa&dl=1"

# Extract the archive. Later cells read ./data/all-the-news-3.csv, so the zip is
# presumably laid out with a data/ folder -- verify after extraction.
!unzip all-the-news-3.zip


In [None]:
# Peek at the first line of the CSV (normally the column names) without loading
# the whole file.
with open('./data/all-the-news-3.csv') as csv_file:
    header = csv_file.readline()
print(header)


In [None]:
# Load only the first 99 rows: a small sample for inspecting the columns.
df = pd.read_csv('./data/all-the-news-3.csv', nrows=99)
# Last expression of the cell, so the notebook renders the first five rows.
df.head()


In [None]:
# Set up the OpenAI client and a fresh Pinecone index.
# Reuses the `utils` helper created earlier in the notebook; the original cell
# built a second, never-used `util = Utils()` instance, which has been dropped.
openai_client = OpenAI(api_key=OPENAI_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')
pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean slate: if an index with this name already exists, delete
# it so the create call below does not collide with it when the cell is re-run.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(INDEX_NAME)

# `dimension` must equal the length of the vectors that will be upserted;
# 1536 is the size produced by the text-embedding-ada-002 model used below.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle used by the following cells to upsert and query vectors.
index = pinecone.Index(INDEX_NAME)


In [None]:
#creating embeddings of news titles and indexing them in Pinecone, so you can later query the index to find semantically similar articles

#Create Embeddings of the News Titles
def get_embeddings(articles, model="text-embedding-ada-002"):
    """Request embeddings for `articles` from the OpenAI embeddings endpoint.

    Args:
        articles: Text(s) to embed; forwarded unchanged as the `input` argument.
        model: Name of the embedding model to use.

    Returns:
        The response object of `openai_client.embeddings.create`. Callers in
        this notebook read the vectors via `response.data[i].embedding`.
    """
    response = openai_client.embeddings.create(model=model, input=articles)
    return response

CHUNK_SIZE = 400     # rows read from the CSV (and embedded in one request) per iteration
TOTAL_ROWS = 10000   # upper bound on the number of rows to index

# Stream the CSV chunk by chunk so the whole file never has to be in memory.
chunks = pd.read_csv('./data/all-the-news-3.csv', chunksize=CHUNK_SIZE,
                     nrows=TOTAL_ROWS)

# The context manager closes the progress bar even if a request fails midway.
with tqdm(total=TOTAL_ROWS) as progress_bar:
    for chunk_num, chunk in enumerate(chunks):
        # Embed all titles of this chunk in a single request.
        titles = chunk['title'].tolist()
        embeddings = get_embeddings(titles)

        # IDs are the row's position in the stream (chunk offset + index in chunk),
        # so they stay unique across chunks.
        first_id = chunk_num * CHUNK_SIZE
        prepped = [
            {'id': str(first_id + i),
             'values': item.embedding,
             'metadata': {'title': title}}
            for i, (title, item) in enumerate(zip(titles, embeddings.data))
        ]

        # Upsert every non-empty chunk. `prepped` is rebuilt on each iteration,
        # so the former `len(prepped) >= 200` gate silently discarded any chunk
        # shorter than 200 rows (e.g. a short final chunk).
        if prepped:
            index.upsert(prepped)

        # Advance the bar by the number of rows handled in this chunk.
        progress_bar.update(len(chunk))

# Last expression of the cell: display the index statistics after the upserts.
index.describe_index_stats()

In [None]:
#Build the Recommender System
def get_recommendations(pinecone_index, search_term, top_k=10):
    """Query `pinecone_index` for the entries closest to `search_term`.

    The search term is embedded with `get_embeddings` and the resulting vector
    is sent to the index. Metadata is requested as well, so callers can read
    each match's stored title.

    Args:
        pinecone_index: Index handle exposing a `query` method.
        search_term: Text to search for.
        top_k: Maximum number of matches to request.

    Returns:
        The query response; matches are available under `.matches`.
    """
    query_vector = get_embeddings([search_term]).data[0].embedding
    return pinecone_index.query(vector=query_vector, top_k=top_k, include_metadata=True)

# Look up the indexed titles that are closest to the search term 'obama'.
reco = get_recommendations(index, 'obama')

# One line per match: similarity score followed by the stored article title.
for match in reco.matches:
    score, matched_title = match.score, match.metadata["title"]
    print(f'{score} : {matched_title}')


In [2]:
#Create Embeddings of 'All' News Content

# Rebuild the index from scratch: an index with this name may already exist
# (for example the title index created earlier in this notebook), so remove it
# before creating the new one.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(
    name=INDEX_NAME,
    dimension=1536,
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-west-2'),
)

# Handle for the freshly created index; the article cells below use this one.
articles_index = pinecone.Index(INDEX_NAME)

def embed(embeddings, title, prepped, embed_num, batch_size=100, index=None):
    """Queue one vector per embedding and upsert the queue whenever it is full.

    Every entry of `embeddings.data` is appended to `prepped` as a record with
    a sequential string ID and `title` as metadata. As soon as `prepped` holds
    `batch_size` records it is upserted and emptied in place.

    Note: records that do not fill a complete batch remain in `prepped` when
    this function returns. The caller must upsert that remainder once all
    articles have been processed.

    Args:
        embeddings: Embeddings response; `.data` is iterated and each item's
            `.embedding` is used as the vector values.
        title: Article title stored as metadata on every record.
        prepped: Mutable list used as the pending batch; modified in place.
        embed_num: ID to assign to the first record.
        batch_size: Number of pending records that triggers an upsert.
        index: Object with an `upsert` method; defaults to the notebook-level
            `articles_index` when omitted.

    Returns:
        The next unused ID, i.e. `embed_num` plus the number of records queued.
    """
    for item in embeddings.data:
        prepped.append({'id': str(embed_num),
                        'values': item.embedding,
                        'metadata': {'title': title}})
        embed_num += 1
        if len(prepped) >= batch_size:
            # Resolve the default lazily so the global is only needed on flush.
            target = articles_index if index is None else index
            target.upsert(prepped)
            prepped.clear()
    return embed_num


In [None]:
# Split each news article into overlapping text chunks, embed the chunks and
# upsert them into the Pinecone index in batches.

# Number of news articles to process.
news_data_rows_num = 100

# Running counter that supplies the ID of each embedding.
embed_num = 0

# Splitter that cuts an article into chunks of the given size and overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)

# Buffer of records waiting to be upserted; `embed` flushes it each time it fills.
prepped = []

df = pd.read_csv('./data/all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

for art, title in tqdm(zip(articles_list, titles_list), total=len(articles_list)):
    # Missing articles are not strings (pandas yields NaN), so skip them.
    if not isinstance(art, str):
        continue
    texts = text_splitter.split_text(art)
    # Nothing to embed for an article that produced no chunks; skipping also
    # avoids sending an empty input list to the embeddings endpoint.
    if not texts:
        continue
    embeddings = get_embeddings(texts)
    embed_num = embed(embeddings, title, prepped, embed_num)

# `embed` only upserts full batches, so the final partial batch is still sitting
# in `prepped`. Flush it here, otherwise those chunks never reach the index.
if prepped:
    articles_index.upsert(prepped)
    prepped.clear()

In [None]:
# Display the statistics reported for the articles index, as a sanity check on
# the upserts performed in the previous cell.
articles_index.describe_index_stats()

In [None]:
# Request many matches (top_k=100): several chunks of the same article can all
# match the query, and the loop below collapses them to one line per title.
reco = get_recommendations(articles_index, 'obama', top_k=100)

# Titles already printed. A set is the natural container for membership checks
# (the original used a dict with dummy values for the same purpose).
seen = set()
for match in reco.matches:
    title = match.metadata['title']
    if title in seen:
        continue
    seen.add(title)
    # Only the first match per title is printed.
    print(f'{match.score} : {title}')