# Recommender Systems

Work with sample data from news articles, build vector embeddings out of the article titles. Then build recommender system searching through all articles titles and retrieve titles that are most relevant. Then build recommender system based on article content.

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm, trange
from DLAIUtils import Utils

import pandas as pd
import time
import os

In [3]:
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
OPENAI_API_KEY = utils.get_openai_api_key()

In [5]:
# Load Dataset
# ! unzip ../public/data/all-the-news-3.zip

Archive:  ../public/data/all-the-news-3.zip
  inflating: all-the-news-3.csv      


In [7]:
# Open file and read it
with open('./all-the-news-3.csv', 'r') as file:
    header = file.readline()
    print(header)

date,year,month,day,author,title,article,url,section,publication



In [8]:
# Lets make it more convenient by using a dataframe from pandas package
df = pd.read_csv('./all-the-news-3.csv', nrows=99)
df.head()

Unnamed: 0,date,year,month,day,author,title,article,url,section,publication
0,2016-12-09 18:31:00,2016,12.0,9,Lee Drutman,We should take concerns about the health of li...,"This post is part of Polyarchy, an independent...",https://www.vox.com/polyarchy/2016/12/9/138983...,,Vox
1,2016-10-07 21:26:46,2016,10.0,7,Scott Davis,Colts GM Ryan Grigson says Andrew Luck's contr...,The Indianapolis Colts made Andrew Luck the h...,https://www.businessinsider.com/colts-gm-ryan-...,,Business Insider
2,2018-01-26 00:00:00,2018,1.0,26,,Trump denies report he ordered Mueller fired,"DAVOS, Switzerland (Reuters) - U.S. President ...",https://www.reuters.com/article/us-davos-meeti...,Davos,Reuters
3,2019-06-27 00:00:00,2019,6.0,27,,France's Sarkozy reveals his 'Passions' but in...,PARIS (Reuters) - Former French president Nico...,https://www.reuters.com/article/france-politic...,World News,Reuters
4,2016-01-27 00:00:00,2016,1.0,27,,Paris Hilton: Woman In Black For Uncle Monty's...,Paris Hilton arrived at LAX Wednesday dressed ...,https://www.tmz.com/2016/01/27/paris-hilton-mo...,,TMZ


## Setup Pinecone 

In [9]:
openai_client = OpenAI(api_key=OPENAI_API_KEY)
util = Utils()
INDEX_NAME = utils.create_dlai_index_name('dl-ai')
pinecone = Pinecone(api_key=PINECONE_API_KEY)

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'))

index = pinecone.Index(INDEX_NAME)

## Create embeddings of the news titles 

In [10]:
def get_embeddings(articles, model="text-embedding-ada-002"):
   return openai_client.embeddings.create(input = articles, model=model)

In [13]:
# Create chunks as subsets of the data from the articles dataset and upload them to the Pinecone index
CHUNK_SIZE=400
TOTAL_ROWS=20000
progress_bar = tqdm(total=TOTAL_ROWS)
chunks = pd.read_csv('./all-the-news-3.csv', chunksize=CHUNK_SIZE, 
                     nrows=TOTAL_ROWS)
chunk_num = 0
for chunk in chunks:
    titles = chunk['title'].tolist()
    embeddings = get_embeddings(titles)
    prepped = [{'id':str(chunk_num*CHUNK_SIZE+i), 'values':embeddings.data[i].embedding,
                'metadata':{'title':titles[i]},} for i in range(0,len(titles))]
    chunk_num = chunk_num + 1
    if len(prepped) >= 300:
      index.upsert(prepped)
      prepped = []
    progress_bar.update(len(chunk))

  0%|          | 0/20000 [00:00<?, ?it/s]

In [14]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20000}},
 'total_vector_count': 20000}

## Build Recommender System 

In [16]:
# Helper function takes a pointer to pinecone, what you want to search for and how many results you want
def get_recommendations(pinecone_index, search_term, top_k=10):
  embed = get_embeddings([search_term]).data[0].embedding
  #  Executes and returns the search results
  res = pinecone_index.query(vector=embed, top_k=top_k, include_metadata=True)
  return res

In [17]:
reco = get_recommendations(index, 'obama')
for r in reco.matches:
    print(f'{r.score} : {r.metadata["title"]}')

0.856704175 : Obama's been a much more effective communicator than he gives himself credit for
0.85247159 : America will miss Barack Obama's decency
0.851420879 : Barack Obama Golfs in Tuscany
0.849604547 : Barack Obama just stepped off the sidelines to defend Obamacare
0.849476159 : Obama: if you were fine with big government until it served black people, rethink your biases
0.848955095 : “Our democracy is at stake”: Obama delivers his first post-presidency campaign speech
0.848202527 : President Obama has a new plan to fight the opioid epidemic
0.847667515 : Muhammad Ali -- Obama Praises The Greatest
0.844964087 : Prez Obama -- I Support Colin Kaepernick's National Anthem Protest (VIDEO)


## Create embeddings of all new content 

Look for vector embeddings that represent the content of the article

In [18]:
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(name=INDEX_NAME)

pinecone.create_index(name=INDEX_NAME, dimension=1536, metric='cosine',
  spec=ServerlessSpec(cloud='aws', region='us-west-2'))
articles_index = pinecone.Index(INDEX_NAME)

In [19]:

def embed(embeddings, title, prepped, embed_num):
  for embedding in embeddings.data:
    prepped.append({'id':str(embed_num), 'values':embedding.embedding, 'metadata':{'title':title}})
    embed_num += 1
    if len(prepped) >= 300:
        articles_index.upsert(prepped)
        prepped.clear()
  return embed_num

In [23]:
news_data_rows_num = 1000

embed_num = 0 #keep track of embedding number for 'id'
# Chunk up article text into smaller pieces with overlap 
text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, 
    chunk_overlap=20) # how to chunk each article
prepped = []
df = pd.read_csv('./all-the-news-3.csv', nrows=news_data_rows_num)
articles_list = df['article'].tolist()
titles_list = df['title'].tolist()

# Iterate through all the articles and embed them through Langchain recursive character text splitter 
for i in tqdm(range(0, len(articles_list))):
    art = articles_list[i]
    title = titles_list[i]
    if art is not None and isinstance(art, str):
      texts = text_splitter.split_text(art)
      embeddings = get_embeddings(texts)
      embed_num = embed(embeddings, title, prepped, embed_num)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
articles_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10500}},
 'total_vector_count': 10500}

In [25]:
reco = get_recommendations(articles_index, 'obama', top_k=100)
seen = {}
for r in reco.matches:
    title = r.metadata['title']
    if title not in seen:
        print(f'{r.score} : {title}')
        seen[title] = '.'

0.842914104 : In separate speeches Obama, Biden, Carter target Trump policies
0.838885725 : Waffle House Hero James Shaw Jr. Says Trump's Call Was 'Lackluster' But Appreciated
0.838100433 : 'Cut it out,' Obama told Putin on cyber attacks
0.83676821 : The social soapbox: Democrats will embrace visual platforms at the 2016 Convention
0.831418157 : Government shutdown: are Democrats becoming more like Republicans?
0.831008136 : Obama-Trump meeting 'less awkward' than some might have expected: White House
0.827040255 : Obama's quiet push to ban LGBTQ discrimination in the workplace
0.825527072 : Do terrorist attacks make a Trump win more likely? Here’s what the research suggests.
0.825044274 : Virginia governor's election results: polls, live updates, and what to expect
0.824935436 : Medicare-for-all will be shaped by these 3 questions
0.824586391 : The game theory behind Mitch McConnell's Supreme Court strategy
0.824284792 : This is what Obamacare sabotage looks like
0.821054697 : Why Oba