In [1]:
import pandas as pd
import numpy as np
import json
import openai
import tiktoken
from bs4 import BeautifulSoup
import re
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity

### A clean inference pipeline where all experiments are removed

In [2]:
# Independent cell to define the embedding function; the embedding model defaults to OpenAI's text-embedding-3-small

import os

import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

# The GitHub version does not include the OpenAI API key. Export your own key as the
# OPENAI_API_KEY environment variable before starting the kernel; it is read here so
# the secret never has to be written into (or committed with) the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Give up after 6 attempts; waits between attempts use randomized exponential backoff (min=1, max=20 seconds)
@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=20))
def get_embedding(text: str, model='text-embedding-3-small') -> list[float]:
    """Request an embedding for ``text`` from the OpenAI embeddings endpoint.

    Args:
        text: The text to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``"embedding"`` field of the first entry in the response's ``"data"`` list.
    """
    response = openai.Embedding.create(model=model, input=text)
    first_result = response["data"][0]
    return first_result["embedding"]

In [3]:
# Load the GBH articles and give them a fresh 0..n-1 index
# Alternative source, kept for reference:
# unseen_articles = pd.read_csv('../datasets/Articles Nov 2020 - March 2023.csv', usecols=range(12))
# unseen_articles = unseen_articles.dropna(subset=['Body'])
# unseen_articles = unseen_articles.sample(n=5000, random_state=1)
unseen_articles = (
    pd.read_csv('../datasets/combined_dataset_new_labels.csv')
    .reset_index(drop=True)
)
print(unseen_articles.head(3))

                                                text           race_label  \
0  It has been nearly three years since auto plan...  N/A - not specified   
1  The Boston City Council Wednesday approved pro...  N/A - not specified   
2  So far this year, 16 people have been murdered...  N/A - not specified   

   race_discussed link  
0               0  NaN  
1               0  NaN  
2               0  NaN  


In [5]:
# Updated token count & truncating preprocessing, using OpenAI's tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under the tiktoken encoding ``encoding_name``."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

def truncate_to_max_tokens(string: str, encoding_name: str, max_tokens: int = 8192) -> str:
    """Return a leading slice of ``string`` that encodes to at most ``max_tokens`` tokens.

    The whole string is returned unchanged when it already fits. Otherwise the cut
    point is found by bisection over character positions, so the result is always a
    prefix of the input. Only slices that were actually measured as fitting are ever
    returned, so the result respects the limit (for ``max_tokens >= 0``) even if the
    token count is not perfectly monotonic in the prefix length.

    Args:
        string: Text to truncate.
        encoding_name: Encoding name forwarded to ``num_tokens_from_string``.
        max_tokens: Maximum number of tokens the result may encode to.

    Returns:
        ``string`` itself if it fits, otherwise the longest prefix the bisection
        found to fit (possibly the empty string).
    """
    # Fast path: nothing to cut. This also covers the empty string.
    if num_tokens_from_string(string, encoding_name) <= max_tokens:
        return string

    # Invariant: string[:fits] is within the limit, string[:too_long] is over it.
    # The previous `end = mid - 1` search could stop without testing the last
    # candidate and then fall through to returning the untruncated string.
    fits, too_long = 0, len(string)
    while too_long - fits > 1:
        mid = (fits + too_long) // 2
        if num_tokens_from_string(string[:mid], encoding_name) <= max_tokens:
            fits = mid
        else:
            too_long = mid
    return string[:fits]

encoding_name = "cl100k_base"
html_tag_pattern = re.compile('<.*?>')

# Drop articles without text BEFORE any string processing: re.sub raises TypeError on
# a missing (NaN) value, so dropping afterwards could never take effect. The explicit
# copy keeps the column assignments below from targeting a slice of the original frame.
unseen_articles = unseen_articles.dropna(subset=['text']).copy()

# Strip anything that looks like an HTML tag, then cap the text at 8192 tokens.
unseen_articles['text'] = unseen_articles['text'].apply(lambda x: html_tag_pattern.sub('', x))
unseen_articles['processed_body'] = unseen_articles['text'].apply(
    lambda x: truncate_to_max_tokens(x, encoding_name, 8192)
)

# One embedding request per article (get_embedding retries on failure).
unseen_articles['ada_embedding'] = unseen_articles['processed_body'].apply(get_embedding)

print(unseen_articles.head(3))

                                                text           race_label  \
0  It has been nearly three years since auto plan...  N/A - not specified   
1  The Boston City Council Wednesday approved pro...  N/A - not specified   
2  So far this year, 16 people have been murdered...  N/A - not specified   

   race_discussed link                                     processed_body  \
0               0  NaN  It has been nearly three years since auto plan...   
1               0  NaN  The Boston City Council Wednesday approved pro...   
2               0  NaN  So far this year, 16 people have been murdered...   

                                       ada_embedding  
0  [-0.017918458208441734, -0.022890731692314148,...  
1  [0.028521470725536346, 0.002924327738583088, 0...  
2  [0.048949964344501495, 0.06095076724886894, 0....  


In [10]:
# Alternative taxonomy: the client's list of topics, one label per row, each embedded with get_embedding

# client_taxonomy_df = pd.read_excel('../datasets/singers_topics_list.xlsx', names=['label'])
client_taxonomy_df = (
    pd.read_csv('../datasets/asad_topic_list.csv', names=['label'])
    .assign(ada_embedding=lambda topics: topics['label'].map(get_embedding))
)

In [11]:
# Find most similar taxonomy (out of client's list of topics) to news body

SIMILARITY_THRESHOLD = 0.2  # the best match must score above this, otherwise the article is labelled 'Other'
TOP_K_TOPICS = 3            # number of closest topics kept per article

# Columns of embeddings / labels to lists
client_topic_embedding_list = client_taxonomy_df['ada_embedding'].to_list()
client_topic_list = client_taxonomy_df['label'].to_list()

# Build the full (n_articles x n_topics) similarity matrix with a single call instead
# of one cosine_similarity call per article/topic pair.
topic_matrix = np.asarray(client_topic_embedding_list, dtype=float)
article_embedding_list = unseen_articles['ada_embedding'].to_list()
if article_embedding_list:
    similarity_matrix = cosine_similarity(np.asarray(article_embedding_list, dtype=float), topic_matrix)
else:
    # No articles: keep the loop below a no-op rather than passing an empty array on.
    similarity_matrix = np.empty((0, len(client_topic_list)))

similarity_arr = []             # best similarity score per article
closest_topic_list_client = []  # top topics per article, or 'Other'
for similarities in similarity_matrix:
    best_similarity = similarities.max()

    if best_similarity > SIMILARITY_THRESHOLD:
        # argsort is ascending: take the tail and reverse it so the best match comes first
        top_indices = np.argsort(similarities)[-TOP_K_TOPICS:][::-1]
        closest_topic_list_client.append([client_topic_list[topic_index] for topic_index in top_indices])
    else:
        # NOTE: this is a bare string while matched rows hold a list of labels; kept
        # as-is so the exported CSV format does not change.
        closest_topic_list_client.append('Other')
    similarity_arr.append(best_similarity)

unseen_articles['closest_topic_client'] = closest_topic_list_client
print(unseen_articles.head(10))

                                                text           race_label  \
0  It has been nearly three years since auto plan...  N/A - not specified   
1  The Boston City Council Wednesday approved pro...  N/A - not specified   
2  So far this year, 16 people have been murdered...  N/A - not specified   
3  Recently I spent the better part of two days i...  N/A - not specified   
4  Communities across the nation are beginning th...                Black   
5  When Carrie Kissell learned that her employer ...  N/A - not specified   
6  Advisory: This story includes descriptions of ...                Black   
7  On this week’s edition of the Joy Beat, theAll...  N/A - not specified   
8  Nearly two years after its executive director ...  N/A - not specified   
9  As a highly respected astronomer, Harvard prof...  N/A - not specified   

   race_discussed link                                     processed_body  \
0               0  NaN  It has been nearly three years since auto plan...  

In [12]:
# Write the results to CSV, leaving out the embedding vectors, the truncated body
# and any stray 'Unnamed: 0' index column.

# unseen_articles.to_csv('../output/pure_embedding_all_gbh_5000.csv', columns=['Headline', 'Section', 'Body', 'closest_topic_all', 'closest_topic_selected', 'closest_topic_client'])
columns_to_output = unseen_articles.columns.difference(
    ['ada_embedding', 'Unnamed: 0', 'processed_body']
)
unseen_articles.to_csv(
    '../output/pure_embedding_three_labels_anulika.csv',
    columns=columns_to_output,
)