In [1]:
import os
import re
import json
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup
from transformers import pipeline
from openai import AsyncOpenAI, OpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Register tqdm's pandas integration; the embedding step later in this notebook
# relies on it when it calls `Series.progress_apply`.
tqdm.pandas()

### Pure Embedding Approach for Topic Modeling

In [4]:
# Topic table produced by BERTopic (230 topics). The CSV carries a leftover
# index column named 'Unnamed: 0', which is dropped right after loading.
selected_topics_df = (
    pd.read_csv("./topics/embedding_similarity_label.csv")
    .drop(columns=['Unnamed: 0'])
)

In [5]:
# Display the loaded topic table (the last expression of a cell is rendered by Jupyter).
selected_topics_df

Unnamed: 0,Topic,Name,Representation,closest_topic
0,-1,-1_schools_case_information_women,"['schools', 'case', 'information', 'women', 'p...",Personal Finance - Financial Assistance - Gove...
1,0,0_downtime_blah_salute_ok,"['downtime', 'blah', 'salute', 'ok', 'boy', 'n...",Communication
2,1,1_song_symphony_opera_blues,"['song', 'symphony', 'opera', 'blues', 'musici...",Fine Art - Opera
3,2,2_inning_yankees_red_francona,"['inning', 'yankees', 'red', 'francona', 'runs...",Sports - Baseball
4,3,3_candidates_primary_romney_gop,"['candidates', 'primary', 'romney', 'gop', 'we...",Politics - Elections
...,...,...,...,...
226,225,225_farms_csa_burpee_hirshberg,"['farms', 'csa', 'burpee', 'hirshberg', 'heron...",Business and Finance - Industries - Agriculture
227,226,226_crone_olympic_peterson_mantha,"['crone', 'olympic', 'peterson', 'mantha', 'li...",Sports - Olympic Sports - Winter Olympic Sports
228,227,227_saleh_yemeni_rabbo_taiz,"['saleh', 'yemeni', 'rabbo', 'taiz', 'marib', ...",Religion & Spirituality - Spirituality
229,228,228_detainees_waterboarding_conventions_geneva,"['detainees', 'waterboarding', 'conventions', ...",Science


In [None]:
# Load the CURRENT GBH articles, normalise the text column name to 'Body',
# discard rows without a body and renumber the remaining rows from 0.
# Alternative input kept for reference: './eval_dataset/Singer_Evaluation_Set.csv'
unseen_articles = (
    pd.read_csv('./sample_data/se_naacp_db.articles_data.csv')
    .rename(columns={"body": "Body"})
    .dropna(subset=['Body'])
    .reset_index(drop=True)
)

In [None]:
# Show which columns the article frame has after loading and renaming.
unseen_articles.columns

In [None]:
# Load GBH articles
# unseen_articles = pd.read_csv('./sample_data/Articles Nov 2020 - March 2023.csv', usecols=range(12))
# unseen_articles = unseen_articles.dropna(subset=['Body'])
# unseen_articles = unseen_articles.sample(n=5000, random_state=1)
# unseen_articles.reset_index(drop=True, inplace=True)

### Calling the OpenAI Embeddings API

In [None]:
# Build the OpenAI client used by every embedding request in this notebook.
# The key is read from the OPENAI_API_KEY environment variable so that no
# credential is ever typed into (and saved with) the notebook. A missing
# variable raises KeyError here instead of failing later on the first request.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

In [None]:
# Retry up to 10 times with random exponential backoff (1 s minimum, capped at 20 s).
# The retry decorator sits on a helper that lets exceptions propagate: tenacity
# only retries when the wrapped call raises, so the exception must not be
# swallowed inside the decorated function.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10), reraise=True)
def _request_embedding(text: str, model: str):
    """Issue one embeddings request and return the vector of the first result."""
    return client.embeddings.create(input=text, model=model).data[0].embedding


def get_embedding(text: str, model="text-embedding-3-small"):
    """
    Return the embedding vector for `text` computed with `model`.

    The request is retried by `_request_embedding`. If every attempt fails,
    the last error is printed and the one-element placeholder `[-1.0]` is
    returned instead of raising, so a bulk `apply` keeps going. Callers must
    treat a length-1 result as "no embedding available".
    """
    try:
        return _request_embedding(text, model)
    except Exception as e:
        print(f"Failed to retrieve ADA Embedding: {e}. Replacing with replacement value!")
        return [-1.0]

### Text Preprocessing

In [None]:
# News body preprocessing, gets rid of html tags
def truncate(tokens, length=500):
    """
    Return the leading `length` items of `tokens` (500 unless overridden).

    Inputs shorter than `length` come back unchanged in content.
    """
    leading_items = tokens[:length]
    return leading_items

# Strip HTML tags from every article body. The pattern is compiled once
# instead of once per row.
HTML_TAG_PATTERN = re.compile('<.*?>')
unseen_articles['Body'] = unseen_articles['Body'].apply(lambda x: HTML_TAG_PATTERN.sub('', x))

# Whitespace-tokenise each body and keep only the leading tokens (see `truncate`).
unseen_articles['tokens'] = unseen_articles['Body'].apply(lambda x: truncate(x.split()))

# Sanity check: the longest token list must not exceed the truncation length.
print(unseen_articles['tokens'].map(len).max())

# Embed the articles with get_embedding's DEFAULT model. The topic labels
# elsewhere in this notebook are embedded with that same default; vectors from
# two different embedding models do not share a space, so cosine similarity
# between them would be meaningless. The column keeps its historical name
# 'ada_embedding' because later cells look it up by that name.
# NOTE(review): tokens are re-joined with commas rather than spaces - confirm this is intended.
unseen_articles['ada_embedding'] = unseen_articles.tokens.progress_apply(lambda x: get_embedding(','.join(map(str, x))))

In [None]:
# Display the article frame, now including the 'tokens' and 'ada_embedding' columns.
unseen_articles

### Ada Embedding -> Taxonomy List

In [None]:
# Read the content taxonomy sheet: skip the first five lines of the file and
# keep only the first eight columns.
taxonomy_df = pd.read_csv(
    './taxonomy_list/Content_Taxonomy.csv',
    skiprows=5,
    usecols=range(8),
)
# The first remaining data row holds the real column titles: promote it to the
# header, then drop that row from the data.
taxonomy_df.columns = taxonomy_df.iloc[0]
taxonomy_df = taxonomy_df.iloc[1:]

In [None]:
# Display the taxonomy table after the header row has been promoted.
taxonomy_df

In [None]:
# Build one label per taxonomy row, named after the deepest tier that is filled
# in, e.g. 'Tier 1 - Tier 2 - Tier 3'. Labels are grouped by their depth.
def _is_filled(value):
    """True when a taxonomy cell holds something other than NaN or whitespace."""
    return pd.notna(value) and str(value).strip() != ''


labels_by_depth = {depth: set() for depth in (1, 2, 3, 4)}
for _, row in taxonomy_df.iterrows():
    # Deepest populated tier wins; rows with nothing below Tier 1 count as depth 1.
    depth = next((d for d in (4, 3, 2) if _is_filled(row[f'Tier {d}'])), 1)
    label = ' - '.join(str(row[f'Tier {tier}']) for tier in range(1, depth + 1))
    labels_by_depth[depth].add(label)

# De-duplicate and SORT so the topic order (and therefore the index-to-label
# mapping used by the similarity search) is the same on every run; iterating a
# plain set of strings gives a different order per interpreter session.
tier_1_list = sorted(labels_by_depth[1])
tier_2_list = sorted(labels_by_depth[2])
tier_3_list = sorted(labels_by_depth[3])
tier_4_list = sorted(labels_by_depth[4])

# One embedding per label, using get_embedding's default model.
tier_1_embedding = [get_embedding(topic) for topic in tier_1_list]
tier_2_embedding = [get_embedding(topic) for topic in tier_2_list]
tier_3_embedding = [get_embedding(topic) for topic in tier_3_list]
tier_4_embedding = [get_embedding(topic) for topic in tier_4_list]

# Flat, index-aligned views over all tiers: all_topics_embedding[i] belongs to all_topics_list[i].
all_topics_list = tier_1_list + tier_2_list + tier_3_list + tier_4_list
all_topics_embedding = tier_1_embedding + tier_2_embedding + tier_3_embedding + tier_4_embedding
print(len(all_topics_embedding))

### Similarity Metric Matching

In [None]:
# Embed the taxonomy labels that were assigned to the BERTopic topics.
selected_taxonomy_df = pd.read_csv('./topics/embedding_similarity_label.csv')
selected_taxonomy_df = selected_taxonomy_df.dropna(subset=['closest_topic'])

# Different BERTopic topics may map to the same taxonomy label. Embed each
# distinct label only once (first-seen order is preserved): the nearest-label
# result is unchanged, but no API call is spent on a repeated string.
selected_topics_list = selected_taxonomy_df['closest_topic'].drop_duplicates().tolist()
# print(all_topics_list)

# Index-aligned with selected_topics_list.
selected_topics_embedding = [get_embedding(topic) for topic in selected_topics_list]

In [None]:
# Find, for every article, the most similar label among ALL taxonomy topics.
#
# get_embedding returns the one-element placeholder [-1.0] when a request fails.
# Such vectors cannot be compared with real embeddings (different length), so
# they are left out on both sides; affected articles get None as their label.
usable_topics = [
    (label, vector)
    for label, vector in zip(all_topics_list, all_topics_embedding)
    if len(vector) > 1
]
topic_labels = [label for label, _ in usable_topics]
topic_matrix = np.array([vector for _, vector in usable_topics])

has_embedding = unseen_articles['ada_embedding'].map(len) > 1
article_matrix = np.array(unseen_articles.loc[has_embedding, 'ada_embedding'].tolist())

closest_topics = pd.Series([None] * len(unseen_articles), index=unseen_articles.index, dtype=object)
if len(topic_labels) > 0 and len(article_matrix) > 0:
    # A single (articles x topics) similarity matrix replaces one sklearn call
    # per article/topic pair.
    similarities = cosine_similarity(article_matrix, topic_matrix)
    # Column index of the best-matching topic for each article row.
    closest_topic_indices = similarities.argmax(axis=1)
    closest_topics.loc[has_embedding] = [topic_labels[i] for i in closest_topic_indices]

closest_topic_list_all = closest_topics.tolist()
unseen_articles['closest_topic_all'] = closest_topic_list_all
print(unseen_articles.head(10))

In [None]:
# Find, for every article, the most similar label among the SELECTED topics
# (the taxonomy labels attached to the BERTopic topics).
#
# get_embedding returns the one-element placeholder [-1.0] when a request fails.
# Such vectors cannot be compared with real embeddings (different length), so
# they are left out on both sides; affected articles get None as their label.
usable_selected_topics = [
    (label, vector)
    for label, vector in zip(selected_topics_list, selected_topics_embedding)
    if len(vector) > 1
]
selected_labels = [label for label, _ in usable_selected_topics]
selected_matrix = np.array([vector for _, vector in usable_selected_topics])

has_embedding = unseen_articles['ada_embedding'].map(len) > 1
article_matrix = np.array(unseen_articles.loc[has_embedding, 'ada_embedding'].tolist())

closest_selected = pd.Series([None] * len(unseen_articles), index=unseen_articles.index, dtype=object)
if len(selected_labels) > 0 and len(article_matrix) > 0:
    # A single (articles x topics) similarity matrix replaces one sklearn call
    # per article/topic pair.
    similarities = cosine_similarity(article_matrix, selected_matrix)
    # Column index of the best-matching topic for each article row.
    closest_topic_indices = similarities.argmax(axis=1)
    closest_selected.loc[has_embedding] = [selected_labels[i] for i in closest_topic_indices]

closest_topic_list_selected = closest_selected.tolist()
unseen_articles['closest_topic_selected'] = closest_topic_list_selected
print(unseen_articles.head(10))

### Export to CSV

In [None]:
# Write the labelled articles to disk. The output folder is created first so the
# export does not fail with a missing-directory error on a fresh checkout.
os.makedirs('./outputs', exist_ok=True)
unseen_articles.to_csv('./outputs/pure_embedding_current_GBH_Data.csv')

### Quick Analysis

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Rebind unseen_articles to the exported CSV read back from disk; presumably this
# lets the analysis below run without recomputing embeddings.
# Note: after a CSV round trip, list-valued columns come back as plain strings.
unseen_articles = pd.read_csv('./outputs/pure_embedding_current_GBH_Data.csv')

In [None]:
# Show the columns of the reloaded frame.
unseen_articles.columns

In [None]:
# Display the reloaded frame.
unseen_articles

In [None]:
# Keep only the identifying columns, the article text and the three label
# columns for the cleaned export (same column order as listed here).
export_columns = [
    '_id',
    'hl1',
    'Body',
    'openai_labels[0]',
    'closest_topic_selected',
    'closest_topic_all',
]
df_to_export = unseen_articles[export_columns]

In [None]:
# Write the reduced frame (ids, headline, body and label columns) to its own CSV.
df_to_export.to_csv('./outputs/pure_embedding_current_GBH_Data_CLEANED_WITH_ID.csv')

In [None]:
# Export a fixed sample: the rows at positions 100 through 200 (101 rows).
df_to_export[100:201].to_csv('./outputs/pure_embedding_current_SAMPLE.csv')

In [None]:
# Analysis of the labels produced by the two topic lists.
def summarize_topic_labels(labels, title, min_count=10, other_label='Other'):
    """
    Print the label frequencies and draw a pie chart of their shares.

    Parameters
    ----------
    labels : pandas.Series
        One assigned topic label per article.
    title : str
        Title of the pie chart.
    min_count : int
        Labels that occur `min_count` times or fewer are merged into one slice.
    other_label : str
        Name of the merged slice.

    Returns
    -------
    (counts, grouped)
        `counts` is the raw frequency table; `grouped` is `labels` with the
        infrequent labels replaced by `other_label`.
    """
    counts = labels.value_counts()
    print(counts)

    # Merge infrequent labels (count <= min_count) into a single slice so the
    # chart stays readable.
    rare_labels = counts[counts <= min_count].index
    grouped = labels.where(~labels.isin(rare_labels), other_label)
    grouped_counts = grouped.value_counts()

    fig, ax = plt.subplots(figsize=(22, 22))
    ax.pie(grouped_counts, labels=grouped_counts.index, autopct='%1.1f%%', startangle=140)
    ax.set_title(title)
    plt.show()
    return counts, grouped


selected_counts, selected_series = summarize_topic_labels(
    unseen_articles['closest_topic_selected'], 'Pie Chart of Selected Topics'
)

all_counts, all_series = summarize_topic_labels(
    unseen_articles['closest_topic_all'], 'Pie Chart of All Topics'
)

# How often do the two label sources agree on the same article?
labels_agree = unseen_articles['closest_topic_all'] == unseen_articles['closest_topic_selected']
count_matching_rows = int(labels_agree.sum())
total_articles = len(unseen_articles)
# Guard against an empty frame so the summary line never divides by zero.
match_ratio = count_matching_rows / total_articles * 100 if total_articles else 0.0
print(f'There are {count_matching_rows} matching labels in {total_articles} articles. The ratio is {match_ratio}%.')