# Labels output & Formatting .json Notebook
### Use this script after generating BERTopic as well as OpenAI labels to output results into a .csv file
Created by Dingyuan Xu dyxu@bu.edu based on Michelle Voong's mvoong@bu.edu work. <br>
Please follow the markdown notes and code comments on each cell to ensure proper functioning. DON'T JUST RUN ALL.

In [1]:
# Assuming working dir on scc: sparkgrp/dyxu
# A few warnings are normal
# If CUDA throws an error saying device occupied/ no available device, check that you have a session with gpu and that all other kernels occupying the gpu are terminated
import pandas as pd
from bertopic import BERTopic
import json
import openai
from transformers import pipeline

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


### Load saved BERTopic model and corresponding openai labels

In [3]:
# Load the saved BERTopic model from disk and preview its per-topic info table
bertopic_model_dir = "../models/bglobe_519_body_230"  # BERTopic model dir
topic_model = BERTopic.load(bertopic_model_dir)
topic_df = topic_model.get_topic_info()
print(topic_df.head())  # head() defaults to the first 5 rows

   Topic  Count                               Name  \
0     -1  58266  -1_schools_case_information_women   
1      0  14312          0_downtime_blah_salute_ok   
2      1   6280        1_song_symphony_opera_blues   
3      2   5669      2_inning_yankees_red_francona   
4      3   4659    3_candidates_primary_romney_gop   

                                      Representation  \
0  [schools, case, information, women, public, ch...   
1  [downtime, blah, salute, ok, boy, nice, pretty...   
2  [song, symphony, opera, blues, musicians, ball...   
3  [inning, yankees, red, francona, runs, homer, ...   
4  [candidates, primary, romney, gop, weld, dole,...   

                                 Representative_Docs  
0  [$400,000, but the natural gas savings alone w...  
1                                             [, , ]  
2  [Haynes is humble about his role. "Music is ve...  
3  [BODY Latos (0-1) was ineffective in his third...  
4  [Here is that list, sorted by party and alphab...  


In [5]:
# Load the openai labels generated by the embedding similarity approach,
# reading only the listed columns from the CSV
label_columns = ['Topic', 'Name', 'Representation', 'closest_topic']
openai_df = pd.read_csv("../openai_label_file/embedding_similarity_label.csv", usecols=label_columns)
openai_df.head()  # head() defaults to the first 5 rows

Unnamed: 0,Topic,Name,Representation,closest_topic
0,-1,-1_schools_case_information_women,"['schools', 'case', 'information', 'women', 'p...",Personal Finance - Financial Assistance - Gove...
1,0,0_downtime_blah_salute_ok,"['downtime', 'blah', 'salute', 'ok', 'boy', 'n...",Communication
2,1,1_song_symphony_opera_blues,"['song', 'symphony', 'opera', 'blues', 'musici...",Fine Art - Opera
3,2,2_inning_yankees_red_francona,"['inning', 'yankees', 'red', 'francona', 'runs...",Sports - Baseball
4,3,3_candidates_primary_romney_gop,"['candidates', 'primary', 'romney', 'gop', 'we...",Politics - Elections


In [4]:
# Load the OpenAI label JSON. The context manager closes the file handle as
# soon as parsing finishes (the bare open() previously left it open), and the
# encoding is pinned to UTF-8, the standard JSON encoding, so the read does not
# depend on the platform locale.
with open('../openai_label_file/openai_label_from_taxonomy_structured_230.json', encoding='utf-8') as f:  # Openai label json dir
    openai_labels = json.load(f)

### Unseen articles: load target news articles, predict labels and append to df

In [7]:
# Adapt to new openai label file structure
# Predict BERTopic topics for a sample of unseen gbh articles and attach the
# matching openai label (the `closest_topic` column of `openai_df`) to each one.

# Change this to your input file
unseen_articles = pd.read_csv('../datasets/Articles Nov 2020 - March 2023.csv', usecols=range(12))
unseen_articles = unseen_articles.dropna(subset=['Body'])  # rows without a body have nothing to predict on
unseen_sample = unseen_articles.sample(n=50, random_state=1)  # fixed seed keeps the sample reproducible
unseen_sample.reset_index(drop=True, inplace=True)

# get bertopics for each article
# (pass a plain list of strings, which is the input type BERTopic documents for transform)
topics, probs = topic_model.transform(unseen_sample['Body'].tolist())
unseen_sample['bertopic_topic_label'] = topics

# Build the Topic -> label lookup once instead of scanning openai_df for every
# article. drop_duplicates keeps the first row per topic, matching the previous
# "first match wins" behaviour.
topic_to_label = (
    openai_df.drop_duplicates(subset='Topic')
    .set_index('Topic')['closest_topic']
    .to_dict()
)

# Fail early with a readable message if the model predicts a topic that the
# label file does not cover (previously an opaque IndexError from `.values[0]`).
unlabelled_topics = sorted({int(topic) for topic in topics} - {-1} - set(topic_to_label))
if unlabelled_topics:
    raise KeyError(f"No OpenAI label found for BERTopic topic id(s): {unlabelled_topics}")

# add open ai label to the sample dataframe in a new column;
# BERTopic outliers (topic -1) get an empty label
unseen_label_name = ["" if int(topic) == -1 else topic_to_label[int(topic)] for topic in topics]

# print(unseen_label_name)
unseen_sample['openai_label'] = unseen_label_name

print(unseen_sample.head(5))

      Type                                              Label  \
0  Article  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
1  Article  Pharaoh of the opera: Anthony Roth Costanzo di...   
2  Article  Infrastructure bills are kicking off billions ...   
3  Article  David Prowse, Actor Behind Darth Vader, Dies A...   
4  Article  President Trump Invites Michigan GOP Leaders T...   

                                            Headline  \
0  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
1  Pharaoh of the opera: Anthony Roth Costanzo di...   
2  Infrastructure bills are kicking off billions ...   
3  David Prowse, Actor Behind Darth Vader, Dies A...   
4  President Trump Invites Michigan GOP Leaders T...   

                                              Byline Section Navigation  \
0                                    Jackie Bruleigh                NaN   
1                                           GBH News                NaN   
2                                      Chris Bu

In [None]:
# Write the labeled sample to disk (the DataFrame index is written as the first column).
# Change this to your desired directory/ filename
unseen_sample.to_csv(path_or_buf='../output/non_stochastic_bertopic_embedding_similarity_gbh_sample.csv')

## Uncomment following cells if you want to also label outliers from BERTopic

In [9]:
# unseen_outliers = unseen_sample[unseen_sample['bertopic_topic_label']==-1]
# print(unseen_outliers.head(5))

      Type                                              Label  \
0  Article  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Article  Infrastructure bills are kicking off billions ...   
4  Article  President Trump Invites Michigan GOP Leaders T...   
6  Article  U.S. Soccer paying millions in back pay to fem...   
9  Article  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                            Headline  \
0  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Infrastructure bills are kicking off billions ...   
4  President Trump Invites Michigan GOP Leaders T...   
6  U.S. Soccer paying millions in back pay to fem...   
9  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                              Byline Section Navigation  \
0                                    Jackie Bruleigh                NaN   
2                                      Chris Burrell                NaN   
4  Ed White, David Eggert and Zeke Miller | Ass

In [10]:
# # Independent cell to define embedding function

# openai.api_key = # GitHub version does not include the OpenAI API key, please replace with your own key

# import openai
# from tenacity import retry, wait_random_exponential, stop_after_attempt

# # Retry up to 6 times with exponential backoff, starting at 1 second and maxing out at 20 seconds delay
# @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# def get_embedding(text: str, model="text-embedding-ada-002") -> list[float]:
#     return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]

In [16]:
# unseen_outliers['tokens'] = unseen_outliers['Body'].apply(lambda x: x.split())
# print(unseen_outliers.head(5))

      Type                                              Label  \
0  Article  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Article  Infrastructure bills are kicking off billions ...   
4  Article  President Trump Invites Michigan GOP Leaders T...   
6  Article  U.S. Soccer paying millions in back pay to fem...   
9  Article  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                            Headline  \
0  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Infrastructure bills are kicking off billions ...   
4  President Trump Invites Michigan GOP Leaders T...   
6  U.S. Soccer paying millions in back pay to fem...   
9  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                              Byline Section Navigation  \
0                                    Jackie Bruleigh                NaN   
2                                      Chris Burrell                NaN   
4  Ed White, David Eggert and Zeke Miller | Ass

In [17]:
# # Function to get the first 500 elements from a list
# def truncate(tokens, length=500):
#     return tokens[:length]

# # Apply the function to the 'tokens' column, overwriting it with the truncated token list
# unseen_outliers['tokens'] = unseen_outliers['tokens'].apply(truncate)

# unseen_outliers['ada_embedding'] = unseen_outliers.tokens.apply(lambda x: get_embedding(','.join(map(str,x)), model='text-embedding-ada-002'))

0    'Atlantic Crossing' Episode 3 Recap: Happy Day...
2    Infrastructure bills are kicking off billions ...
4    President Trump Invites Michigan GOP Leaders T...
6    U.S. Soccer paying millions in back pay to fem...
9    RadRunner Plus Electric Bike 89.7 Sweepstakes ...
Name: Headline, dtype: object


In [18]:
# print(unseen_outliers.head(5))

      Type                                              Label  \
0  Article  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Article  Infrastructure bills are kicking off billions ...   
4  Article  President Trump Invites Michigan GOP Leaders T...   
6  Article  U.S. Soccer paying millions in back pay to fem...   
9  Article  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                            Headline  \
0  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2  Infrastructure bills are kicking off billions ...   
4  President Trump Invites Michigan GOP Leaders T...   
6  U.S. Soccer paying millions in back pay to fem...   
9  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   

                                              Byline Section Navigation  \
0                                    Jackie Bruleigh                NaN   
2                                      Chris Burrell                NaN   
4  Ed White, David Eggert and Zeke Miller | Ass

In [19]:
# Get the embedding for taxonomies again
# TODO: Improve this part. Ada is called twice, which is unnecessary
# taxonomy_df = pd.read_csv('../datasets/Content_Taxonomy.csv', skiprows=5, usecols=range(8))
# taxonomy_df.columns = taxonomy_df.iloc[0]
# taxonomy_df = taxonomy_df.tail(-1)
# taxonomy_df.head(10)

# tier_1_list = []
# tier_2_list = []
# tier_3_list = []
# tier_4_list = []
# for index, row in taxonomy_df.iterrows():
#     if not pd.isnull(row['Tier 4']) and row['Tier 4'] != ' ':
#         tier_1_label = row['Tier 1']
#         tier_2_label = row['Tier 2']
#         tier_3_label = row['Tier 3']
#         tier_4_label = row['Tier 4']
#         tier_4_list.append(f'{tier_1_label} - {tier_2_label} - {tier_3_label} - {tier_4_label}')
#     elif not pd.isnull(row['Tier 3']) and row['Tier 3'] != ' ':
#         tier_1_label = row['Tier 1']
#         tier_2_label = row['Tier 2']
#         tier_3_label = row['Tier 3']
#         tier_3_list.append(f'{tier_1_label} - {tier_2_label} - {tier_3_label}')
#     elif not pd.isnull(row['Tier 2']) and row['Tier 2'] != ' ':
#         tier_1_label = row['Tier 1']
#         tier_2_label = row['Tier 2']
#         tier_2_list.append(f'{tier_1_label} - {tier_2_label}')
#     else:
#         tier_1_label = row['Tier 1']
#         tier_1_list.append(f'{tier_1_label}')

# tier_1_list = list(set(tier_1_list))
# tier_2_list = list(set(tier_2_list))
# tier_3_list = list(set(tier_3_list))
# tier_4_list = list(set(tier_4_list))

# tier_1_embedding = [get_embedding(topic) for topic in tier_1_list]
# tier_2_embedding = [get_embedding(topic) for topic in tier_2_list]
# tier_3_embedding = [get_embedding(topic) for topic in tier_3_list]
# tier_4_embedding = [get_embedding(topic) for topic in tier_4_list]

# all_topics_list = []
# [all_topics_list.append(topic) for topic in tier_1_list]
# [all_topics_list.append(topic) for topic in tier_2_list]
# [all_topics_list.append(topic) for topic in tier_3_list]
# [all_topics_list.append(topic) for topic in tier_4_list]

# all_topics_embedding = []
# [all_topics_embedding.append(embedding) for embedding in tier_1_embedding]
# [all_topics_embedding.append(embedding) for embedding in tier_2_embedding]
# [all_topics_embedding.append(embedding) for embedding in tier_3_embedding]
# [all_topics_embedding.append(embedding) for embedding in tier_4_embedding]
# print(len(all_topics_embedding))

703


In [20]:
# Find the taxonomy label most similar to each outlier article's ada embedding

# import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

# closest_topic_list = []
# for index, row in unseen_outliers.iterrows():
#     target_embedding = row['ada_embedding']
#     similarities = [cosine_similarity(np.array(target_embedding).reshape(1, -1), np.array(topic).reshape(1, -1))[0][0] for topic in all_topics_embedding]

#     # Find the index of the topic with the highest similarity
#     closest_topic_index = np.argmax(similarities)

#     # Retrieve the label of the closest topic
#     closest_topic = all_topics_list[closest_topic_index]
#     closest_topic_list.append(closest_topic)

# unseen_outliers['closest_topic'] = closest_topic_list
# print(unseen_outliers.head(10))

       Type                                              Label  \
0   Article  'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2   Article  Infrastructure bills are kicking off billions ...   
4   Article  President Trump Invites Michigan GOP Leaders T...   
6   Article  U.S. Soccer paying millions in back pay to fem...   
9   Article  RadRunner Plus Electric Bike 89.7 Sweepstakes ...   
10  Article  'Absolutely Terrifying': Congressional Leaders...   
11  Article  The secret to happiness is more in your hands ...   
15  Article      Eric In The Evening: Saturday, March 19, 2022   
16  Article  The FDA is expected to authorize 2nd boosters ...   
19  Article  Rainy Summer Devastated Mass. Pumpkins And Oth...   

                                             Headline  \
0   'Atlantic Crossing' Episode 3 Recap: Happy Day...   
2   Infrastructure bills are kicking off billions ...   
4   President Trump Invites Michigan GOP Leaders T...   
6   U.S. Soccer paying millions in back pay t

In [25]:
# print(closest_topic_list)
# unseen_outliers.to_csv('../output/non_stochastic_bertopic_embedding_similarity_gbh_outliers.csv', columns=['Type','Headline','Body','closest_topic'])

['Sports - Sailing', 'Careers - Apprenticeships', 'Politics - Elections', 'Sports - Soccer', 'Genres - Sports Radio', 'Politics', 'Healthy Living', 'Events', 'Medical Health - Vaccines', 'Disasters', 'Politics - Elections', 'Genres - Drama', 'Personal Finance - Home Utilities - Water Services', 'Sports - Hunting and Shooting', 'Medical Health - Diseases and Conditions - Sexual Health', 'Genres - Documentary', 'Holidays', 'Pets - Birds', 'Education - Online Education', 'Crime', 'Politics', 'Genres - Drama']


### Output to .csv

In [14]:
# Write the `unseen_articles` frame to disk (the DataFrame index is written as the first column).
# NOTE(review): in the cells above, the BERTopic/openai label columns are added to
# `unseen_sample`, not to `unseen_articles` — confirm this is the frame intended for export.
unseen_articles.to_csv(path_or_buf="../output/non_stochastic_bertopic_embedding_similarity.csv")
# training_articles.to_csv("gbh_step_one_output_old_model.csv")