# Labels output & Formatting .json Notebook
### Use this script after generating BERTopic as well as OpenAI labels to output results into a .csv file
Created by Dingyuan Xu dyxu@bu.edu based on Michelle Voong's mvoong@bu.edu work

In [1]:
#Assuming working dir on scc: sparkgrp/dyxu

import pandas as pd
from bertopic import BERTopic
import json
import openai
from transformers import pipeline

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


### Load saved BERTopic model and corresponding openai labels

In [2]:
topic_model = BERTopic.load("../models/bglobe_519_body_350") # BERTopic model dir
f = open('../openai_label_file/openai_label_from_taxonomy_structured_230.json') # Openai label json dir
openai_labels = json.load(f)

topic_df = topic_model.get_topic_info()
# topic_df.to_csv("BERTopic_topic.csv")

In [3]:
print(openai_labels[27])

{'openai': 'Technology Companies', 'bow': ['emc', 'ibm', 'digital', 'hp', 'wang', 'compaq', 'android', 'stratus', 'storage', 'analysts']}


### Existing articles: load articles, get labels, and append to df

In [24]:
training_articles = pd.read_csv('gbh_100k_sample.csv')
body = training_articles['Body_x']

print(topic_model.get_document_info(body)[:5])

training_articles['bertopic_topic_label'] = topic_model.get_document_info(body)['Topic'] # Append BERTopic topics

label_name = [openai_labels[str(training_articles['bertopic_topic_label'][i])]['OpenAI_label'] 
              if int(training_articles['bertopic_topic_label'][i]) != -1 else "" for i in range(len(training_articles))]
training_articles['openai_label'] = label_name # Append OpenAI labels

print(training_articles.head(5))

                                            Document  Topic  \
0  As more people get shots, hesitancy toward COV...      2   
1  The coronavirus vaccine mandate implemented in...     17   
2  Norwegian Cruise Line can require passengers s...     -1   
3  As hospitals across the country weather a surg...     16   
4  Mikayla Miller loved to write. She had dreams ...    156   

                                           Name  \
0              2_vaccines_booster_doses_johnson   
1                 17_proof_exemptions_wu_indoor   
2                   -1_care_media_pandemic_2020   
3  16_coronavirus_ecmo_hospitalizations_hopkins   
4        156_strothers_altercation_calvina_3977   

                                      Representation  \
0  [vaccines, booster, doses, johnson, biontech, ...   
1  [proof, exemptions, wu, indoor, employers, req...   
2  [care, media, pandemic, 2020, black, says, bid...   
3  [coronavirus, ecmo, hospitalizations, hopkins,...   
4  [strothers, altercation, calvin

### Unseen articles: load target news articles, predict labels and append to df

In [7]:
# prompt model to predict unseen gbh articles
unseen_articles = pd.read_csv('../datasets/geocoded_articles.csv')
unseen_articles = unseen_articles.dropna(subset=['content_id'])
# get bertopics for each article
topics, probs = topic_model.transform(unseen_articles['body'])
unseen_articles['bertopic_topic_label'] = topics

# add open ai label to bglobe dataframe in new column    
unseen_label_name = [openai_labels[str(unseen_articles['bertopic_topic_label'][i])]['OpenAI_label'] 
              if int(unseen_articles['bertopic_topic_label'][i]) != -1 else "" for i in range(len(unseen_articles))]
unseen_articles['openai_label'] = unseen_label_name

unseen_articles.head(10)

KeyError: '27'

In [4]:
# Adapt to changed openai label file structure
# prompt model to predict unseen gbh articles
unseen_articles = pd.read_csv('../datasets/geocoded_articles.csv')
unseen_articles = unseen_articles.dropna(subset=['content_id'])
# get bertopics for each article
topics, probs = topic_model.transform(unseen_articles['body'])
unseen_articles['bertopic_topic_label'] = topics

# add open ai label to bglobe dataframe in new column
unseen_label_name = [openai_labels[unseen_articles['bertopic_topic_label'][i]]['openai'] 
              if int(unseen_articles['bertopic_topic_label'][i]) != -1 else "" for i in range(len(unseen_articles))]
unseen_articles['openai_label'] = unseen_label_name

unseen_articles.head(10)

Unnamed: 0.1,Unnamed: 0,neighborhoods,position_section,tracts,author,body,content_id,hl1,hl2,pub_date,pub_name,link,bertopic_topic_label,openai_label
0,0,['Fenway'],Education,['010300'],Esteban Bustillos,"Thomas White, a senior at Boston Latin School,...",00000175-7583-d779-a575-779f0f6b0001,"For High School Athletes, The Pandemic Has Led...","For High School Athletes, The Pandemic Has Led...",2020-11-11 00:00:00,GBH,https://wgbh.org/news/education/2020/11/11/for...,-1,
1,1,"['Downtown', 'Beacon Hill']",Politics,"['981700', '020302']",Mike Deehan,A wave of blue votes could wash over the Massa...,00000175-75fe-d5c8-a775-f7fe5a7f0001,Mass. Republicans Don't Fear Trump-Fueled Blue...,Mass. Republicans Don't Fear Trump-Fueled Blue...,2020-11-03 00:00:00,GBH,https://wgbh.org/news/politics/2020/11/03/mass...,3,Political Issues & policy
2,2,"['Dorchester', 'Mattapan']",Politics,"['100900', '100700']",Adam Reilly,It’s unlikely Donald Trump will win Massachuse...,00000175-7aad-d944-a9fd-7aed30970002,Trump Won't Win Boston — But He Might Win This...,Trump Won't Win Boston — But He Might Win This...,2020-11-02 00:00:00,GBH,https://wgbh.org/news/politics/2020/11/02/trum...,-1,
3,3,['Downtown'],News,['030302'],Craig LeMoult,The state Department of Public Health released...,00000175-7b20-d944-a9fd-7be1d4bf0001,Household 'Clusters' Are A Problem In Massachu...,Household 'Clusters' Are A Problem In Massachu...,2020-11-01 00:00:00,GBH,https://wgbh.org/news/local-news/2020/11/01/ho...,32,Public Health
4,4,['Jamaica Plain'],Education,['120400'],Kirk Carapezza,"A couple of years ago, Daymian Mejia, a senior...",00000175-7b24-d5c8-a775-fb2c49a40001,'A Tint Over Everything': College Students Of ...,'A Tint Over Everything': College Students Of ...,2020-11-10 00:00:00,GBH,https://wgbh.org/news/education/2020/11/10/a-t...,-1,
5,5,['Roslindale'],News,['110403'],Liz Neisloss,After more than 35 years of helping voters thr...,00000175-8649-d779-a575-b65fe6bf0001,"In An Election Year During A Pandemic, Senior ...","In An Election Year During A Pandemic, Senior ...",2020-11-02 00:00:00,GBH,https://wgbh.org/news/local-news/2020/11/02/in...,-1,
6,6,['Downtown'],News,['981700'],State House News Service,There are 50 contested races among the 200 leg...,00000175-89a5-d3e2-adf5-dbfd8ddb0001,Races to Watch: Legislative Contests on Tap fo...,Races to Watch: Legislative Contests on Tap fo...,2020-11-02 00:00:00,GBH,https://wgbh.org/news/news/2020/11/02/races-to...,3,Political Issues & policy
7,7,"['Roxbury', 'Downtown']",Politics,"['030302', '080601']",Isaiah Thompson,Boston Mayor Marty Walsh is urging calm on and...,00000175-89fb-d5c8-a775-abffc2b20001,Walsh Assures Boston That There Are No Known E...,Walsh Assures Boston That There Are No Known E...,2020-11-02 00:00:00,GBH,https://wgbh.org/news/politics/2020/11/02/wals...,85,Law Enforcement
8,8,['Fenway'],News,['010103'],Hannah Uebele,"Reverend Jack Graham, one of President Donald ...",00000175-8a5e-d5c8-a775-ab5e8ace0000,All Rev'd Up: The Problem With 'Patriotic Chur...,All Rev'd Up: The Problem With 'Patriotic Chur...,2020-11-02 00:00:00,GBH,https://wgbh.org/news/national-news/2020/11/02...,-1,
9,9,['Downtown'],News,['070102'],Aidan Connelly,Today on Boston Public Radio:Robert Costa talk...,00000175-8a7c-d5c8-a775-ab7c05e50001,Boston Public Radio Full Show: 11/2/20,Boston Public Radio Full Show: 11/2/20,2020-11-02 00:00:00,GBH,https://wgbh.org/news/national-news/2020/11/02...,3,Political Issues & policy


### Single column (openai_label) -> Three columns (Keyword_x)

In [9]:
def split_label_columns(articles):
    labels_column = articles['openai_label']

    # Split the keywords column into separate columns
    split_columns = labels_column.str.split(',', expand=True)

    # Rename the columns with appropriate names
    new_columns = [f'Keyword_{i+1}' for i in range(split_columns.shape[1])]
    split_columns.columns = new_columns

    # Fill empty cells with NaN
    split_columns = split_columns.replace('', pd.NA)

    # Concatenate the original DataFrame with the split columns
    articles = pd.concat([articles, split_columns], axis=1)

    articles.drop(columns=['openai_label'], inplace=True)
    
    return articles
    
unseen_articles = split_label_columns(unseen_articles)

### Quick checks

In [10]:
print(unseen_articles.head(5))
unseen_articles['bertopic_topic_label'].value_counts()
# training_articles['bertopic_topic_label'].value_counts()

   Unnamed: 0                neighborhoods position_section  \
0           0                   ['Fenway']        Education   
1           1  ['Downtown', 'Beacon Hill']         Politics   
2           2   ['Dorchester', 'Mattapan']         Politics   
3           3                 ['Downtown']             News   
4           4            ['Jamaica Plain']        Education   

                 tracts             author  \
0            ['010300']  Esteban Bustillos   
1  ['981700', '020302']        Mike Deehan   
2  ['100900', '100700']        Adam Reilly   
3            ['030302']      Craig LeMoult   
4            ['120400']     Kirk Carapezza   

                                                body  \
0  Thomas White, a senior at Boston Latin School,...   
1  A wave of blue votes could wash over the Massa...   
2  It’s unlikely Donald Trump will win Massachuse...   
3  The state Department of Public Health released...   
4  A couple of years ago, Daymian Mejia, a senior...   

       

bertopic_topic_label
-1     1295
 26     396
 3      297
 2      195
 47      79
       ... 
 71       1
 91       1
 42       1
 53       1
 52       1
Name: count, Length: 68, dtype: int64

### Output to .csv

In [5]:
unseen_articles.to_csv("../output/non_stochastic_bertopic_taxonomy_list_230.csv")
# training_articles.to_csv("gbh_step_one_output_old_model.csv")