# This notebook calls the OpenAI API to condense the topics produced by BERTopic (using the OpenAI representation model) into a short list of keywords

In [1]:
# Assumes the working directory on scc is sparkgrp/dyxu/improve_model/

import json
import os

import openai
import pandas as pd
from bertopic import BERTopic
from transformers import pipeline

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


In [2]:
# Restore the previously saved BERTopic model from disk
topic_model = BERTopic.load(path="bglobe_519_body_openai")

In [3]:
# Read the sampled articles and print the first five rows as a quick look at the data
training_set = pd.read_csv("bglobe_100k_sample.csv")
print(training_set.iloc[:5])

# Sanity check that the loaded model runs: show the per-document topic info
# for the first five article bodies
body = training_set.loc[:, 'body']
topic_model.get_document_info(body).head(5)

   Unnamed: 0               pub_type      position_section  \
0       12862                    NaN  METRO/REGION; Pg. B2   
1       17795  Newspaper, Newspapers     EDITORIAL OPINION   
2        1036                    NaN  METRO/REGION; Pg. B3   
3       13596                    NaN  METRO/REGION; Pg. 28   
4        1669                    NaN    CITY WEEKLY; Pg. 1   

  position_subsection                                                hl1  hl2  \
0                 NaN                      MWRA issues 1st water report;  NaN   
1             Opinion                                                NaN  NaN   
2                 NaN             Curley jury process begins in private;  NaN   
3                 NaN                    Pols look for that union label;  NaN   
4                 NaN  Hillary Clinton touts Somerville as a can-do c...  NaN   

  author                                               lede  \
0    NaN  The Massachusetts Water Resources Authority ha...   
1    NaN  OBAM

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"Locally, just about any level of reporting wou...",20,20_Energy and Environmental Developments in Ma...,[Energy and Environmental Developments in Mass...,[GLOBE SOUTH 2 BRIDGEWATER VETERANS AGENT LEAV...,Energy and Environmental Developments in Massa...,0.72509,False
1,- JEFFREY GOLDBERG www.theatlantic.com ``Presi...,-1,-1_Miscellaneous events and issues in a town,[Miscellaneous events and issues in a town],"[From the Treasury's perspective, the rational...",Miscellaneous events and issues in a town,0.0,False
2,CAMBRIDGE - Jury selection began behind clos...,-1,-1_Miscellaneous events and issues in a town,[Miscellaneous events and issues in a town],"[From the Treasury's perspective, the rational...",Miscellaneous events and issues in a town,0.0,False
3,"""I want to be the point man for labor to go to...",-1,-1_Miscellaneous events and issues in a town,[Miscellaneous events and issues in a town],"[From the Treasury's perspective, the rational...",Miscellaneous events and issues in a town,0.0,False
4,SOMERVILLE - Hillary Rodham Clinton's visit ...,3,3_Campaigns and Elections,[Campaigns and Elections],[Brown's seat is one of just two held by Repub...,Campaigns and Elections,0.896168,False


In [5]:
# Fetch the per-topic summary table from the model and peek at the
# first five topic representations
topics_df = topic_model.get_topic_info()
topics_df['Representation'].iloc[:5]

0          [Miscellaneous events and issues in a town]
1                 [Sports teams and players in Boston]
2    [royally wham blah ok count smiling accomplish...
3    [Urban Development and Transportation in Great...
4                            [Campaigns and Elections]
Name: Representation, dtype: object

In [6]:
# use openai api to create a label for the topic from BERTopic
# The key is read from the OPENAI_API_KEY environment variable so that no secret is
# stored in the notebook; a missing variable raises KeyError here instead of
# surfacing later as a failed API request.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed and
# should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# Exponential backoff - added because requests kept failing with RateLimitError
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(stop=stop_after_attempt(6), wait=wait_random_exponential(min=1, max=60))
def completion_with_backoff(**kwargs):
    """Forward ``kwargs`` to ``openai.ChatCompletion.create`` and return its result.

    The ``retry`` decorator re-invokes the call when it raises, waiting a random
    exponential interval (``min=1``, ``max=60``) between tries and stopping after
    6 attempts.
    """
    chat_response = openai.ChatCompletion.create(**kwargs)
    return chat_response

In [8]:
# get labels for all bag of words from bertopic
# For every topic (except row 0) ask gpt-3.5-turbo for at most three keywords and
# checkpoint the accumulated labels to a JSON file after each request.
labels_path = 'openai_label_three_keyword_openai.json'

# Few-shot context sent with every request; it never changes, so build it once.
# using gpt-3.5-turbo: $0.002/1,000 tokens
# each request is ~150 tokens (including the response from openai)
background_prompt = [{"role": "system", "content": "You are presented with topics of news articles. Your task is to summarize the topic represented as a sentence to a maximum of three keywords."},
                     {"role": "user", "content": "Please summarize this topic into three keywords: Diversification strategies in investing and the role of banks in the mortgage market. Please generalize the result."},
                     {"role": "assistant", "content": "Investment strategies, Banking, Finance"}]

# Pick up labels saved by an earlier run; start empty when the file does not exist yet
# (opening it with 'r+' used to raise FileNotFoundError on the first run).
try:
    with open(labels_path, 'r') as f:
        file_data = json.load(f)
except FileNotFoundError:
    file_data = {}

# range function modified to suit weird outliers of keyBERT-inspired representation model
# (row 0 of topics_df is skipped)
for i in range(1, len(topics_df)):
    topic = topics_df['Representation'][i][0]

    # build prompt for openai
    # Close the topic with sentence punctuation so it does not run straight into the
    # follow-up instruction ("...ElectionsPlease generalize..."), matching the shape of
    # the few-shot example above.
    topic_sentence = topic.strip()
    if not topic_sentence.endswith(('.', '!', '?')):
        topic_sentence += '.'
    text_input = ("Please summarize this topic into a maximum of three concise keywords: "
                  + topic_sentence
                  + " Please generalize the final result.\n")

    prompt = background_prompt + [{"role": "user", "content": text_input}]

    # NOTE(review): max_tokens=8 may be too small for three multi-word keywords and
    # could truncate the label - confirm against the saved responses.
    response = completion_with_backoff(model="gpt-3.5-turbo",
                                       temperature=1,
                                       max_tokens=8,
                                       messages=prompt)

    # save result in json file
    # JSON object keys are always strings once loaded, so store the topic id as a string;
    # an int key would sit next to the loaded "3"-style key and be dumped as a duplicate.
    topic_key = str(int(topics_df['Topic'][i]))
    file_data[topic_key] = {"Name": topics_df['Name'][i],
                            "OpenAI_label": response['choices'][0]['message']['content'],
                            "OpenAI_metadata": response}

    # Rewrite the whole file in 'w' mode: this truncates old content, so no stale bytes
    # can be left behind the way seek(0) without truncate() allowed.
    with open(labels_path, 'w') as f:
        # convert back to json
        json.dump(file_data, f, indent=4)