In [1]:
import json
import openai
import tiktoken
import pandas as pd
import numpy as np
from ast import literal_eval

# Read the API configuration from a local JSON file. The context manager
# closes the file handle as soon as parsing finishes instead of leaving it
# open for the lifetime of the kernel.
with open('api_config.json') as cfg_file:
    cfg = json.load(cfg_file)

# The 'KEY' entry of the config is used as the OpenAI API key.
openai.api_key = cfg['KEY']

In [10]:
# Tokenizer used by the helpers below to count tokens and to truncate text
# before it is sent to the API.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
# Upper bound (in tokens) on the text passed to the embedding call in
# get_embedding; longer inputs are cut down to this many tokens.
# NOTE(review): presumably chosen to stay under the embedding model's input
# limit -- confirm against the model documentation.
EMBED_MAX_TOKEN_LENGTH = 8000

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` produced by ``openai.Embedding.create``.

    Newlines in ``text`` are replaced by spaces. If the result is longer than
    EMBED_MAX_TOKEN_LENGTH tokens (as counted by the module-level
    ``encoding``), only the first EMBED_MAX_TOKEN_LENGTH tokens are kept.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``'embedding'`` entry of the first item in the response's
        ``'data'`` list.
    """
    flattened = " ".join(text.split("\n"))

    token_ids = encoding.encode(flattened)
    over_limit = len(token_ids) > EMBED_MAX_TOKEN_LENGTH
    # Only round-trip through the tokenizer when truncation is actually needed.
    payload = encoding.decode(token_ids[:EMBED_MAX_TOKEN_LENGTH]) if over_limit else flattened

    result = openai.Embedding.create(input=[payload], model=model)
    return result['data'][0]['embedding']

# Possibly give random sampling of titles to get context before classifying?
def get_cluster_label(prompt, model="gpt-3.5-turbo", role="system"):
    """Send ``prompt`` as a single chat message and return the raw response.

    Args:
        prompt: Full text of the message to send.
        model: Chat model name forwarded to the API. The original code
            ignored this argument and always used "gpt-3.5-turbo"; it is now
            honored (the default is unchanged).
        role: Role attached to the single message.

    Returns:
        The object returned by ``openai.ChatCompletion.create``; callers read
        the text from ``response.choices[0].message.content``.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": role, "content": prompt},
        ],
        temperature=0.5,
    )

    return response

# Maximum number of tokens of article text sent to the chat model.
# NOTE(review): the instruction prefix and the model's reply also consume
# context tokens -- confirm this leaves enough room for the chosen model.
LLM_MAX_TOKEN_LENGTH = 4000

# Instruction prepended to the article text.
SUMMARY_PROMPT = "write a few bullet point synopsis about this article for technical readers: "
# Backward-compatible alias for the original, misspelled constant name.
SUMAMRY_PROMPT = SUMMARY_PROMPT


def get_bullet_summary(text, model="gpt-3.5-turbo", role="system"):
    """Ask the chat model for a bullet-point synopsis of ``text``.

    Newlines are replaced by spaces and the text is truncated to
    LLM_MAX_TOKEN_LENGTH tokens (counted with the module-level ``encoding``)
    before SUMMARY_PROMPT is prepended.

    Args:
        text: Article text to summarize.
        model: Chat model name forwarded to the API. The original code
            ignored this argument and always used "gpt-3.5-turbo"; it is now
            honored (the default is unchanged).
        role: Role attached to the single message.

    Returns:
        The content string of the first choice in the response.
    """
    text = text.replace("\n", " ")

    tokens = encoding.encode(text)
    if len(tokens) > LLM_MAX_TOKEN_LENGTH:
        text = encoding.decode(tokens[:LLM_MAX_TOKEN_LENGTH])

    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": role, "content": SUMMARY_PROMPT + text},
        ],
    )

    return response.choices[0].message.content


In [4]:
raw_articles = pd.read_csv('../data/text_data.csv', sep='\t')

# Keep only the rows whose title is something other than a single space,
# then renumber the index from zero.
has_title = raw_articles['title'].ne(' ')
df = raw_articles.loc[has_title].reset_index(drop=True)

In [30]:
# Sanity check: length of the title embedding stored in the first row.
len(df['title_embedding'].iloc[0])

1536

In [3]:
df = pd.read_csv('embed_data.csv', sep='\t')

# Embedding columns are saved to CSV as the text form of a list, so they are
# read back as strings. Parse them into real lists again; otherwise the
# clustering step would try to stack strings instead of numeric vectors.
# Cells that are not strings (e.g. missing values) are left untouched.
for embedding_col in ('title_embedding', 'text_embedding'):
    if embedding_col in df.columns:
        df[embedding_col] = df[embedding_col].apply(
            lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
        )

In [6]:
# Embed every article title, then checkpoint the whole frame to disk.
title_vectors = df['title'].map(
    lambda title: get_embedding(title, model='text-embedding-ada-002')
)
df['title_embedding'] = title_vectors
df.to_csv('embed_data.csv', sep='\t', index=False)

In [7]:
# Embed every article body, then checkpoint the whole frame to disk.
text_vectors = df['text'].map(
    lambda body: get_embedding(body, model='text-embedding-ada-002')
)
df['text_embedding'] = text_vectors
df.to_csv('embed_data.csv', sep='\t', index=False)

In [11]:
# Summarize every article body as bullet points, then checkpoint to disk.
bullet_summaries = df['text'].map(get_bullet_summary)
df['summary'] = bullet_summaries
df.to_csv('embed_data.csv', sep='\t', index=False)

In [36]:
"""
K-Means Clustering
"""
from sklearn.cluster import KMeans

# Target average number of articles per cluster; determines how many clusters
# are requested from K-Means.
ARTICLES_PER_CLUSTER = 5

# Stack the per-row title embeddings into a single 2-D array, one row per article.
matrix = np.vstack(df['title_embedding'].to_list())

# Request at least one cluster: with fewer than ARTICLES_PER_CLUSTER rows the
# plain division would yield 0, which K-Means rejects.
n_clusters = max(1, len(df) // ARTICLES_PER_CLUSTER)

# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
df["cluster"] = labels

In [37]:
"""
Labels each cluster with a overall topic.
"""

# Plain string rather than an f-string: the template has no placeholders, and
# the title list is appended afterwards.
# The response format below must keep the literal markers "description: " and
# "label: ", because the parsing step searches the reply for them.
# Prompt fixes: the last example now has its closing quote, and the format
# placeholder asks for a few-word label, matching the instructions and examples.
PROMPT_TEMPLATE = '''I am going to give you a list of titles of similar articles about artificial intelligence.
All of these will be discussing different aspects of AI, so get as specific as possible.
I would like you to provide a short description of the group of articles and a few word label for the group.
This label should refer to the domain where AI is being applied in the articles. Some examples are:
"AI in Law", "Facial Recognition", "Quantum Computing", or "AI in Culture and Art". 

Format your response like this:

description: <short description>
label: <few word label>

here is the list of article titles: 
'''

# One prompt per cluster, in the order of df['cluster'].unique(); the parsing
# cell pairs responses with clusters using that same order.
prompts = []

for cluster_ind in df['cluster'].unique():
    # Titles of all articles assigned to this cluster.
    titles = df.loc[df['cluster'] == cluster_ind, 'title'].tolist()
    prompts.append(PROMPT_TEMPLATE + str(titles))

In [38]:
"""
Query GPT for cluster labels based on titles.
"""
# One API call per prompt; responses stay in the same order as `prompts`.
responses = [get_cluster_label(cluster_prompt) for cluster_prompt in prompts]

In [39]:
"""
Create a dictionary for cluster information.
"""

# ChatGPT generated function.
def get_substring_between_strings(input_string, start_string, end_string):
    """Return the text between the first ``start_string`` and the next ``end_string``.

    The search for ``end_string`` begins right after the first occurrence of
    ``start_string``.

    Args:
        input_string: Text to search.
        start_string: Marker that precedes the wanted text.
        end_string: Marker that follows the wanted text.

    Returns:
        The substring strictly between the two markers, or ``None`` when
        either marker cannot be found.
    """
    try:
        # str.index raises ValueError exactly where str.find would return -1.
        body_start = input_string.index(start_string) + len(start_string)
        body_end = input_string.index(end_string, body_start)
    except ValueError:
        return None

    return input_string[body_start:body_end]

# Maps cluster id -> {'label': ..., 'desc': ...} parsed from the model replies.
cluster_labels = dict()

# Responses were generated in the order of df['cluster'].unique(), so zipping
# against the same call pairs each reply with its cluster.
for r, cluster_ind in zip(responses, df['cluster'].unique()):
    response = r.choices[0].message.content

    description = get_substring_between_strings(response, 'description: ', 'label: ')
    label_parts = response.split('label: ')

    if description is None or len(label_parts) < 2:
        # The model did not follow the requested "description: ... label: ..."
        # layout. Keep the raw reply as the description instead of crashing,
        # and say so, so the entry can be reviewed by hand.
        print(f"Cluster {cluster_ind}: response not in the expected format; storing raw text as description.")
        description = response
        label = label_parts[1] if len(label_parts) > 1 else ''
    else:
        label = label_parts[1]

    # int() converts the cluster id to a plain Python int so it can be used as
    # a JSON object key.
    cluster_labels[int(cluster_ind)] = {
        'label': label.replace("\n", ""),
        'desc': description.replace("\n", ""),
    }

with open('cluster_labels.json', 'w') as f:
    json.dump(cluster_labels, f)

In [40]:
"""
Adds cluster data to the dataframe.
"""

cluster_labels = {}
with open('cluster_labels.json', 'r') as labels_file:
    cluster_labels = json.load(labels_file)

# Keys in the JSON file are strings, so look clusters up by their string form.
cluster_keys = df['cluster'].astype(str)
df['cluster_info'] = cluster_keys.map(cluster_labels)

# Split the per-row info dict into separate label and description columns.
df['cluster_label'] = df['cluster_info'].map(lambda info: info['label'])
df['cluster_desc'] = df['cluster_info'].map(lambda info: info['desc'])

df.to_csv('cluster_data.csv', sep='\t', index=False)
