In [2]:
import json
import openai
import tiktoken
import pandas as pd
import numpy as np
from ast import literal_eval

# Load the OpenAI API key from a local JSON config file. The context manager
# closes the file handle deterministically instead of leaving it to the
# garbage collector, as the bare open() call did.
with open('api_config.json') as config_file:
    cfg = json.load(config_file)
openai.api_key = cfg['KEY']

In [12]:
# Tokenizer used by get_embedding to count and truncate tokens before a text is
# embedded. It is resolved from the embedding model (get_embedding's default
# model) rather than from the chat model, so the token count always refers to
# the model that actually receives the text.
encoding = tiktoken.encoding_for_model("text-embedding-ada-002")
# Maximum number of tokens get_embedding keeps from a single text.
# NOTE(review): presumably chosen to stay below the embedding model's input
# limit - confirm against the model documentation.
MAX_TOKEN_LENGTH = 8000

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for a single piece of text.

    Newlines are replaced by spaces, and texts longer than MAX_TOKEN_LENGTH
    tokens (as counted by the module-level ``encoding``) are truncated to
    their first MAX_TOKEN_LENGTH tokens before the request is made.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` field of the first item in the API response.
    """
    flattened = text.replace("\n", " ")

    # Only round-trip through the tokenizer when truncation is really needed.
    token_ids = encoding.encode(flattened)
    if len(token_ids) <= MAX_TOKEN_LENGTH:
        payload = flattened
    else:
        payload = encoding.decode(token_ids[:MAX_TOKEN_LENGTH])

    result = openai.Embedding.create(input=[payload], model=model)
    return result['data'][0]['embedding']

# Possibly give random sampling of titles to get context before classifying?
def get_cluster_label(prompt, model="gpt-3.5-turbo", role="system"):
    """Send a single-message chat completion request and return the raw response.

    Args:
        prompt: Text used as the content of the only message in the request.
        model: Name of the chat model to query. This argument is now forwarded
            to the API; previously the call hard-coded "gpt-3.5-turbo" and
            silently ignored it.
        role: Role attached to the message (e.g. "system" or "user").

    Returns:
        The response object returned by ``openai.ChatCompletion.create``,
        unmodified; callers read the text from ``choices[0].message.content``.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": role, "content": prompt},
        ],
        temperature=0.5
    )

    return response

In [26]:
# Ad-hoc trial of the labelling prompt: the instructions and a literal list of
# article titles are sent to get_cluster_label as one string, and the response
# object is shown as the cell output.
get_cluster_label('''I am going to give you a list of titles of similar articles about artificial intelligence.
All of these will be discussing different aspects of AI, so get as specific as possible.
I would like you to provide a short description of the group of articles and a few word label for the group.
This label should refer to the domain where AI is being applied in the articles. Some examples are:
"AI in Law", "Facial Recognition", "Quantum Computing", "AI applied to Science" or "AI in Culture and Art".
Format your response like this:

description: <short description>
label: <one word label>

here is the list of article titles: 
['This AI girlfriend startup is making $100k a month fulfilling people’s fetishes', 'Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification', 'Reddit Poker Bot', 'US curbs AI chip exports from Nvidia and AMD to some Middle East countries', 'The Technology Facebook and Google Didn’t Dare Release', 'In U.S.-China AI contest, the race is on to deploy killer robots', 'An AI quadcopter has beaten human champions at drone racing', 'Study finds major shortcomings in Air Force processes to test AI technologies', 'Identifying AI-generated images with SynthID', 'Inside the messy ethics of making war with machines', "New Jersey's Ocean City taps AI gun detection in hopes of thwarting mass shootings", 'Unlocking the power of AI for ocean exploration', 'These Prisoners Are Training AI', '1 Small AI Stock Just Landed New Orders Worth Over $100 Million', "Israel's new military AI systems select targets and plan missions 'in minutes'", 'Where Memory Ends and Generative AI Begins', 'PANS: A Portable Navigation Platform', 'Man beats machine at Go in human victory over AI', 'US rejects AI copyright for famous state fair-winning Midjourney art', 'Artificial Intelligence Flags Knee Osteoarthritis Using Medical Images', 'Will DALL-E the AI Artist Take My Job?', 'Microsoft files new AI-powered smart backpack patent. It can hear you and see what you see', 'As artificial intelligence and robotics advance, a basic income may be the only viable solution']''')

<OpenAIObject chat.completion id=chatcmpl-83QkxbhBOvhIjGF6IU61OGyH581S4 at 0x2062b706250> JSON: {
  "id": "chatcmpl-83QkxbhBOvhIjGF6IU61OGyH581S4",
  "object": "chat.completion",
  "created": 1695828527,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "description: These articles cover a wide range of topics related to the application of artificial intelligence in various domains. They discuss AI's role in fulfilling people's fetishes, gender classification disparities, poker playing bots, export restrictions on AI chips, unreleased technologies by Facebook and Google, the race to deploy killer robots, AI quadcopters beating human champions, shortcomings in Air Force testing of AI technologies, identifying AI-generated images, the ethics of using AI in warfare, AI gun detection for mass shooting prevention, AI for ocean exploration, prisoners training AI, a small AI stock receiving new orders, mi

In [36]:
# Read the tab-separated article data and keep only the rows whose title is
# not a single space, renumbering the index from zero afterwards.
df = (
    pd.read_csv('../data/text_data.csv', sep='\t')
    .loc[lambda frame: frame['title'] != ' ']
    .reset_index(drop=True)
)

In [3]:
# Reload the dataframe from embed_data.csv (tab-separated), the file that the
# embedding cells of this notebook write with df.to_csv.
df = pd.read_csv('embed_data.csv', sep='\t')

In [30]:
# Embed each article title with get_embedding, then write the whole frame to
# embed_data.csv (tab-separated, without the index column).
df['title_embedding'] = df['title'].map(
    lambda title: get_embedding(title, model='text-embedding-ada-002')
)
df.to_csv('embed_data.csv', sep='\t', index=False)

In [53]:
# Embed each article body with get_embedding, then write the whole frame to
# embed_data.csv (tab-separated, without the index column).
df['text_embedding'] = df['text'].map(
    lambda body: get_embedding(body, model='text-embedding-ada-002')
)
df.to_csv('embed_data.csv', sep='\t', index=False)

In [4]:
"""
K-Means Clustering
"""
from sklearn.cluster import KMeans

# Stack the per-article text embeddings into one 2-D array (one row per article).
# Embeddings reloaded from embed_data.csv are stringified lists, whereas
# embeddings computed earlier in the same kernel session are still real lists.
# literal_eval rejects the latter, so only strings are parsed and everything
# else is passed through unchanged.
matrix = np.vstack([
    literal_eval(vector) if isinstance(vector, str) else vector
    for vector in df['text_embedding']
])

# Aim for roughly ten articles per cluster, but never request zero clusters:
# with fewer than ten rows the plain division truncates to 0, which KMeans rejects.
n_clusters = max(1, len(df) // 10)

# Fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=42)
kmeans.fit(matrix)
labels = kmeans.labels_
df["cluster"] = labels

In [57]:
"""
Labels each cluster with an overall topic.
"""

# Instruction text placed in front of each cluster's list of titles.
# This is a plain string rather than an f-string: it has no placeholders, and an
# f-string would try to interpolate any brace later added to the wording.
# Two wording defects are fixed: the last example now has its closing quote, and
# the format line asks for a few-word label, matching the instructions and the
# multi-word examples. The 'description: ' and 'label: ' markers must stay as
# they are because the response parsing further down searches for them.
PROMPT_TEMPLATE = '''I am going to give you a list of titles of similar articles about artificial intelligence.
All of these will be discussing different aspects of AI, so get as specific as possible.
I would like you to provide a short description of the group of articles and a few word label for the group.
This label should refer to the domain where AI is being applied in the articles. Some examples are:
"AI in Law", "Facial Recognition", "Quantum Computing", or "AI in Culture and Art".

Format your response like this:

description: <short description>
label: <few word label>

here is the list of article titles: 
'''

# Titles with this many characters or more are left out of the prompt.
TITLE_LEN_LIMIT = 100

# One prompt per cluster, in the order of df['cluster'].unique().
prompts = []

for cluster_ind in df['cluster'].unique():
    # Gets the subframe of a specific cluster.
    c_df = df[df['cluster'] == cluster_ind]

    # Keep only real strings below the length limit; a missing title is read
    # from CSV as a float NaN and would make len() raise.
    titles = [
        title
        for title in c_df['title']
        if isinstance(title, str) and len(title) < TITLE_LEN_LIMIT
    ]

    prompts.append(PROMPT_TEMPLATE + str(titles))

In [58]:
"""
Query GPT for cluster labels based on titles.
"""
# Query the chat model once per cluster prompt and keep the raw response
# objects in the same order as `prompts`.
responses = []
responses.extend(get_cluster_label(cluster_prompt) for cluster_prompt in prompts)

In [62]:
"""
Create a dictionary for cluster information.
"""

# ChatGPT generated function.
def get_substring_between_strings(input_string, start_string, end_string):
    """Return the text located between two marker strings.

    The first occurrence of ``start_string`` is located, then the first
    occurrence of ``end_string`` that begins after that marker ends.

    Args:
        input_string: The string to search.
        start_string: Marker that precedes the wanted text.
        end_string: Marker that follows the wanted text.

    Returns:
        The text between the two markers, or None when either marker
        cannot be found.
    """
    try:
        content_start = input_string.index(start_string) + len(start_string)
        content_end = input_string.index(end_string, content_start)
    except ValueError:
        # str.index raises where str.find would return -1: a marker is missing.
        return None

    return input_string[content_start:content_end]

# Maps cluster index -> {'label': ..., 'desc': ...} parsed from the model replies.
cluster_labels = dict()
# Clusters whose reply did not contain both expected markers.
malformed_clusters = []

# The prompts were built by iterating df['cluster'].unique() and the responses
# were collected in prompt order, so zipping against the same call pairs every
# response with its cluster (as long as df['cluster'] has not changed since).
for r, cluster_ind in zip(responses, df['cluster'].unique()):
    response = r.choices[0].message.content

    # The model does not always follow the requested format. A missing marker
    # used to crash this loop (None.replace / IndexError); fall back to an
    # empty string instead and report the cluster below.
    description = get_substring_between_strings(response, 'description: ', 'label: ')
    label_parts = response.split('label: ')
    label = label_parts[1] if len(label_parts) > 1 else None

    if description is None or label is None:
        malformed_clusters.append(int(cluster_ind))

    cluster_labels[int(cluster_ind)] = {
        'label': (label or '').replace("\n", "").strip(),
        'desc': (description or '').replace("\n", "").strip(),
    }

if malformed_clusters:
    print(f"Responses not in the expected format for clusters: {malformed_clusters}")

# json.dump turns the integer cluster keys into strings in the file.
with open('cluster_labels.json', 'w') as f:
    json.dump(cluster_labels, f)

In [29]:
"""
Adds cluster data to the dataframe.
"""

# Read the saved cluster labels back from disk; the JSON keys are strings.
cluster_labels = {}
with open('cluster_labels.json', 'r') as labels_file:
    cluster_labels = json.load(labels_file)

# Look up each row's cluster (cast to str to match the JSON keys), then split
# the resulting info dict into separate label and description columns.
df['cluster_info'] = df['cluster'].astype(str).map(cluster_labels)
df['cluster_label'] = df['cluster_info'].map(lambda info: info['label'])
df['cluster_desc'] = df['cluster_info'].map(lambda info: info['desc'])

# Persist the enriched frame as a tab-separated file without the index column.
df.to_csv('cluster_data.csv', sep='\t', index=False)
