In [1]:
import pandas as pd
import math

# Load the existing codebook and dataset
codebook_path = "codebook.csv"
dataset_path = "dataset.csv"

# Read the existing codebook
codebook_df = pd.read_csv(codebook_path)

# Read the dataset
dataset_df = pd.read_csv(dataset_path)

# Show basic info
codebook_df.head(), dataset_df.head(), len(dataset_df)



(Empty DataFrame
 Columns: [Code, Description, Rationale, Example_ids]
 Index: [],
    Serial No.                                      unique_app_id       id  \
 0           1  20250512154221992300_ArtificialInteligence_1iz...  1izv9kf   
 1           2  20250512154221992300_ArtificialInteligence_1kj...  1kjwb85   
 2           3  20250512154221992300_ArtificialInteligence_1dl...  1dlw98o   
 3           4  20250512154221992300_ArtificialInteligence_1k3...  1k3rrkx   
 4           5  20250512154221992300_ArtificialInteligence_1je...  1je0frk   
 
    type                                              title  score  \
 0  post  Hot take: LLMs are not gonna get us to AGI, an...    467   
 1  post  When do you think the real AGI boom will happe...     83   
 2  post  The more I learn about AI the less I believe w...    431   
 3  post  dont care about agi/asi definitions; ai is "sm...     73   
 4  post  Google Deepmind CEO predicts AGI will emerge b...    158   
 
                  author 

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Use TF-IDF to vectorize the post texts
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9, min_df=2)
X = vectorizer.fit_transform(dataset_df["text"])

# Estimate the number of clusters (codes)
n_clusters = min(30, round(math.sqrt(len(dataset_df))))

# Apply KMeans clustering
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
dataset_df['cluster'] = kmeans.fit_predict(X)

# Check top terms in each cluster to help label them
top_terms_per_cluster = []
terms = vectorizer.get_feature_names_out()
for i in range(n_clusters):
    center = kmeans.cluster_centers_[i]
    top_indices = center.argsort()[::-1][:10]
    top_terms = [terms[ind] for ind in top_indices]
    top_terms_per_cluster.append(top_terms)

top_terms_per_cluster



ValueError: np.nan is an invalid document, expected byte or unicode string.

In [3]:
# Drop rows with missing text
dataset_df = dataset_df.dropna(subset=["text"])

# Re-run TF-IDF vectorization
X = vectorizer.fit_transform(dataset_df["text"])

# Recalculate clusters
n_clusters = min(30, round(math.sqrt(len(dataset_df))))
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
dataset_df['cluster'] = kmeans.fit_predict(X)

# Extract top terms per cluster again
top_terms_per_cluster = []
terms = vectorizer.get_feature_names_out()
for i in range(n_clusters):
    center = kmeans.cluster_centers_[i]
    top_indices = center.argsort()[::-1][:10]
    top_terms = [terms[ind] for ind in top_indices]
    top_terms_per_cluster.append(top_terms)

top_terms_per_cluster



[['tests',
  'architecture',
  'scaling',
  'chain',
  'years',
  'emergent',
  'needed',
  'far',
  'models',
  'inherent'],
 ['ai',
  'human',
  'agi',
  'intelligence',
  'awareness',
  'thought',
  'just',
  'time',
  'insane',
  'quantum'],
 ['humans',
  'agi',
  'think',
  'going',
  'tech',
  'people',
  'ai',
  'better',
  'ask',
  'human'],
 ['humans',
  'model',
  'reasoning',
  'models',
  'org',
  'arc',
  'abs',
  'agi',
  'arxiv',
  'https'],
 ['agi',
  'llms',
  'human',
  'data',
  'definition',
  'level',
  'high',
  'research',
  'training',
  'companies'],
 ['com',
  'https',
  'status',
  'poll',
  'www',
  'reddit',
  'view',
  'focus',
  'google',
  'youtube'],
 ['agi',
  'economic',
  'government',
  'day',
  'developed',
  'ai',
  'open',
  'people',
  'source',
  'world'],
 ['just',
  'don',
  'know',
  'agi',
  'ai',
  'agents',
  've',
  'saying',
  'wondering',
  'achieved'],
 ['ai',
  'agi',
  'said',
  'good',
  'developing',
  'videos',
  'jensen',
  'poi

In [5]:
# Map from cluster index to final labels and descriptions (based on top terms)
cluster_labels = {
    0: ("Quantum AI", "Posts discussing quantum computing, neural design, and advanced AI hardware."),
    1: ("AGI Ethics", "Discussions on ethical, democratic, or societal aspects of AGI development."),
    2: ("Curiosity & Questions", "Posts asking speculative or curious questions about AGI and AI futures."),
    3: ("Human vs AI", "Comparisons between human intelligence and AI models or capabilities."),
    4: ("LLM Architecture", "Discussion of LLM internals, scaling laws, and data-driven design."),
    5: ("Off-topic Links", "Posts primarily linking external content or polls with limited commentary."),
    6: ("Breakthrough Claims", "Posts declaring or speculating on recent breakthroughs in AGI."),
    7: ("Futurism Attitudes", "Optimistic or skeptical reflections on AGI and future tech trajectories."),
    8: ("AGI Consciousness", "Debates on AI awareness, sentience, or philosophical identity."),
    9: ("Hype & Misinformation", "Cynical takes on AGI hype, tech claims, or disillusionment."),
    10: ("Corporate AGI News", "News or reports from companies like DeepMind or OpenAI on AGI progress."),
}

# Create the codebook entries
codebook_entries = []
for cluster_id, (code, description) in cluster_labels.items():
    subset = dataset_df[dataset_df['cluster'] == cluster_id]
    example_ids = subset['unique_app_id'].head(3).tolist()
    rationale = f"This code captures a distinct discussion cluster focused on {description.split()[0].lower()} topics."
    codebook_entries.append({
        "Code": code,
        "Description": description,
        "Rationale": rationale,
        "Example_ids": ", ".join(example_ids)
    })

# Convert to DataFrame for CSV output
final_codebook_df = pd.DataFrame(codebook_entries, columns=["Code", "Description", "Rationale", "Example_ids"])

# Export as raw CSV text (RFC 4180 compliant)
import io
csv_buffer = io.StringIO()
final_codebook_df.to_csv(csv_buffer, index=False, line_terminator="\n", quoting=1)  # quoting=1 for QUOTE_ALL

csv_output = csv_buffer.getvalue()
csv_output



TypeError: NDFrame.to_csv() got an unexpected keyword argument 'line_terminator'

In [6]:
# Map from cluster index to final labels and descriptions (based on top terms)
cluster_labels = {
    0: ("Quantum AI", "Posts discussing quantum computing, neural design, and advanced AI hardware."),
    1: ("AGI Ethics", "Discussions on ethical, democratic, or societal aspects of AGI development."),
    2: ("Curiosity & Questions", "Posts asking speculative or curious questions about AGI and AI futures."),
    3: ("Human vs AI", "Comparisons between human intelligence and AI models or capabilities."),
    4: ("LLM Architecture", "Discussion of LLM internals, scaling laws, and data-driven design."),
    5: ("Off-topic Links", "Posts primarily linking external content or polls with limited commentary."),
    6: ("Breakthrough Claims", "Posts declaring or speculating on recent breakthroughs in AGI."),
    7: ("Futurism Attitudes", "Optimistic or skeptical reflections on AGI and future tech trajectories."),
    8: ("AGI Consciousness", "Debates on AI awareness, sentience, or philosophical identity."),
    9: ("Hype & Misinformation", "Cynical takes on AGI hype, tech claims, or disillusionment."),
    10: ("Corporate AGI News", "News or reports from companies like DeepMind or OpenAI on AGI progress."),
}

# Create the codebook entries
codebook_entries = []
for cluster_id, (code, description) in cluster_labels.items():
    subset = dataset_df[dataset_df['cluster'] == cluster_id]
    example_ids = subset['unique_app_id'].head(3).tolist()
    rationale = f"This code captures a distinct discussion cluster focused on {description.split()[0].lower()} topics."
    codebook_entries.append({
        "Code": code,
        "Description": description,
        "Rationale": rationale,
        "Example_ids": ", ".join(example_ids)
    })

# Convert to DataFrame for CSV output
final_codebook_df = pd.DataFrame(codebook_entries, columns=["Code", "Description", "Rationale", "Example_ids"])

# Export as raw CSV text (RFC 4180 compliant)
import io
csv_buffer = io.StringIO()
final_codebook_df.to_csv(csv_buffer, index=False, lineterminator="\n", quoting=1)

csv_output = csv_buffer.getvalue()
csv_output



'"Code","Description","Rationale","Example_ids"\n"Quantum AI","Posts discussing quantum computing, neural design, and advanced AI hardware.","This code captures a distinct discussion cluster focused on posts topics.","20250512154221992300_ArtificialInteligence_1b3gf38_10, 20250512154221992300_ArtificialInteligence_1h0u7cs_11, 20250512154221992300_ArtificialInteligence_1f9igzu_25"\n"AGI Ethics","Discussions on ethical, democratic, or societal aspects of AGI development.","This code captures a distinct discussion cluster focused on discussions topics.","20250512154221992300_ArtificialInteligence_1je0frk_4, 20250512154221992300_ArtificialInteligence_1hbqe1d_21, 20250512154221992300_ArtificialInteligence_1gonbz9_48"\n"Curiosity & Questions","Posts asking speculative or curious questions about AGI and AI futures.","This code captures a distinct discussion cluster focused on posts topics.","20250512154221992300_ArtificialInteligence_1kjwb85_1, 20250512154221992300_ArtificialInteligence_1k3rr