In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
import pyLDAvis
import matplotlib.pyplot as plt
from collections import Counter
import joblib
import os
import pyLDAvis.sklearn_models as sklearn_lda_vis
pyLDAvis.enable_notebook()


ModuleNotFoundError: No module named 'pyLDAvis.sklearn_models'

In [10]:
df = pd.read_csv("data/processed/lex_fridman_cleaned.csv")
df['cleaned_text'] = df['cleaned_text'].fillna('')
documents = df['cleaned_text'].tolist()


In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7, min_df=5, max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

nmf_model = NMF(n_components=5, random_state=42)
nmf_W = nmf_model.fit_transform(tfidf_matrix)

def display_topics(model, feature_names, no_top_words=10):
    for idx, topic in enumerate(model.components_):
        print(f"\n🧠 NMF Topic {idx}")
        print(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(nmf_model, tfidf_vectorizer.get_feature_names_out())



🧠 NMF Topic 0
robot, neural, robots, reinforcement, autonomous, networks, car, code, robotics, programming

🧠 NMF Topic 1
war, government, bitcoin, wan, country, united, putin, russia, trump, ukraine

🧠 NMF Topic 2
uh, um, programming, fucking, python, instruction, instructions, bobby, shit, hey

🧠 NMF Topic 3
consciousness, physics, quantum, conscious, mechanics, mathematics, gravity, string, particles, einstein

🧠 NMF Topic 4
wrestling, chess, fight, judo, martial, match, training, games, opponent, sport


In [12]:
lsa_model = TruncatedSVD(n_components=5, random_state=42)
lsa_model.fit(tfidf_matrix)

print("\n--- Top Words in LSA Topics ---")
display_topics(lsa_model, tfidf_vectorizer.get_feature_names_out())



--- Top Words in LSA Topics ---

🧠 NMF Topic 0
uh, consciousness, wan, physics, war, um, neural, robot, robots, quantum

🧠 NMF Topic 1
war, putin, russia, ukraine, fight, trump, government, united, country, hitler

🧠 NMF Topic 2
uh, um, programming, python, reinforcement, instruction, neural, instructions, code, bobby

🧠 NMF Topic 3
quantum, consciousness, physics, uh, um, mechanics, string, gravity, particles, conscious

🧠 NMF Topic 4
bitcoin, ethereum, currency, government, crypto, gold, china, blockchain, war, cryptocurrency


In [15]:
count_vectorizer = CountVectorizer(stop_words="english", max_df=0.7, min_df=5, max_features=5000)
count_matrix = count_vectorizer.fit_transform(documents)

lda_model = LatentDirichletAllocation(n_components=5, max_iter=10, learning_method='online', random_state=42)
lda_model.fit(count_matrix)

print("\n--- Top Words in LDA Topics ---")
display_topics(lda_model, count_vectorizer.get_feature_names_out())



--- Top Words in LDA Topics ---

🧠 NMF Topic 0
bitcoin, companies, market, gold, digital, network, government, business, ethereum, crypto

🧠 NMF Topic 1
war, government, wan, god, country, united, russia, russian, american, media

🧠 NMF Topic 2
chess, games, training, fight, python, programming, code, uh, win, team

🧠 NMF Topic 3
consciousness, physics, quantum, robot, uh, conscious, mathematics, robots, light, math

🧠 NMF Topic 4
neural, cells, test, planet, cell, biology, virus, models, environment, vision


In [16]:
# Optional: Save LDA topic words to JSON
os.makedirs("prediction_output", exist_ok=True)
with open("prediction_output/lda_topics.json", "w") as f:
    json.dump(lda_topics, f, indent=4)


In [18]:
from huggingface_hub import login
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
bertopic_model = BERTopic(embedding_model=embedding_model, language="english", calculate_probabilities=True)
topics, probs = bertopic_model.fit_transform(df["cleaned_text"].tolist())


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Use local model to avoid login issues
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Fit BERTopic
bertopic_model = BERTopic(
    embedding_model=embedding_model,
    language="english",
    calculate_probabilities=True,
    verbose=True
)

# Fit to cleaned text
docs = df["cleaned_text"].fillna("").tolist()
topics, probs = bertopic_model.fit_transform(docs)

# Save to DataFrame
df["bertopic_topic"] = topics

# Save result
df.to_csv("data/processed/lex_fridman_with_topics.csv", index=False)
print("✅ Saved: data/processed/lex_fridman_with_topics.csv")

# Save topic summary
topic_info = bertopic_model.get_topic_info()
topic_info.to_csv("prediction_output/bertopic_summary.csv", index=False)

# Visualize
bertopic_model.visualize_barchart(top_n_topics=10)


2025-06-19 18:03:21,971 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2025-06-19 18:03:26,201 - BERTopic - Embedding - Completed ✓
2025-06-19 18:03:26,202 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-19 18:03:26,334 - BERTopic - Dimensionality - Completed ✓
2025-06-19 18:03:26,335 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-19 18:03:26,344 - BERTopic - Cluster - Completed ✓
2025-06-19 18:03:26,345 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-19 18:03:28,467 - BERTopic - Representation - Completed ✓


✅ Saved: data/processed/lex_fridman_with_topics.csv


In [22]:
# Fit and transform using BERTopic
topics, probs = bertopic_model.fit_transform(df['cleaned_text'].tolist())

# Save topics to DataFrame
df['predicted_topic'] = topics


2025-06-19 18:51:19,977 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/11 [00:00<?, ?it/s]

2025-06-19 18:51:24,204 - BERTopic - Embedding - Completed ✓
2025-06-19 18:51:24,205 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-06-19 18:51:24,341 - BERTopic - Dimensionality - Completed ✓
2025-06-19 18:51:24,342 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-06-19 18:51:24,352 - BERTopic - Cluster - Completed ✓
2025-06-19 18:51:24,354 - BERTopic - Representation - Extracting topics from clusters using representation models.
2025-06-19 18:51:26,498 - BERTopic - Representation - Completed ✓


In [23]:
guest_topic = df.groupby('guest')['predicted_topic'].agg(['count', pd.Series.mode])
guest_topic.columns = ['appearances', 'dominant_topic']
guest_topic = guest_topic.sort_values('appearances', ascending=False)
guest_topic.head(10)


Unnamed: 0_level_0,appearances,dominant_topic
guest,Unnamed: 1_level_1,Unnamed: 2_level_1
Manolis Kellis,4,0
Michael Malice,4,0
Eric Weinstein,4,0
Elon Musk,3,0
Andrew Huberman,3,0
Stephen Wolfram,3,0
van Rossum,2,0
François Chollet,2,0
Chris Lattner,2,0
Sean Carroll,2,0


In [26]:
# Save topic labels for classification
df.to_csv("data/processed/lex_fridman_with_topics.csv", index=False)
print("✅ BERTopic-labeled data saved: data/processed/lex_fridman_with_topics.csv")

✅ BERTopic-labeled data saved: data/processed/lex_fridman_with_topics.csv


In [27]:
# === Save BERTopic Summary ===
topic_info = bertopic_model.get_topic_info()
topic_info.to_csv("prediction_output/bertopic_summary.csv", index=False)


In [28]:
bertopic_model.visualize_barchart(top_n_topics=10)


In [29]:
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

NameError: name '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' is not defined