In [13]:
import ast
import os
import re
import string
from collections import Counter
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from datasets import load_dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [15]:
# Setup
# Fetch the NLTK resources this notebook relies on (tokenizer data and stop words).
for resource in ("punkt", "stopwords"):
    nltk.download(resource)

# Load the dataset from Hugging Face (falls back to the local cache when offline)
dataset = load_dataset("Whispering-GPT/lex-fridman-podcast")

# Convert the train split to a pandas DataFrame
train_split = dataset['train']
df = pd.DataFrame(train_split)

# Display structure
print(f"✅ Dataset loaded with {len(df)} rows and columns: {df.columns.tolist()}")

# Save the raw CSV, creating its parent directory ("data/raw") first if needed
csv_path = "data/raw/lex_fridman_podcast.csv"
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)

print(f"✅ Saved dataset to {csv_path}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\archa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\archa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using the latest cached version of the dataset since Whispering-GPT/lex-fridman-podcast couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\archa\.cache\huggingface\datasets\Whispering-GPT___lex-fridman-podcast\default\0.0.0\89ae90cf6e8d21e4f81b581252f1c8f4964b2de3 (last modified on Mon Jun  2 20:34:28 2025).


✅ Dataset loaded with 346 rows and columns: ['id', 'channel', 'channel_id', 'title', 'categories', 'tags', 'description', 'text', 'segments']
✅ Saved dataset to data/raw/lex_fridman_podcast.csv


In [16]:
# === Clean & Normalize ===
# Replace missing transcripts with empty strings so the string operations
# applied to this column afterwards never receive NaN.
df['text'] = df['text'].fillna('')

@lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded once and cached."""
    return frozenset(stopwords.words("english"))


def clean_text(text):
    """Normalize a transcript into a space-joined string of content words.

    Steps: lowercase, strip URLs, strip digit runs, collapse whitespace,
    tokenize with NLTK's ``word_tokenize``, then keep only purely alphabetic
    tokens that are not English stop words.

    Args:
        text: Raw transcript string (must support ``.lower()``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when
        nothing survives.
    """
    text = text.lower()
    # URLs must be removed before digits: stripping digits first can reduce
    # e.g. "https://123" to a bare "https://", which the URL pattern no longer
    # matches, so its fragments would survive into tokenization.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\s+', ' ', text)

    stop_words = _english_stop_words()
    # isalpha() already rejects punctuation-only tokens, so no separate
    # punctuation filter is needed.
    return ' '.join(
        tok for tok in word_tokenize(text)
        if tok.isalpha() and tok not in stop_words
    )

# Run clean_text over every transcript and store the result in a new column,
# leaving the original 'text' column untouched.
df['cleaned_text'] = df['text'].apply(clean_text)


In [17]:
# === Tokenize ===
# Short contraction fragments ('s', 't', 're', ...) combined with NLTK's
# English stop-word list into a single lookup set.
custom_stopwords = {'s', 't', 're', 've', 'll', 'm'}
custom_stopwords.update(stopwords.words("english"))

def tokenize(text):
    """Split text into lowercase word tokens with stop words removed.

    The input is coerced to ``str``, lowercased, stripped of URLs and digit
    runs, and then split into ``\\w+`` runs with a regex. Tokens found in the
    module-level ``custom_stopwords`` set are dropped.

    Args:
        text: Value to tokenize; anything accepted by ``str()``.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = str(text).lower()
    # URLs must be removed before digits: stripping digits first can reduce
    # e.g. "https://123" to a bare "https://", which the URL pattern no longer
    # matches and which would then leak an "https" token.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\d+', '', text)
    # No whitespace normalization is needed here: findall only picks out word
    # runs and ignores whatever separates them.
    tokens = re.findall(r'\b\w+\b', text)
    # `in string.punctuation` is a substring test on a str; since tokens consist
    # solely of word characters, the only token it can reject is a lone "_".
    return [t for t in tokens if t not in custom_stopwords and t not in string.punctuation]

# Tokenize the raw 'text' column (not 'cleaned_text'); each row of 'tokens'
# holds the list returned by tokenize.
df['tokens'] = df['text'].apply(tokenize)

In [6]:
# === Save Processed File ===
# Write the full DataFrame to the processed CSV, creating its parent
# directory ("data/processed") first if it does not exist yet.
processed_csv = "data/processed/lex_fridman_cleaned.csv"
os.makedirs(os.path.dirname(processed_csv), exist_ok=True)
df.to_csv(processed_csv, index=False)
print("✅ Cleaned dataset saved to: data/processed/lex_fridman_cleaned.csv")


✅ Cleaned dataset saved to: data/processed/lex_fridman_cleaned.csv


In [18]:
# === Lexical Stats ===
# Flatten the per-row token lists into one corpus-wide list.
all_tokens = []
for row_tokens in df['tokens']:
    all_tokens.extend(row_tokens)

total_tokens = len(all_tokens)
unique_tokens = len(set(all_tokens))

# Both ratios fall back to 0 when the corpus is empty, avoiding a division by zero.
if total_tokens:
    lexical_diversity = unique_tokens / total_tokens
    avg_len = sum(map(len, all_tokens)) / total_tokens
else:
    lexical_diversity = 0
    avg_len = 0

lexical_summary = {
    "Total Tokens": total_tokens,
    "Unique Tokens": unique_tokens,
    "Lexical Diversity": round(lexical_diversity, 4),
    "Average Token Length": round(avg_len, 2)
}
print("\n📊 Lexical Summary:")
print(lexical_summary)


📊 Lexical Summary:
{'Total Tokens': 3473801, 'Unique Tokens': 50018, 'Lexical Diversity': 0.0144, 'Average Token Length': 5.95}


In [19]:
# === Guest Extraction ===
def extract_guest_name(title, max_words=2):
    """Guess the guest's name from a title shaped like "Guest Name: Topic ...".

    The text before the first colon is split on whitespace and its last
    ``max_words`` words are returned, joined by single spaces. A prefix with
    fewer words than ``max_words`` is returned in full, so a one-word guest
    name yields that word rather than the whole title.

    Args:
        title: Episode title. Non-string values (e.g. None or NaN) are
            returned unchanged.
        max_words: Maximum number of trailing words of the prefix to keep.
            Must be at least 1. Defaults to 2 ("First Last").

    Returns:
        The extracted name, or ``title`` itself when it is not a string,
        contains no colon, or has nothing but whitespace before the colon.

    Raises:
        ValueError: If ``max_words`` is less than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    if not isinstance(title, str) or ":" not in title:
        return title
    # Only the part before the first colon can hold the guest name.
    words = title.split(":", 1)[0].split()
    if not words:
        return title
    return " ".join(words[-max_words:])

# Derive a 'guest' column by running extract_guest_name on every episode title.
df['guest'] = df['title'].apply(extract_guest_name)

In [25]:
# Print the first 10 title/guest pairs to spot-check the extraction
print(df[['title', 'guest']].head(10))

                                               title            guest
0  Jed Buchwald: Isaac Newton and the Philosophy ...     Jed Buchwald
1  Sergey Nazarov: Chainlink, Smart Contracts, an...   Sergey Nazarov
2  Stephen Wolfram: Fundamental Theory of Physics...  Stephen Wolfram
3  Philip Goff: Consciousness, Panpsychism, and t...      Philip Goff
4  Oriol Vinyals: DeepMind AlphaStar, StarCraft, ...    Oriol Vinyals
5  Ray Dalio: Principles, the Economic Machine, A...        Ray Dalio
6  Michael Malice: Totalitarianism and Anarchy | ...   Michael Malice
7  Tomaso Poggio: Brains, Minds, and Machines | L...    Tomaso Poggio
8  George Hotz: Comma.ai, OpenPilot, and Autonomo...      George Hotz
9  Tim Dillon: Comedy, Power, Conspiracy Theories...       Tim Dillon
