In [1]:
import pandas as pd  
import nltk  
from gensim.models import Word2Vec  
from sklearn.metrics.pairwise import cosine_similarity  
from sklearn.cluster import KMeans
import numpy as np  
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
nltk.download('punkt')  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Cargar el archivo CSV
#file_path = '/kaggle/input/lex-fridman-podcast-transcript/podcastdata_dataset.csv'
file_path = 'podcastdata_dataset.csv'
df = pd.read_csv(file_path)

In [4]:
# Funciones para contar palabras y oraciones
def count_words(text):
    if pd.isna(text):
        return 0
    return len(text.split())

def count_sentences(text):
    if pd.isna(text):
        return 0
    return len(text.split('.'))  

In [5]:
# Añadir columnas para número de palabras y oraciones
df['Word Count'] = df['text'].apply(count_words)
df['Sentence Count'] = df['text'].apply(count_sentences)

result = df[['id', 'text', 'Word Count', 'Sentence Count']]
print(result)

      id                                               text  Word Count  \
0      1  As part of MIT course 6S099, Artificial Genera...       13424   
1      2  As part of MIT course 6S099 on artificial gene...       10217   
2      3  You've studied the human mind, cognition, lang...        5989   
3      4  What difference between biological neural netw...        5993   
4      5  The following is a conversation with Vladimir ...        6374   
..   ...                                                ...         ...   
314  321  By the time he gets to 2045, we'll be able to ...       12807   
315  322  there's a broader question here, right? As we ...       26034   
316  323  Once this whole thing falls apart and we are c...       25255   
317  324  you could be the seventh best player in the wh...       29911   
318  325  turns out that if you train a planarian and th...       33714   

     Sentence Count  
0               611  
1               499  
2               292  
3          

In [6]:
# Listas para almacenar las oraciones y los mapeos
all_sentences = []  
ep_sentence_map = []  

In [7]:
# Recorrer cada fila del DataFrame
for index, row in df.iterrows():
    episode_id = row['id']  # ID del episodio
    text = str(row['text']) if pd.notna(row['text']) else ""  
    sentences = nltk.sent_tokenize(text)  # Dividir el texto en oraciones

    # Recorrer cada oración y asignar un ID de oración dentro del episodio
    for i, sentence in enumerate(sentences):
        all_sentences.append(nltk.word_tokenize(sentence.lower()))  
        ep_sentence_map.append({'ep_id': episode_id, 'st_id': i + 1, 'text': sentence})  

In [8]:
# Entrenar el modelo Word2Vec con todas las oraciones
word2vec_model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=1, workers=4)

In [9]:
# Función para generar el embedding de una oración
def sentence_embedding(sentence):
    words = nltk.word_tokenize(sentence.lower())
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]  
    if len(word_vectors) == 0:
        return np.zeros(100)  
    return np.mean(word_vectors, axis=0)  

In [10]:
# Crear el nuevo DataFrame con las columnas ep_id, st_id, text, embedding
data = []

for row in ep_sentence_map:
    embedding = sentence_embedding(row['text'])  
    data.append({
        'ep_id': row['ep_id'],  
        'st_id': row['st_id'],  
        'text': row['text'],  
        'embedding': embedding.tolist()  
    })

embedding_df = pd.DataFrame(data)  
print(embedding_df)

        ep_id  st_id                                               text  \
0           1      1  As part of MIT course 6S099, Artificial Genera...   
1           1      2                     He is a professor here at MIT.   
2           1      3  He's a physicist, spent a large part of his ca...   
3           1      4  But he's also studied and delved into the bene...   
4           1      5  Amongst many other things, he is the cofounder...   
...       ...    ...                                                ...   
443537    325   2085                                Is it in the cells?   
443538    325   2086  There are many, many layers to this as always ...   
443539    325   2087                    So there are chemical networks.   
443540    325   2088   So for example, gene regulatory networks, right?   
443541    325   2089   Which, or basically any kind of chemical pathway   

                                                embedding  
0       [0.3643413186073303, -0.3646314

In [11]:
# Función para medir la similitud de la consulta (query) con las oraciones
def get_similar_sentences(query, embedding_df, threshold=0.8):
    query_embedding = sentence_embedding(query)  
    embedding_df['similarity'] = embedding_df['embedding'].apply(lambda x: cosine_similarity([query_embedding], [x])[0][0])
    similar_sentences = embedding_df[embedding_df['similarity'] >= threshold]  
    return similar_sentences[['ep_id', 'st_id', 'text', 'similarity']]  

In [12]:
query = "machine learning inventor"
similar_sentences = get_similar_sentences(query, embedding_df)
print(similar_sentences) 

        ep_id  st_id                                               text  \
1997        4    333                   GANs and reinforcement learning.   
2237        5    214                   It is standard machine learning.   
39052      54    716                           That's machine learning.   
53154      70   1120                                  Machine learning.   
53155      70   1121                                  Machine learning.   
61408      81    869  Is it the success of machine learning and rein...   
72598      93    172  Yeah, and in the case of machine learning is a...   
73566      94    633                            Reinforcement learning.   
74451      95    436                       Rework Deep Learning Summit.   
86560     108    222  What's the role of simulation in reinforcement...   
87009     108    671  So reinforcement learning can be viewed as a g...   
87010     108    672  You can certainly cast supervised learning as ...   
124609    132   1991     

In [13]:
# Obtener lista de episodios únicos
episode_ids = embedding_df['ep_id'].unique()

In [14]:
import os
from sklearn.cluster import KMeans

# Evitar el memory leak en Windows
os.environ["OMP_NUM_THREADS"] = "1"  # Limitar a un solo hilo

In [15]:
# Crear lista para almacenar datos de todos los tópicos
all_topic_data = []

In [None]:
# Recorrer cada episodio
for ep_id in episode_ids:
    # Filtrar las oraciones del episodio actual
    episode_embeddings = embedding_df[embedding_df['ep_id'] == ep_id].copy()  # Copia explícita para evitar SettingWithCopyWarning
    embeddings = np.array(episode_embeddings['embedding'].tolist())

    # Clustering con KMeans para modelado de tópicos
    n_topics = 5  # Número de tópicos estimado
    kmeans = KMeans(n_clusters=n_topics, random_state=0, n_init=10)  # Evitar múltiples hilos
    episode_embeddings['topic'] = kmeans.fit_predict(embeddings)

    # Generación de embeddings de tópicos
    for topic_id in episode_embeddings['topic'].unique():
        topic_sentences = episode_embeddings[episode_embeddings['topic'] == topic_id]
        
        # Concatenar todas las oraciones del tópico y calcular el embedding
        combined_text = " ".join(topic_sentences['text'].values)
        topic_embedding = sentence_embedding(combined_text)

        all_topic_data.append({
            'ep_id': ep_id,  # ID del episodio actual
            'topic': topic_id,  # ID del tópico
            'embedding': topic_embedding.tolist()  # Embedding del tópico
        })


In [None]:
# Crear un DataFrame con los tópicos de todos los episodios
all_topics_df = pd.DataFrame(all_topic_data)
print("DataFrame de tópicos para todos los episodios:")
print(all_topics_df)

In [None]:
# Guardar en un archivo CSV
all_topics_df.to_csv('all_episodes_topics.csv', index=False)
