In [1]:
!pip install nltk gensim tqdm



In [2]:
import numpy as np
import pandas as pd
import re
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from tqdm import tqdm

In [7]:
# Funciones de utilidad para procesamiento de texto
def analyze_text_metrics(text):
    word_count = len(text.split(" "))
    sentence_count = len([sent for sent in re.split(r'[.]', text) if sent.strip()])
    return word_count, sentence_count

# Cargar datos
podcast_data = pd.read_csv('data/podcastdata_dataset.csv')

# Calcular métricas básicas
podcast_data['word_count'], podcast_data['sentence_count'] = zip(*podcast_data['text'].apply(analyze_text_metrics))

In [15]:
# Visualizar el DataFrame inicial
print("=== DATASET ORIGINAL ===")
print(podcast_data)

=== DATASET ORIGINAL ===
      id             guest                                              title  \
0      1       Max Tegmark                                           Life 3.0   
1      2     Christof Koch                                      Consciousness   
2      3     Steven Pinker                            AI in the Age of Reason   
3      4     Yoshua Bengio                                      Deep Learning   
4      5   Vladimir Vapnik                               Statistical Learning   
..   ...               ...                                                ...   
314  321      Ray Kurzweil    Singularity, Superintelligence, and Immortality   
315  322  Rana el Kaliouby   Emotion AI, Social Robots, and Self-Driving Cars   
316  323        Will Sasso  Comedy, MADtv, AI, Friendship, Madness, and Pr...   
317  324   Daniel Negreanu                                              Poker   
318  325     Michael Levin  Biology, Life, Aliens, Evolution, Embryogenesi...   

  

In [8]:
def extract_sentences(text):
    """Extrae oraciones limpias del texto."""
    return [sentence.strip() for sentence in re.split(r'[.]', text) if sentence.strip()]

# Crear dataset de oraciones
sentence_records = []
for index, row in podcast_data.iterrows():
    podcast_id = row['id']
    episode_sentences = extract_sentences(row['text'])
    
    for sent_num, sentence in enumerate(episode_sentences, 1):
        sentence_records.append({
            'podcast_id': podcast_id, 
            'sentence_number': sent_num, 
            'content': sentence
        })

sentence_dataset = pd.DataFrame(sentence_records)

In [16]:
# Visualizar el DataFrame de oraciones
print("=== DATASET DE ORACIONES ===")
print(sentence_dataset)

=== DATASET DE ORACIONES ===
        podcast_id  sentence_number  \
0                1                1   
1                1                2   
2                1                3   
3                1                4   
4                1                5   
...            ...              ...   
385890         325             1703   
385891         325             1704   
385892         325             1705   
385893         325             1706   
385894         325             1707   

                                                  content  \
0       As part of MIT course 6S099, Artificial Genera...   
1                           He is a professor here at MIT   
2       He's a physicist, spent a large part of his ca...   
3       But he's also studied and delved into the bene...   
4       Amongst many other things, he is the cofounder...   
...                                                   ...   
385890                                 It's the beginning   
385891  It's n

In [9]:
# Preparar oraciones para Word2Vec
processed_sentences = [simple_preprocess(sent) for sent in sentence_dataset['content']]

# Entrenar modelo de embeddings
embedding_model = Word2Vec(
    sentences=processed_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4
)

# Generar vectores de oraciones
def generate_sentence_vector(sentence):
    tokens = simple_preprocess(sentence)
    if not tokens:
        return np.zeros(100)
    return embedding_model.wv[tokens].mean(axis=0)

sentence_dataset['vector_representation'] = sentence_dataset['content'].apply(generate_sentence_vector)

In [10]:
def semantic_search(query_text, model, data_frame):
    """Realiza búsqueda semántica en el conjunto de oraciones."""
    # Generar embedding de la consulta
    query_tokens = simple_preprocess(query_text)
    try:
        query_vector = np.mean([model.wv[token] for token in query_tokens if token in model.wv], axis=0)
    except:
        data_frame['relevance_score'] = 0.0
        return data_frame
    
    # Calcular similitudes
    similarity_scores = []
    for vec in tqdm(data_frame['vector_representation'], desc="Evaluando similitud"):
        score = cosine_similarity([query_vector], [vec])[0][0] if not np.all(vec == 0) else 0.0
        similarity_scores.append(score)
    
    data_frame['relevance_score'] = similarity_scores
    return data_frame

In [11]:
search_query = "biological neural networks"
sentence_dataset = semantic_search(search_query, embedding_model, sentence_dataset)

Evaluando similitud: 100%|██████████| 385895/385895 [03:00<00:00, 2143.54it/s]


In [12]:
# Preparar vectores para clustering
sentence_vectors = np.array(sentence_dataset['vector_representation'].tolist())

# Configurar y ejecutar clustering
topic_count = 10
topic_model = KMeans(n_clusters=topic_count, random_state=42, n_init=10, max_iter=300)
sentence_dataset['topic_id'] = topic_model.fit_predict(sentence_vectors)

# Generar representaciones de tópicos
topic_representations = []
for topic_idx in range(topic_count):
    topic_vectors = sentence_vectors[sentence_dataset['topic_id'] == topic_idx]
    topic_centroid = topic_vectors.mean(axis=0)
    topic_representations.append({
        'topic_number': topic_idx + 1,
        'topic_vector': topic_centroid
    })

In [13]:
# DataFrame de tópicos
topics_summary = pd.DataFrame(topic_representations)

# DataFrame de episodios y tópicos
episode_topics = sentence_dataset.groupby(['podcast_id', 'topic_id']).first().reset_index()
episode_topics = episode_topics[['podcast_id', 'topic_id', 'vector_representation']]

In [17]:
# Mostrar resultados del análisis
print("=== RESUMEN DEL ANÁLISIS DE PODCAST ===")
print(f"\nTotal de episodios analizados: {len(podcast_data)}")
print(f"Total de oraciones procesadas: {len(sentence_dataset)}")
print(f"\nEstadísticas por episodio:")
print(podcast_data[['word_count', 'sentence_count']].describe())

print("\n=== RESULTADOS DE BÚSQUEDA ===")
print(f"\nConsulta: '{search_query}'")
print("\nTop 5 oraciones más relevantes:")
top_results = sentence_dataset.nlargest(5, 'relevance_score')
for _, row in top_results.iterrows():
    print(f"\nPodcast ID: {row['podcast_id']}")
    print(f"Relevancia: {row['relevance_score']:.4f}")
    print(f"Contenido: {row['content']}")

print("\n=== DISTRIBUCIÓN DE TÓPICOS ===")
topic_distribution = sentence_dataset['topic_id'].value_counts().sort_index()
for topic_id, count in topic_distribution.items():
    print(f"\nTópico {topic_id + 1}: {count} oraciones")
    
# Mostrar los DataFrames finales
print("\n=== RESUMEN DE TÓPICOS ===")
print(topics_summary)

print("\n=== ASIGNACIÓN DE TÓPICOS POR EPISODIO ===")
print(episode_topics)

# Visualizar DataFrame de tópicos (equivalente a topics_df original)
print("=== DATAFRAME DE TÓPICOS ===")
print(topics_summary)

# Visualizar DataFrame final (equivalente a final_df original)
print("=== DATAFRAME FINAL ===")
print(episode_topics)

=== RESUMEN DEL ANÁLISIS DE PODCAST ===

Total de episodios analizados: 319
Total de oraciones procesadas: 385895

Estadísticas por episodio:
         word_count  sentence_count
count    319.000000      319.000000
mean   21672.557994     1209.702194
std     9961.707400      637.632089
min     4928.000000      118.000000
25%    14079.500000      719.000000
50%    20334.000000     1128.000000
75%    28266.500000     1545.000000
max    59475.000000     3748.000000

=== RESULTADOS DE BÚSQUEDA ===

Consulta: 'biological neural networks'

Top 5 oraciones más relevantes:

Podcast ID: 103
Relevancia: 0.9731
Contenido: AKA neural networks

Podcast ID: 186
Relevancia: 0.9179
Contenido: A neural surgeon

Podcast ID: 20
Relevancia: 0.8905
Contenido: Well, neural networks and graphs

Podcast ID: 49
Relevancia: 0.8653
Contenido: So it's like digital neural net will interface with biological neural net

Podcast ID: 190
Relevancia: 0.8488
Contenido: I mean, neural networks themselves are fundamentally