# 2. Visualización de Datos - MovieLens

**Objetivo**: Explorar visualmente los datos procesados mediante t-SNE, UMAP y gráficos exploratorios.

**Requisito previo**: Ejecutar `1_procesamiento_datos.ipynb` primero.

---

## Contenido
1. Importación de librerías
2. Carga de datos procesados
3. Distribución de ratings y likes
4. t-SNE (diferentes perplexities), con embeddings coloreados por género
5. UMAP variando n_neighbors
6. UMAP variando min_dist
7. DBSCAN sobre UMAP
8. Resumen de visualización

## 1. Importación de Librerías

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import umap
import warnings
warnings.filterwarnings('ignore')

# Global plotting / display configuration used by every cell below.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# None removes the cap on how many DataFrame columns are displayed.
pd.set_option('display.max_columns', None)

# The status message was mis-encoded (UTF-8 shown as Mac Roman); restored.
print("✓ Librerías importadas correctamente")

## 2. Carga de Datos Procesados

In [None]:
# Load the processed tables (the notebook header asks for
# `1_procesamiento_datos.ipynb` to be run first; presumably it writes them).
data_final = pd.read_csv('data_processed/data_final.csv')
movie_features = pd.read_csv('data_processed/movie_features.csv')

print("="*60)
print("DATOS CARGADOS")
print("="*60)
print(f"data_final: {data_final.shape}")
print(f"movie_features: {movie_features.shape}")

# Movie embeddings are the columns whose name contains 'item_embed_'.
item_embed_cols = [col for col in movie_features.columns if 'item_embed_' in col]
if not item_embed_cols:
    # Fail here with a clear message instead of handing a zero-width matrix
    # to t-SNE / UMAP further down.
    raise ValueError(
        "movie_features.csv no contiene columnas 'item_embed_*'; "
        "ejecute primero 1_procesamiento_datos.ipynb"
    )
X_embeddings = movie_features[item_embed_cols].values

print(f"\nEmbeddings de películas: {X_embeddings.shape}")

## 3. Distribución de Ratings

In [None]:
# Two side-by-side panels: raw rating histogram and like/dislike balance.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: how often each rating value occurs.
sns.countplot(data=data_final, x='rating', ax=axes[0], palette='viridis')
axes[0].set_title('Distribución de Ratings', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Frecuencia')

# Right panel: like / dislike counts.
# value_counts() orders by frequency, so the positions of its result do not
# reliably correspond to (dislike, like). Pin the order to [0, 1] so that the
# fixed labels and colours below always match their bar, and so that a class
# with no rows still gets a zero-height bar.
# Assumes `like` is a 0/1 (or boolean) flag -- TODO confirm against notebook 1.
like_counts = (
    data_final['like'].dropna().astype(int).value_counts()
    .reindex([0, 1], fill_value=0)
)
axes[1].bar(['Dislike (<4)', 'Like (≥4)'], like_counts.values, color=['#e74c3c', '#2ecc71'])
axes[1].set_title('Distribución Like/Dislike', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Frecuencia')

# Place each annotation just above its bar using an offset relative to the
# data scale (a fixed +1000 is wrong for both small and very large datasets).
label_offset = 0.01 * like_counts.max()
for i, v in enumerate(like_counts.values):
    axes[1].text(i, v + label_offset, f'{v:,}\n({v/len(data_final)*100:.1f}%)',
                 ha='center', va='bottom', fontweight='bold')
# Leave headroom so the two-line annotations are not clipped by the axes.
axes[1].set_ylim(0, max(like_counts.max(), 1) * 1.2)

plt.tight_layout()
plt.show()

## 4. t-SNE: Variando Perplexity

In [None]:
# Preparar datos para colorear por g√©nero
genre_columns = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy',
                 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                 'Thriller', 'War', 'Western']

viz_data = movie_features.copy()
viz_data['genre_dominant'] = viz_data[genre_columns].idxmax(axis=1)
top_genres = viz_data['genre_dominant'].value_counts().head(5).index

print(f"Top 5 g√©neros: {list(top_genres)}")

In [None]:
# One panel per perplexity value; each panel is an independent t-SNE fit of
# the movie embeddings, coloured by dominant genre (top genres only).
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for idx, perplexity in enumerate([5, 30, 50]):
    print(f"Calculando t-SNE con perplexity={perplexity}...")
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42, max_iter=1000, verbose=0)
    X_tsne = tsne.fit_transform(X_embeddings)

    for genre in top_genres:
        # Positional boolean mask: rows of X_tsne follow the row order of
        # viz_data, which is a copy of the table the embeddings came from.
        mask = (viz_data['genre_dominant'] == genre).to_numpy()
        axes[idx].scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                         label=genre, alpha=0.6, s=30, edgecolors='none')

    # Axis labels and titles were mis-encoded (mojibake); restored here.
    axes[idx].set_title(f't-SNE (perplexity={perplexity})', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('t-SNE dimensión 1')
    axes[idx].set_ylabel('t-SNE dimensión 2')
    axes[idx].legend(fontsize=9, loc='best')
    axes[idx].grid(True, alpha=0.3)

plt.suptitle('t-SNE: Efecto de Perplexity en Embeddings de Películas',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 5. UMAP: Variando n_neighbors

In [None]:
# One panel per n_neighbors value (min_dist held at 0.1); each panel is an
# independent UMAP fit of the movie embeddings, coloured by dominant genre.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for idx, n_neighbors in enumerate([5, 15, 50]):
    print(f"Calculando UMAP con n_neighbors={n_neighbors}...")
    reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors,
                       min_dist=0.1, random_state=42)
    X_umap = reducer.fit_transform(X_embeddings)

    for genre in top_genres:
        # Positional boolean mask: rows of X_umap follow the row order of
        # viz_data, which is a copy of the table the embeddings came from.
        mask = (viz_data['genre_dominant'] == genre).to_numpy()
        axes[idx].scatter(X_umap[mask, 0], X_umap[mask, 1],
                         label=genre, alpha=0.6, s=30, edgecolors='none')

    # Axis labels and titles were mis-encoded (mojibake); restored here.
    axes[idx].set_title(f'UMAP (n_neighbors={n_neighbors})', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('UMAP dimensión 1')
    axes[idx].set_ylabel('UMAP dimensión 2')
    axes[idx].legend(fontsize=9, loc='best')
    axes[idx].grid(True, alpha=0.3)

plt.suptitle('UMAP: Efecto de n_neighbors en Embeddings de Películas',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 6. UMAP: Variando min_dist

In [None]:
# One panel per min_dist value (n_neighbors held at 15); each panel is an
# independent UMAP fit of the movie embeddings, coloured by dominant genre.
# NOTE: after the loop, `X_umap` holds the min_dist=0.5 projection, which the
# DBSCAN cell further down reads.
fig, axes = plt.subplots(1, 3, figsize=(20, 6))

for idx, min_dist in enumerate([0.0, 0.1, 0.5]):
    print(f"Calculando UMAP con min_dist={min_dist}...")
    reducer = umap.UMAP(n_components=2, n_neighbors=15,
                       min_dist=min_dist, random_state=42)
    X_umap = reducer.fit_transform(X_embeddings)

    for genre in top_genres:
        # Positional boolean mask: rows of X_umap follow the row order of
        # viz_data, which is a copy of the table the embeddings came from.
        mask = (viz_data['genre_dominant'] == genre).to_numpy()
        axes[idx].scatter(X_umap[mask, 0], X_umap[mask, 1],
                         label=genre, alpha=0.6, s=30, edgecolors='none')

    # Axis labels and titles were mis-encoded (mojibake); restored here.
    axes[idx].set_title(f'UMAP (min_dist={min_dist})', fontsize=12, fontweight='bold')
    axes[idx].set_xlabel('UMAP dimensión 1')
    axes[idx].set_ylabel('UMAP dimensión 2')
    axes[idx].legend(fontsize=9, loc='best')
    axes[idx].grid(True, alpha=0.3)

plt.suptitle('UMAP: Efecto de min_dist en Embeddings de Películas',
             fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

## 7. DBSCAN sobre UMAP

In [None]:
# Cluster a UMAP projection of the movie embeddings with DBSCAN.
#
# The projection is computed here, with the same settings as the final run of
# the min_dist comparison (n_neighbors=15, min_dist=0.5, random_state=42),
# instead of reusing whatever `X_umap` the most recently executed cell left
# behind: re-running cells out of order would otherwise silently change the
# data being clustered.
# NOTE(review): UMAP's clustering guide suggests a min_dist close to 0.0 when
# the projection feeds a density-based clusterer -- consider it if these
# clusters look washed out.
print("Aplicando DBSCAN...")
dbscan_reducer = umap.UMAP(n_components=2, n_neighbors=15,
                           min_dist=0.5, random_state=42)
X_umap_dbscan = dbscan_reducer.fit_transform(X_embeddings)
clusters = DBSCAN(eps=0.5).fit_predict(X_umap_dbscan)

# Keep the 2-D coordinates and the cluster label together for plotting.
umap_df = pd.DataFrame(X_umap_dbscan, columns=['UMAP_1', 'UMAP_2'])
umap_df['cluster'] = clusters

# DBSCAN marks noise points with the label -1, which is not a cluster.
n_clusters = len(set(clusters) - {-1})
n_noise = int((clusters == -1).sum())
print(f"\nClusters encontrados: {n_clusters}")
print(f"Puntos de ruido: {n_noise}")

In [None]:
# Scatter plot of the UMAP coordinates coloured by DBSCAN label.
# The label -1 (DBSCAN noise) is drawn like any other hue level.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=umap_df,
    x='UMAP_1', y='UMAP_2',
    hue='cluster',
    palette='tab10',
    s=60,
    alpha=0.8,
    legend='full'
)
plt.title('Clusters detectados por DBSCAN en espacio UMAP', fontsize=14)
# Axis labels were mis-encoded (mojibake); restored here.
plt.xlabel('UMAP dimensión 1')
plt.ylabel('UMAP dimensión 2')
# Anchor the legend outside the axes so it does not cover any points.
plt.legend(title='Cluster', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.show()

## 8. Resumen de Visualización

In [None]:
# Static recap of what the notebook ran. The "observations" are the author's
# fixed text, not values computed from the results above.
# The check marks, light-bulb emoji and accented letters were mis-encoded
# (mojibake); restored here.
print("="*60)
print("RESUMEN DE VISUALIZACIÓN")
print("="*60)
print("\n✓ t-SNE: Explorado con perplexities 5, 30, 50")
print("✓ UMAP: Explorado con n_neighbors 5, 15, 50")
print("✓ UMAP: Explorado con min_dist 0.0, 0.1, 0.5")
print("✓ DBSCAN: Aplicado sobre proyección UMAP")
print("\n💡 Observaciones:")
print("  - Las películas se agrupan naturalmente por género")
print("  - Los embeddings SVD capturan bien la estructura")
print("  - UMAP preserva mejor la estructura local que t-SNE")