In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import scipy.cluster.hierarchy as sch

%load_ext kedro.ipython


The kedro.ipython extension is already loaded. To reload it, use:
  %reload_ext kedro.ipython


In [2]:
# Fetch the raw movie metadata table from the data catalog.
# `catalog` is not defined in this notebook; presumably it is injected by the
# kedro.ipython extension loaded in the first cell -- confirm.
movies_metadata = catalog.load("movies_metadata")

# Plain-text preview of the first five rows (five is the default of `head`).
print(movies_metadata.head(n=5))

   adult                              belongs_to_collection    budget  \
0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1  False                                                NaN  65000000   
2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3  False                                                NaN  16000000   
4  False  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...   
4                     [{'id': 35, 'name': 'Comedy'}]   

                               homepage     id    imdb_id original_language  \
0  http://toystory.disney.com/toy-story    862  tt0114709                en   
1                                   NaN   8844  tt0113497         

In [3]:
# ============================================
# 1. Selección y preparación de variables
# ============================================

# Columns that feed the scaling / PCA / clustering steps below.
numeric_features = ["runtime", "popularity", "vote_count", "budget", "revenue"]

# Coerce every feature column to a numeric dtype; values that cannot be parsed
# become NaN. This writes back into `movies_metadata` itself.
movies_metadata[numeric_features] = movies_metadata[numeric_features].apply(
    lambda column: pd.to_numeric(column, errors="coerce")
)

# Keep only the rows where all features are present. The copy lets later cells
# add columns to `movies_cluster` without touching `movies_metadata`.
# NOTE(review): rows with budget == 0 are kept (see the preview output above);
# presumably 0 is a "missing" placeholder in this dataset -- confirm.
movies_cluster = movies_metadata.dropna(subset=numeric_features).copy()

print("Filas restantes después de limpiar:", movies_cluster.shape[0])

# Standardize each feature to zero mean and unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(movies_cluster[numeric_features])

# Show the first five scaled rows as a sanity check.
print("\nEjemplo de datos escalados:", X_scaled[:5], sep="\n")


Filas restantes después de limpiar: 45203

Ejemplo de datos escalados:
[[-0.34181445  3.15841932 10.767643    1.47395361  5.61601126]
 [ 0.2570287   2.33909542  4.67385395  3.47726297  3.8990808 ]
 [ 0.17891873  1.45809299 -0.03756656 -0.2431687  -0.17475185]
 [ 0.85587186  0.15329573 -0.15530132  0.67262986  1.08790388]
 [ 0.30910202  0.90560041  0.12685613 -0.2431687   1.01235977]]


In [4]:
# ============================================
# 2. Reducción de dimensionalidad (PCA)
# ============================================

# Project the standardized features onto their first two principal components.
# random_state is pinned so the projection is reproducible: depending on the
# installed scikit-learn version, the default solver chosen for a tall, narrow
# matrix like this one can be the randomized SVD, whose output otherwise varies
# slightly from run to run (and every clustering below is fitted on X_pca).
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Keep the 2-D coordinates next to the movie rows for later inspection/plots.
movies_cluster["PCA1"] = X_pca[:, 0]
movies_cluster["PCA2"] = X_pca[:, 1]

# Share of the total variance captured by each retained component.
print("\nVarianza explicada por PCA:")
print(pca.explained_variance_ratio_)

print("\nPrimeras filas con PCA:")
print(movies_cluster[["PCA1", "PCA2"]].head())



Varianza explicada por PCA:
[0.58749338 0.19490453]

Primeras filas con PCA:
        PCA1      PCA2
0  10.670793 -1.900343
1   7.284357 -0.710495
2   0.393224  0.251494
3   1.004270  0.708856
4   0.899942  0.221297


In [None]:
# ============================================
# 4. Clustering (3 algoritmos obligatorios)
# ============================================

from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering

# ============================================
# 4a. DBSCAN
# ============================================

def apply_dbscan(X_pca, eps=0.5, min_samples=10):
    """Run DBSCAN on the given points and summarise the result.

    Parameters
    ----------
    X_pca : array-like
        Points to cluster; passed unchanged to ``DBSCAN.fit_predict``.
    eps : float, default 0.5
        Forwarded to ``DBSCAN`` as ``eps``.
    min_samples : int, default 10
        Forwarded to ``DBSCAN`` as ``min_samples``.

    Returns
    -------
    tuple
        ``(labels, n_clusters, n_noise)``: the label array returned by
        ``fit_predict``, the number of distinct labels other than -1, and the
        number of points labelled -1 (noise).
    """
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X_pca)

    # Points labelled -1 are noise, not members of a cluster.
    n_noise = int((labels == -1).sum())

    # Distinct labels, minus one when the noise label is among them.
    distinct_labels = set(labels.tolist())
    n_clusters = len(distinct_labels) - int(-1 in distinct_labels)

    return labels, n_clusters, n_noise

# Cluster the PCA coordinates with the default DBSCAN settings of apply_dbscan.
labels_dbscan, n_clusters_dbscan, n_noise_dbscan = apply_dbscan(X_pca)

# Store the label of every movie next to its row.
movies_cluster["DBSCAN"] = labels_dbscan

# Three-line report: banner, cluster count, noise count.
print(
    "=== DBSCAN ===",
    f"Clusters encontrados: {n_clusters_dbscan}",
    f"Puntos ruido: {n_noise_dbscan}",
    sep="\n",
)


# ============================================
# 4b. KMeans
# ============================================

def apply_kmeans(X_pca, n_clusters=5, random_state=20):
    """Cluster the given points with KMeans and return the labels.

    Parameters
    ----------
    X_pca : array-like
        Points to cluster; passed unchanged to ``KMeans.fit_predict``.
    n_clusters : int, default 5
        Number of clusters to fit.
    random_state : int or None, default 20
        Seed forwarded to ``KMeans``. The default is the value that used to be
        hard-coded here, so existing calls behave as before.

    Returns
    -------
    array
        The cluster label of each input point, as returned by ``fit_predict``.
    """
    # NOTE(review): n_init is left at the installed scikit-learn default, which
    # has changed between releases; pin it if results must match across
    # environments.
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    return model.fit_predict(X_pca)

# Fit KMeans with five clusters on the PCA coordinates and keep the labels
# alongside the movie rows.
labels_kmeans = apply_kmeans(X_pca, n_clusters=5)
movies_cluster["KMeans"] = labels_kmeans

# Report how many distinct labels were actually assigned.
print("\n=== KMeans ===")
print("Clusters encontrados:", len(set(labels_kmeans)))


# ============================================
# 4c. Jerárquico (Agglomerative)
# ============================================

def apply_hierarchical(X_pca, n_clusters=5, linkage="ward", n_neighbors=None):
    """Cluster the given points with agglomerative (hierarchical) clustering.

    Parameters
    ----------
    X_pca : array-like
        Points to cluster; passed unchanged to ``fit_predict``.
    n_clusters : int, default 5
        Number of clusters to produce.
    linkage : str, default "ward"
        Linkage criterion forwarded to ``AgglomerativeClustering``.
    n_neighbors : int or None, default None
        When given, merges are restricted to a k-nearest-neighbours
        connectivity graph built from ``X_pca``. This keeps the fit tractable
        on large inputs but can yield different clusters than the
        unconstrained algorithm. ``None`` keeps the unconstrained behaviour.

    Returns
    -------
    array
        The cluster label of each input point, as returned by ``fit_predict``.
    """
    connectivity = None
    if n_neighbors is not None:
        # Imported here because only the constrained variant needs it.
        from sklearn.neighbors import kneighbors_graph

        connectivity = kneighbors_graph(
            X_pca, n_neighbors=n_neighbors, include_self=False
        )

    model = AgglomerativeClustering(
        n_clusters=n_clusters, linkage=linkage, connectivity=connectivity
    )
    return model.fit_predict(X_pca)


# Unconstrained Ward linkage needs memory that grows quadratically with the
# number of rows; with the ~45k rows kept after cleaning that is several GB.
# NOTE(review): the recorded output of this cell stops before the banner
# below, which suggests the unconstrained fit did not complete -- confirm.
# A 10-nearest-neighbour connectivity constraint keeps the fit tractable.
labels_hierarchical = apply_hierarchical(X_pca, n_clusters=5, n_neighbors=10)

print("\n=== Clustering Jerárquico ===")
print("Clusters encontrados:", len(set(labels_hierarchical)))

# Store the label of every movie next to its row.
movies_cluster["Hierarchical"] = labels_hierarchical


# ============================================
# Final message
# ============================================

# NOTE(review): this line is printed unconditionally once the statements above
# have run without raising; it does not itself check the labels or the dataframe.
print("\n✔ Operación finalizada: Los tres algoritmos de clustering se ejecutaron correctamente y las etiquetas fueron añadidas al dataframe.")


=== DBSCAN ===
Clusters encontrados: 3
Puntos ruido: 217

=== KMeans ===
Clusters encontrados: 5


In [None]:
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

def compute_metrics(X, labels, name="", noise_label=-1):
    """Compute internal clustering-quality metrics for one labelling.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Coordinates the labels refer to.
    labels : array-like of shape (n_samples,)
        Cluster label of every row of ``X``.
    name : str, default ""
        Model name stored under the ``"Modelo"`` key of the result.
    noise_label : int or None, default -1
        Label that marks unclustered points (DBSCAN uses -1). Rows carrying it
        are dropped before scoring, so noise is not treated as one more
        cluster. Pass ``None`` to score every row as-is. Note that excluding
        noise tends to favour the model that produced it when models are
        compared.

    Returns
    -------
    dict
        Keys ``"Modelo"``, ``"Silhouette"``, ``"Davies-Bouldin"`` and
        ``"Calinski-Harabasz"``. The three metric values are ``None`` when the
        metrics are undefined for the labelling (fewer than two clusters, or
        as many clusters as scored samples).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Leave noise points out of the evaluation.
    if noise_label is not None:
        keep = labels != noise_label
        X, labels = X[keep], labels[keep]

    n_samples = len(labels)
    n_clusters = len(np.unique(labels))

    # The scikit-learn metrics need 2 <= n_clusters <= n_samples - 1; outside
    # that range they raise, so report "undefined" instead.
    if n_clusters < 2 or n_clusters >= n_samples:
        return {
            "Modelo": name,
            "Silhouette": None,
            "Davies-Bouldin": None,
            "Calinski-Harabasz": None,
        }

    return {
        "Modelo": name,
        "Silhouette": silhouette_score(X, labels),
        "Davies-Bouldin": davies_bouldin_score(X, labels),
        "Calinski-Harabasz": calinski_harabasz_score(X, labels),
    }

import pandas as pd

# Score every labelling on the same 2-D PCA coordinates, one result dict per
# model, in the order KMeans, DBSCAN, Hierarchical.
metrics = [
    compute_metrics(X_pca, model_labels, model_name)
    for model_name, model_labels in (
        ("KMeans", labels_kmeans),
        ("DBSCAN", labels_dbscan),
        ("Hierarchical", labels_hierarchical),
    )
]

# One row per model; the bare last expression makes the notebook render it.
metrics_df = pd.DataFrame(metrics)
metrics_df


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Candidate cluster counts for the elbow curve: k = 2 .. 10.
k_values = range(2, 11)

# Fit one KMeans per candidate k and record its inertia.
inertia_list = []
for k in k_values:
    # fit() returns the fitted estimator, so construction and fitting chain.
    model = KMeans(n_clusters=k, random_state=42).fit(X_pca)
    inertia_list.append(model.inertia_)

# Inertia against k; the "elbow" of this curve suggests a reasonable k.
plt.figure(figsize=(8, 5))
plt.plot(k_values, inertia_list, marker="o")
plt.title("Método del Codo (Elbow Method)")
plt.xlabel("Número de Clusters (k)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()


In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# The dendrogram is drawn on a subsample to keep the linkage computation and
# the figure manageable. The subsample is drawn at random with a fixed seed
# rather than taken from the head of the table: the first rows are not
# representative (the PCA preview above shows them far from the origin), so
# slicing `[:500]` would bias the tree.
DENDROGRAM_SAMPLE_SIZE = 500
dendrogram_rng = np.random.default_rng(42)
dendrogram_idx = dendrogram_rng.choice(
    len(X_pca),
    size=min(DENDROGRAM_SAMPLE_SIZE, len(X_pca)),  # never ask for more rows than exist
    replace=False,
)

plt.figure(figsize=(10, 6))
linkage_matrix = linkage(X_pca[dendrogram_idx], method='ward')

dendrogram(linkage_matrix)
plt.title("Dendrograma del Clustering Jerárquico")
plt.xlabel("Muestras")
plt.ylabel("Distancia")
plt.show()
