from sklearn.metrics import silhouette_score

# 1.5. Clustering y Calidad del Agrupamiento (Método de la Silueta)

In [None]:
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Cargar los datos
df_movies = pd.read_csv('C:/Users/Usuario/OneDrive/Documentos/Proyecto1.2 - Minería/movies.csv', encoding='ISO-8859-1')


# Seleccionar características relevantes para el agrupamiento
features = df_movies[['budget', 'revenue', 'runtime', 'popularity', 'voteAvg', 'voteCount']]

# Escalar las características
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Aplicar KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
df_movies['Cluster_KMeans'] = kmeans.fit_predict(features_scaled)

# Aplicar Agglomerative Clustering
agg_clustering = AgglomerativeClustering(n_clusters=5)
df_movies['Cluster_Agg'] = agg_clustering.fit_predict(features_scaled)

# Calcular la puntuación de la silueta para KMeans
sil_score_kmeans = silhouette_score(features_scaled, df_movies['Cluster_KMeans'])
print(f"Silhouette Score for KMeans: {sil_score_kmeans}")

# Calcular la puntuación de la silueta para Agglomerative Clustering
sil_score_agg = silhouette_score(features_scaled, df_movies['Cluster_Agg'])
print(f"Silhouette Score for Agglomerative Clustering: {sil_score_agg}")

Silhouette Score for KMeans: 0.2676199836151407
Silhouette Score for Agglomerative Clustering: 0.21840816090350562


# 1.6  Interpretación de los grupos

In [5]:
# Agrupar los datos por los clústeres generados por KMeans
grouped_kmeans = df_movies.groupby('Cluster_KMeans').agg({
    'budget': ['mean', 'median', 'std'],
    'revenue': ['mean', 'median', 'std'],
    'runtime': ['mean', 'median', 'std'],
    'popularity': ['mean', 'median', 'std'],
    'voteAvg': ['mean', 'median', 'std'],
    'voteCount': ['mean', 'median', 'std']
}).reset_index()

# Mostrar el resumen para KMeans
print(grouped_kmeans)

# Lo mismo para Agglomerative Clustering
grouped_agg = df_movies.groupby('Cluster_Agg').agg({
    'budget': ['mean', 'median', 'std'],
    'revenue': ['mean', 'median', 'std'],
    'runtime': ['mean', 'median', 'std'],
    'popularity': ['mean', 'median', 'std'],
    'voteAvg': ['mean', 'median', 'std'],
    'voteCount': ['mean', 'median', 'std']
}).reset_index()

# Mostrar el resumen para Agglomerative Clustering
print(grouped_agg)


  Cluster_KMeans        budget                                  revenue  \
                          mean       median           std          mean   
0              0  7.058566e+07   65000000.0  4.211913e+07  1.980664e+08   
1              1  6.991657e+06          0.0  1.168483e+07  1.881955e+07   
2              2  1.484579e+08  150000000.0  6.780480e+07  7.583603e+08   
3              3  7.373313e+06          0.0  1.398792e+07  1.222301e+07   
4              4  1.050000e+08   80000000.0  8.004463e+07  3.542120e+08   

                                 runtime                     popularity  \
        median           std        mean median        std         mean   
0  173785484.5  1.227669e+08  116.756078  115.0  27.875688    67.310425   
1     127345.0  3.539072e+07  106.891669  105.0  23.741573    41.372426   
2  701762825.0  3.669948e+08  128.560976  129.0  24.397941   189.087963   
3          0.0  3.023623e+07   86.107063   91.0  25.102190    36.630203   
4  203000000.0  5.308336

In [6]:
# Contar la frecuencia de los géneros por clúster para KMeans
genre_freq_kmeans = df_movies.groupby('Cluster_KMeans')['genres'].value_counts().unstack().fillna(0)

# Mostrar las frecuencias de los géneros para KMeans
print(genre_freq_kmeans)

# Contar la frecuencia de los géneros por clúster para Agglomerative Clustering
genre_freq_agg = df_movies.groupby('Cluster_Agg')['genres'].value_counts().unstack().fillna(0)

# Mostrar las frecuencias de los géneros para Agglomerative Clustering
print(genre_freq_agg)


genres          Action  Action|Adventure  Action|Adventure|Animation  \
Cluster_KMeans                                                         
0                  4.0               5.0                         0.0   
1                 42.0               6.0                         4.0   
2                  0.0               2.0                         0.0   
3                 66.0               8.0                         0.0   
4                  0.0               0.0                         0.0   

genres          Action|Adventure|Animation|Comedy|Family  \
Cluster_KMeans                                             
0                                                    2.0   
1                                                    0.0   
2                                                    0.0   
3                                                    1.0   
4                                                    0.0   

genres          Action|Adventure|Animation|Comedy|TV Movie  \
Cluster_KMea

# 2. Reglas de Asociación 
## Conjunto de datos usando el algoritmo “A priori”


In [12]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder


# Cargar los datos
df = pd.read_csv('C:/Users/Usuario/OneDrive/Documentos/Proyecto1.2 - Minería/movies.csv', encoding='ISO-8859-1')

# Discretizar las variables numéricas (ejemplo)
df['budget'] = pd.cut(df['budget'], bins=[0, 10000000, 50000000, 200000000, float('inf')], labels=['Low', 'Medium', 'High', 'Very High'])
df['revenue'] = pd.cut(df['revenue'], bins=[0, 50000000, 100000000, 500000000, float('inf')], labels=['Low', 'Medium', 'High', 'Very High'])

# Convertir en una lista de transacciones (para usar en apriori)
transactions = df[['budget', 'revenue', 'runtime']].astype(str).values.tolist()

# Aplicar A Priori
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_te = pd.DataFrame(te_ary, columns=te.columns_)

# Encontrar conjuntos frecuentes
frequent_itemsets = apriori(df_te, min_support=0.05, use_colnames=True)

# Generar reglas de asociación
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
print(rules)


  antecedents consequents  antecedent support  consequent support  support  \
0    (Medium)      (High)              0.2629              0.1728   0.0684   
1      (High)    (Medium)              0.1728              0.2629   0.0684   
2       (Low)    (Medium)              0.3637              0.2629   0.1129   
3    (Medium)       (Low)              0.2629              0.3637   0.1129   

   confidence      lift  representativity  leverage  conviction  \
0    0.260175  1.505642               1.0  0.022971    1.118102   
1    0.395833  1.505642               1.0  0.022971    1.220028   
2    0.310421  1.180756               1.0  0.017283    1.068913   
3    0.429441  1.180756               1.0  0.017283    1.115222   

   zhangs_metric   jaccard  certainty  kulczynski  
0       0.455612  0.186224   0.105627    0.328004  
1       0.405986  0.186224   0.180346    0.328004  
2       0.240586  0.219778   0.064470    0.369931  
3       0.207685  0.219778   0.103317    0.369931  


# 3. Análisis de Componentes Principales 
## 3.1 Transformación de variables categóricas para PCA


In [13]:
import pandas as pd

# Cargar los datos
df = pd.read_csv('C:/Users/Usuario/OneDrive/Documentos/Proyecto1.2 - Minería/movies.csv', encoding='ISO-8859-1')

# Seleccionar variables categóricas
categorical_cols = ['genres', 'productionCompany', 'productionCountry', 'originalLanguage']

# Contar categorías únicas por variable
unique_counts = df[categorical_cols].nunique()
print(unique_counts)


genres               2345
productionCompany    8163
productionCountry     879
originalLanguage       40
dtype: int64


## 3.2 Evaluar viabilidad del PCA (Índice KMO y prueba de Bartlett)

In [21]:
from factor_analyzer.factor_analyzer import calculate_kmo
from factor_analyzer import FactorAnalyzer
from scipy.stats import bartlett
import pandas as pd

# Cargar los datos
df = pd.read_csv('C:/Users/Usuario/OneDrive/Documentos/Proyecto1.2 - Minería/movies.csv', encoding='ISO-8859-1')

# Seleccionar variables numéricas
numerical_cols = ['budget', 'revenue', 'runtime', 'popularity', 'voteAvg', 'voteCount']
df_numeric = df[numerical_cols].dropna()

# Calcular KMO
kmo_all, kmo_model = calculate_kmo(df_numeric)
print(f"KMO: {kmo_model}")

# Prueba de esfericidad de Bartlett
chi_square_value, p_value = bartlett(*df_numeric.values.T)
print(f"Prueba de Bartlett - Chi2: {chi_square_value}, p-value: {p_value}")


KMO: 0.7183333605537804
Prueba de Bartlett - Chi2: 1099118.5768360756, p-value: 0.0
