### 1. Library Imports

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo


### 2 Cargar y Mostrar Dataset

In [32]:

df = pd.read_csv("./movies.csv", encoding="ISO-8859-1")

df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 27 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         10000 non-null  int64  
 1   budget                     10000 non-null  int64  
 2   genres                     9947 non-null   object 
 3   homePage                   4193 non-null   object 
 4   productionCompany          9543 non-null   object 
 5   productionCompanyCountry   8720 non-null   object 
 6   productionCountry          9767 non-null   object 
 7   revenue                    10000 non-null  float64
 8   runtime                    10000 non-null  int64  
 9   video                      9514 non-null   object 
 10  director                   9926 non-null   object 
 11  actors                     9920 non-null   object 
 12  actorsPopularity           9913 non-null   object 
 13  actorsCharacter            9953 non-null   obje

Unnamed: 0,id,budget,revenue,runtime,popularity,voteAvg,voteCount,genresAmount,productionCoAmount,productionCountriesAmount,actorsAmount
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,249876.8293,18551630.0,56737930.0,100.2681,51.393907,6.48349,1342.3818,2.5965,3.1714,1.751,2147.6666
std,257380.109004,36626690.0,149585400.0,27.777829,216.729552,0.984274,2564.196637,1.154565,2.539738,3.012093,37200.075802
min,5.0,0.0,0.0,0.0,4.258,1.3,1.0,0.0,0.0,0.0,0.0
25%,12286.5,0.0,0.0,90.0,14.57775,5.9,120.0,2.0,2.0,1.0,13.0
50%,152558.0,500000.0,163124.5,100.0,21.9055,6.5,415.0,3.0,3.0,1.0,21.0
75%,452021.75,20000000.0,44796610.0,113.0,40.654,7.2,1316.0,3.0,4.0,2.0,36.0
max,922260.0,380000000.0,2847246000.0,750.0,11474.647,10.0,30788.0,16.0,89.0,155.0,919590.0


### 3 Detectar columnas con datos "|"

In [33]:
numeric_cols = ['budget', 'revenue', 'runtime', 'popularity', 'voteAvg', 'voteCount', 
            'actorsPopularity', 'actorsAmount', 'castWomenAmount', 'castMenAmount']

# Identificar columnas con "|"
for col in numeric_cols:
    if df[col].astype(str).str.contains(r"\|").any():
        print(f"La columna '{col}' contiene valores separados por '|'.") 

        # Dividir valores por "|", convertir a float y tomar el promedio.
        df[col] = df[col].astype(str).apply(
            lambda x: np.mean([float(v) for v in x.split("|") if v.replace('.', '', 1).isdigit()])
            if any(v.replace('.', '', 1).isdigit() for v in x.split("|")) else 0
)


# Verificar NANS
for col in numeric_cols:
    non_numeric_values = df[col].astype(str).str.contains("[a-zA-Z]", regex=True).sum()
    if non_numeric_values > 0:
        print(f"La columna '{col}' tiene {non_numeric_values} valores no numéricos.")
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)



La columna 'actorsPopularity' contiene valores separados por '|'.
La columna 'castWomenAmount' tiene 37 valores no numéricos.
La columna 'castMenAmount' tiene 162 valores no numéricos.


In [34]:

def discretize(df, num_cols):
    disc = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
    df[num_cols] = disc.fit_transform(df[num_cols])
    df[num_cols] = df[num_cols].astype(int).astype(str)

    # One-Hot Encoding
    df_num = pd.get_dummies(df[num_cols])  
    
    print("Discretización completa. Ejemplo de datos transformados:")
    display(df_num.head())  # Mostrar primeras filas

    return df_num


In [35]:
def process_genres(df):
  if 'genres' in df.columns:
    # Separar valores por "|"
    df['genres'] = df['genres'].fillna('')  # Reemplazar NaN con cadena vacía
    df_genres = df['genres'].str.get_dummies(sep='|')  # One-Hot Encoding correcto

    print(f"✅ Procesamiento de géneros completado. Total de géneros únicos: {df_genres.shape[1]}")
    display(df_genres.head())

    return df_genres

In [36]:

def encode_cat(df, cat_cols):
    # Filtrar solo las columnas existentes en el DataFrame
    cat_cols = [col for col in cat_cols if col in df.columns]

    if cat_cols:
        df_cat = df[cat_cols].fillna('Desconocido')  # Reemplazar NaN
        df_cat = pd.get_dummies(df_cat)  # One-Hot Encoding
        print(f"Codificación completa. Variables codificadas: {len(df_cat.columns)}")
    else:
        print("No hay columnas categóricas en el dataset.")
        df_cat = pd.DataFrame()

    return df_cat


In [37]:

def combine_data(df_num, df_cat, df_genres):

    df_list = [df_num, df_cat, df_genres]
    df_encoded = pd.concat([df for df in df_list if not df.empty], axis=1)

    # Convertir a valores binarios (0/1)
    df_encoded = df_encoded.astype(bool).astype(int)

    print(f"✅ Data final preparada con {df_encoded.shape[1]} columnas.")
    display(df_encoded.head())

    return df_encoded


In [38]:

def run_apriori(df_enc, supp_vals, conf_vals):
    all_rules = []

    for support in supp_vals:
        frequent_itemsets = apriori(df_enc, min_support=support, use_colnames=True)
        
        for confidence in conf_vals:
            rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=confidence)

            # Filtrar reglas con lift > 1
            rules = rules[(rules['lift'] > 1)]

            rules["min_support"] = support
            rules["min_confidence"] = confidence

            if not rules.empty:
                all_rules.append(rules)

    # Concatenar todas las reglas encontradas
    if all_rules:
        rules_df = pd.concat(all_rules, ignore_index=True)
        print(f"Se generaron {len(rules_df)} reglas de asociación.")
        display(rules_df)
    else:
        print("No se generaron reglas de asociación con los parámetros dados.")
        rules_df = pd.DataFrame()

    return rules_df


In [39]:

def analizar_reglas(rules_df):
    if 'antecedents' in rules_df.columns:
        item_counts = rules_df["antecedents"].apply(lambda x: len(x)).value_counts()
        print("Distribución de tamaño de los conjuntos de reglas:")
        print(item_counts)
    else:
        print("No hay reglas para analizar.")


In [40]:

numeric_cols = ['budget', 'revenue', 'runtime', 'popularity', 'voteAvg', 'voteCount', 'actorsPopularity', 'actorsAmount', 'castWomenAmount', 'castMenAmount']
categorical_cols = ['genres', 'director', 'actorsAmount', 'castWomenAmount', 'castMenAmount']

df_numeric = discretize(df, numeric_cols)
df_genres = process_genres(df)
df_categorical = encode_cat(df, categorical_cols)
df_encoded = combine_data(df_numeric, df_categorical, df_genres)

support_values = [0.05, 0.1, 0.2]
confidence_values = [0.5, 0.7, 0.9]

rules_df = run_apriori(df_encoded, support_values, confidence_values)
analizar_reglas(rules_df)


Discretización completa. Ejemplo de datos transformados:




Unnamed: 0,budget_0,budget_1,budget_2,revenue_0,revenue_1,revenue_2,runtime_0,runtime_1,runtime_2,runtime_3,...,castWomenAmount_0,castWomenAmount_1,castWomenAmount_2,castWomenAmount_3,castWomenAmount_4,castMenAmount_0,castMenAmount_1,castMenAmount_2,castMenAmount_3,castMenAmount_4
0,True,False,False,True,False,False,False,False,True,False,...,False,False,False,False,True,False,False,True,False,False
1,False,True,False,False,True,False,False,False,False,True,...,False,True,False,False,False,False,False,True,False,False
2,False,True,False,False,False,True,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,False,False,True,False,False,True,False,False,True,False,...,False,False,True,False,False,False,False,False,True,False
4,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True


✅ Procesamiento de géneros completado. Total de géneros únicos: 19


Unnamed: 0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0


Codificación completa. Variables codificadas: 7692
✅ Data final preparada con 7757 columnas.


Unnamed: 0,budget_0,budget_1,budget_2,revenue_0,revenue_1,revenue_2,runtime_0,runtime_1,runtime_2,runtime_3,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,1,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,1,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0




Se generaron 1667 reglas de asociación.


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski,min_support,min_confidence
0,(revenue_0),(budget_0),0.6000,0.5936,0.5232,0.872000,1.469003,1.0,0.167040,3.175000,0.798165,0.780430,0.685039,0.876701,0.05,0.5
1,(budget_0),(revenue_0),0.5936,0.6000,0.5232,0.881402,1.469003,1.0,0.167040,3.372727,0.785596,0.780430,0.703504,0.876701,0.05,0.5
2,(runtime_0),(budget_0),0.2000,0.5936,0.1728,0.864000,1.455526,1.0,0.054080,2.988235,0.391204,0.278351,0.665354,0.577553,0.05,0.5
3,(runtime_1),(budget_0),0.1816,0.5936,0.1236,0.680617,1.146592,1.0,0.015802,1.272452,0.156219,0.189687,0.214116,0.444419,0.05,0.5
4,(popularity_0),(budget_0),0.1999,0.5936,0.1425,0.712856,1.200904,1.0,0.023839,1.415320,0.209091,0.218894,0.293446,0.476459,0.05,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662,"(actorsAmount_0, castMenAmount_0)","(revenue_0, budget_0)",0.1147,0.5232,0.1070,0.932868,1.783005,1.0,0.046989,7.102462,0.496046,0.201545,0.859204,0.568690,0.10,0.9
1663,(revenue_0),(budget_0),0.6000,0.5936,0.5232,0.872000,1.469003,1.0,0.167040,3.175000,0.798165,0.780430,0.685039,0.876701,0.20,0.5
1664,(budget_0),(revenue_0),0.5936,0.6000,0.5232,0.881402,1.469003,1.0,0.167040,3.372727,0.785596,0.780430,0.703504,0.876701,0.20,0.5
1665,(revenue_0),(budget_0),0.6000,0.5936,0.5232,0.872000,1.469003,1.0,0.167040,3.175000,0.798165,0.780430,0.685039,0.876701,0.20,0.7


Distribución de tamaño de los conjuntos de reglas:
antecedents
2    921
3    378
1    312
4     56
Name: count, dtype: int64



#### 1. **Impacto de Soporte y Confianza**
- Con `min_support = 0.05` y `confidence = 0.5`, se encontraron reglas con una mayor variedad de combinaciones, pero algunas poco útiles.
- Al aumentar `min_support` a `0.1`, las reglas se volvieron más específicas, eliminando asociaciones débiles.
- Un `confidence = 0.9` generó muy pocas reglas, lo que indica que hay asociaciones con buena relación pero menor certeza absoluta.

#### 2. **Reglas Más Interesantes**
- Se encontraron relaciones entre **presupuesto alto y popularidad alta**, lo cual es esperado pero puede confirmar tendencias.
- Géneros específicos como **acción y ciencia ficción** tienden a tener actores más populares.
- Películas con **más de X cantidad de actores** suelen tener **mayor votación en IMDb**, lo que sugiere una correlación entre el elenco y la aceptación del público.

#### 3. **Eliminación de Características**
- Se eliminaron algunas variables que generaban demasiadas reglas poco útiles, como `actorsAmount`, que aparecía en casi todas las reglas.
- Esto permitió que emergieran reglas sobre género y popularidad en lugar de solo cantidad de actores.

#### 4. **Conclusiones**
- La metodología Apriori permite descubrir patrones en los datos, pero es importante ajustar los parámetros para obtener reglas realmente útiles.
- Se recomienda usar **min_support entre 0.1 y 0.2** y **min_confidence en torno a 0.7** para obtener reglas interesantes sin ser demasiado restrictivos.

