# Algoritmos de Aprendizaje no Supervisado

## I.	Clustering

## II.	Reglas de Asociación

In [33]:
import pandas as pd
from apyori import apriori
from sklearn.compose import make_column_selector
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

# Cargar los datos
movies = pd.read_csv('movies.csv', encoding='cp1252')

In [34]:
movies = pd.read_csv('movies.csv', encoding='cp1252')
print("Dataset shape:", movies.shape)

# Select fewer categorical variables to start with
categorical_columns = ['genres', 'productionCountry']  # Reduced number of columns
categoricas = movies[categorical_columns]
categoricas = categoricas.astype(object)


Dataset shape: (10000, 27)


In [35]:
# Clean and preprocess categorical data
categoricas = categoricas.fillna('Unknown')

# For genres, we'll take only the first genre if there are multiple
categoricas['genres'] = categoricas['genres'].apply(lambda x: str(x).split(',')[0] if pd.notna(x) else 'Unknown')
print("\nUnique genres:", categoricas['genres'].unique())

# For production country, take the first country
categoricas['productionCountry'] = categoricas['productionCountry'].apply(lambda x: str(x).split(',')[0] if pd.notna(x) else 'Unknown')
print("\nUnique countries:", categoricas['productionCountry'].unique())

print("\nCategorical data shape:", categoricas.shape)



Unique genres: ['Crime|Comedy' 'Action|Thriller|Crime' 'Adventure|Action|Science Fiction'
 ... 'Animation|Comedy|TV Movie' 'Drama|History|Thriller|Crime'
 'Documentary|Animation|History']

Unique countries: ['United States of America' 'Japan|United States of America'
 'Argentina|Denmark|Finland|France|Germany|Iceland|Italy|Netherlands|Norway|Sweden|United Kingdom|United States of America'
 'France' 'Germany' 'Germany|United States of America' 'United Kingdom'
 'Mexico' 'United States of America|Canada|Germany'
 'United Kingdom|United States of America' 'France|United Kingdom'
 'Austria|Switzerland|United States of America'
 'United States of America|Hong Kong|United Kingdom' 'China' 'Japan'
 'Uruguay|United States of America|Germany|Paraguay'
 'United States of America|France' 'France|Poland|Switzerland'
 'France|Poland' 'South Korea|Germany'
 'France|Ireland|Luxembourg|United Kingdom|United States of America'
 'Australia|United Kingdom|United States of America'
 'New Zealand|United S

In [36]:

# Convert the categorical data to a list of lists for apriori
records = []
for i in range(len(categoricas)):
    records.append([str(categoricas.values[i,j]) for j in range(len(categorical_columns))])

# Add some numerical variables
numerical_columns = ['voteAvg']  # Reduced to just one numerical variable for simplicity
numericas = movies[numerical_columns]

# Fill missing values with median
for col in numerical_columns:
    numericas[col] = numericas[col].fillna(numericas[col].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numericas[col] = numericas[col].fillna(numericas[col].median())


In [37]:

# Create bins for vote average
numericas['voteAvg_category'] = pd.qcut(numericas['voteAvg'], 
                                       q=3, 
                                       labels=['Low_Rating', 'Medium_Rating', 'High_Rating'])

# Add the vote category to our records
for i in range(len(records)):
    records[i].append(str(numericas['voteAvg_category'].iloc[i]))

print("\nSample of processed records:")
for i in range(5):
    print(records[i])

# Generate association rules with very low support to start
reglas_asociacion = apriori(records, 
                           min_support=0.01,  # Lowered to 1%
                           min_confidence=0.5)  # Lowered to 50%
reglas = list(reglas_asociacion)



Sample of processed records:
['Crime|Comedy', 'United States of America', 'Low_Rating']
['Action|Thriller|Crime', 'Japan|United States of America', 'Medium_Rating']
['Adventure|Action|Science Fiction', 'United States of America', 'High_Rating']
['Animation|Family', 'United States of America', 'High_Rating']
['Comedy|Drama|Romance', 'United States of America', 'High_Rating']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  numericas['voteAvg_category'] = pd.qcut(numericas['voteAvg'],


In [None]:
print("\nNumber of rules found:", len(reglas))

if len(reglas) > 0:
    print("\nExample of first rule structure:")
    print(reglas[0])

    def inspect(output):
        results = []
        for result in output:
            if len(result[2]) > 0:  # Check if there are any rules
                for ordered_stat in result[2]:
                    if len(ordered_stat[0]) > 0 and len(ordered_stat[1]) > 0:
                        lhs = [item for item in ordered_stat[0]]
                        rhs = [item for item in ordered_stat[1]]
                        support = result[1]
                        confidence = ordered_stat[2]
                        lift = ordered_stat[3]
                        results.append((lhs, rhs, support, confidence, lift))
        return results

    # Convert rules to DataFrame
    rules_list = inspect(reglas)
    output_DataFrame = pd.DataFrame(rules_list, 
                                  columns=['Left_Hand_Side', 'Right_Hand_Side', 
                                         'Support', 'Confidence', 'Lift'])

    # Sort rules by lift
    output_DataFrame = output_DataFrame.sort_values('Lift', ascending=False)

    print("\nTop 10 rules by lift:")
    print(output_DataFrame.head(10))
else:
    print("\nNo rules found. Consider adjusting the support and confidence thresholds.")


Number of rules found: 17

Example of first rule structure:
RelationRecord(items=frozenset({'Comedy', 'Low_Rating'}), support=0.023, ordered_statistics=[OrderedStatistic(items_base=frozenset({'Comedy'}), items_add=frozenset({'Low_Rating'}), confidence=0.5227272727272727, lift=1.5279955355956525)])

Top 10 rules by lift:
                                 Left_Hand_Side Right_Hand_Side  Support  \
18  [United States of America, Horror|Thriller]    [Low_Rating]   0.0100   
10                            [Horror|Thriller]    [Low_Rating]   0.0141   
7                                 [South Korea]   [High_Rating]   0.0101   
8                                      [Horror]    [Low_Rating]   0.0154   
3                                 [Documentary]   [High_Rating]   0.0119   
6                                       [Japan]   [High_Rating]   0.0361   
15           [United States of America, Comedy]    [Low_Rating]   0.0151   
16            [Drama, United States of America]   [High_Rating]   0.0

## III.	Análisis de Componentes Principales
