In [28]:
!pip freeze

asttokens==2.4.1
cdsgd @ git+https://github.com/ricardo-valdivia/CDSGD.git@aae1e2d49cc5e015cf50643ecaeb4090d0c4d2d3
cloudpickle==3.0.0
colorama==0.4.6
comm==0.2.2
contourpy==1.2.1
cycler==0.12.1
debugpy==1.8.2
decorator==5.1.1
dill==0.3.8
dsgd @ git+https://github.com/Sergio-P/DSGD.git@7d9bb9f0b417582040a3e3a964997a1e853dd820
executing==2.0.1
filelock==3.15.4
fonttools==4.53.1
fsspec==2024.6.1
imageio==2.34.2
intel-openmp==2021.4.0
ipykernel==6.29.5
ipython==8.26.0
jedi==0.19.1
Jinja2==3.1.4
joblib==1.4.2
jupyter_client==8.6.2
jupyter_core==5.7.2
kiwisolver==1.4.5
lazy_loader==0.4
lime==0.2.0.1
llvmlite==0.43.0
MarkupSafe==2.1.5
matplotlib==3.9.1
matplotlib-inline==0.1.7
mkl==2021.4.0
mpmath==1.3.0
nest-asyncio==1.6.0
networkx==3.3
numba==0.60.0
numpy==1.26.4
packaging==24.1
pandas==2.2.2
parso==0.8.4
pillow==10.4.0
platformdirs==4.2.2
prompt_toolkit==3.0.47
psutil==6.0.0
pure-eval==0.2.2
Pygments==2.18.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
pytz==2024.1
pywin32==306
pyzmq==26

In [29]:
from cdsgd import DSClustering
from dsgd import DSClassifierMultiQ
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris,load_wine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import wittgenstein as lw

Experimento

In [30]:
atom = pd.read_csv('data/Atom_Data.csv')
atom_labels = pd.read_csv('data/Atom_Labels.csv')
chainlink = pd.read_csv('data/Chainlink_Data.csv')
chainlink_labels = pd.read_csv('data/Chainlink_Labels.csv')
engytime = pd.read_csv('data/EngyTime_Data.csv')
engytime_labels = pd.read_csv('data/EngyTime_Labels.csv')
hepta = pd.read_csv('data/Hepta_Data.csv')
hepta_labels = pd.read_csv('data/Hepta_Labels.csv')
tetra = pd.read_csv('data/Tetra_Data.csv')
tetra_labels = pd.read_csv('data/Tetra_Labels.csv')
target = pd.read_csv('data/Target_Data.csv')
target_labels = pd.read_csv('data/Target_Labels.csv')
two_diamonds = pd.read_csv('data/TwoDiamonds_Data.csv')
two_diamonds_labels = pd.read_csv('data/TwoDiamonds_Labels.csv')
wing_nut = pd.read_csv('data/WingNut_Data.csv')
wing_nut_labels = pd.read_csv('data/WingNut_Labels.csv')
# Cargamos los datasets clasico de sklearn
iris = load_iris()
iris_data = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_labels = pd.DataFrame(iris.target, columns=['target'])
wine_data = pd.read_csv('data/wine.csv')
wine_labels = wine_data['good']
wine_data = wine_data.drop(columns=['good'])
wine_data["color"] = wine_data["color"].map({'red': 0, 'white': 1})
# Cargamos los datasets de prueba de la tesis
uniform = pd.read_csv('data/uniform_df.csv')
uniform_data = uniform.drop(columns=['labels'])
uniform_labels = pd.DataFrame(uniform['labels'])
rectangle = pd.read_csv('data/rectangle_df.csv')
rectangle_data = rectangle.drop(columns=['labels'])
rectangle_labels = pd.DataFrame(rectangle['labels'])
gaussian = pd.read_csv('data/gaussian_df.csv')
gaussian_data = gaussian.drop(columns=['labels'])
gaussian_labels = pd.DataFrame(gaussian['labels'])
gaussian_mix = pd.read_csv('data/gaussian_mix_df.csv')
gaussian_mix_data = gaussian_mix.drop(columns=['labels'])
gaussian_mix_labels = pd.DataFrame(gaussian_mix['labels'])
breast_cancer = pd.read_csv('data/breast-cancer-wisconsin.csv')
breast_cancer = breast_cancer.drop(columns=['id'])
breast_cancer_labels = breast_cancer['class'].map({2: 0, 4: 1})
breast_cancer = breast_cancer.drop(columns=['class'])
#pasar a numerico la columna bare_nucleoli
breast_cancer['bare_nucleoli'] = pd.to_numeric(breast_cancer['bare_nucleoli'], errors='coerce')
#dropear los nulos
breast_cancer = breast_cancer.dropna()
heart = pd.read_csv('data/SAheart.csv')
heart_labels = heart['chd']
heart = heart.drop(columns=['row.names','chd'])
heart['famhist'] = heart['famhist'].map({'Present': 1, 'Absent': 0})
datasets = [
    {
        'name': 'Atom',
        'data': atom,
        'labels': atom_labels,
        'n_clusters': atom_labels.nunique().values[0]
    },{
        'name': 'Chainlink',
        'data': chainlink,
        'labels': chainlink_labels,
        'n_clusters': chainlink_labels.nunique().values[0]
    },{
        'name': 'EngyTime',
        'data': engytime,
        'labels': engytime_labels,
        'n_clusters': engytime_labels.nunique().values[0]
    },{
        'name': 'Hepta',
        'data': hepta,
        'labels': hepta_labels,
        'n_clusters': hepta_labels.nunique().values[0]
    },{
        'name': 'Tetra',
        'data': tetra,
        'labels': tetra_labels,
        'n_clusters': tetra_labels.nunique().values[0]
    },{
        'name': 'Target',
        'data': target,
        'labels': target_labels,
        'n_clusters': target_labels.nunique().values[0]
    },{
        'name': 'TwoDiamonds',
        'data': two_diamonds,
        'labels': two_diamonds_labels,
        'n_clusters': two_diamonds_labels.nunique().values[0]
    },{
        'name': 'WingNut',
        'data': wing_nut,
        'labels': wing_nut_labels,
        'n_clusters': wing_nut_labels.nunique().values[0]
    },{
        'name': 'Iris',
        'data': iris_data,
        'labels': iris_labels,
        'n_clusters': 3
    },{
        'name': 'Wine',
        'data': wine_data,
        'labels': wine_labels,
        'n_clusters': 3
    },{
        'name': 'Uniform',
        'data': uniform_data,
        'labels': uniform_labels,
        'n_clusters': uniform_labels.nunique().values[0]
    },{
        'name': 'Rectangle',
        'data': rectangle_data,
        'labels': rectangle_labels,
        'n_clusters': rectangle_labels.nunique().values[0]
    },{
        'name': 'Gaussian',
        'data': gaussian_data,
        'labels': gaussian_labels,
        'n_clusters': gaussian_labels.nunique().values[0]
    },{
        'name': 'GaussianMix',
        'data': gaussian_mix_data,
        'labels': gaussian_mix_labels,
        'n_clusters': gaussian_mix_labels.nunique().values[0]
    },{
        'name': 'BreastCancer',
        'data': breast_cancer,
        'labels': breast_cancer_labels,
        'n_clusters': breast_cancer_labels.nunique()
    },{
        'name': 'Heart',
        'data': heart,
        'labels': heart_labels,
        'n_clusters': heart_labels.nunique()
    }
]
results = []

In [32]:
#Iteramos sobre los datasets
# testear al menos 10 casoss, evaluar diferencias con Ricardo
# buscar definiciones de interpretaabilidad y como lo miden
for dataset in datasets:
    print("Dataset: "+ dataset["name"])
    n_clusters = dataset['n_clusters']
    data = dataset['data']
    labels = dataset['labels'].values.ravel()
    #normalizamos los datos
    scaler = StandardScaler()
    data = scaler.fit_transform(data)
    data = pd.DataFrame(data, columns=dataset['data'].columns)
    #KMeans
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans_labels = kmeans.fit_predict(data)
    #DBSCAN
    dbscan = DBSCAN(eps=0.5)
    dbscan_labels = dbscan.fit_predict(data)
    #Agglomerative
    agglomerative = AgglomerativeClustering(n_clusters=n_clusters)
    agglomerative_labels = agglomerative.fit_predict(data)

    #Generamos un arbol de decisión para cada metodo y calculamos su precicion
    X_train_kmeans, X_test_kmeans, y_train_kmeans, y_test_kmeans = train_test_split(data, kmeans_labels, test_size=0.2)
    X_train_dbscan, X_test_dbscan, y_train_dbscan, y_test_dbscan = train_test_split(data, dbscan_labels, test_size=0.2)
    X_train_agglomerative, X_test_agglomerative, y_train_agglomerative, y_test_agglomerative = train_test_split(data, agglomerative_labels, test_size=0.2)    
    X_train_kmeans = X_train_kmeans.to_numpy()
    X_test_kmeans = X_test_kmeans.to_numpy()
    X_train_dbscan = X_train_dbscan.to_numpy()
    X_test_dbscan = X_test_dbscan.to_numpy()
    X_train_agglomerative = X_train_agglomerative.to_numpy()
    X_test_agglomerative = X_test_agglomerative.to_numpy()
    #DSGD
    """     dsgd = DSClassifierMultiQ(num_classes=n_clusters, min_iter=50, max_iter=400, debug_mode=True, \
                            lossfn="MSE", num_workers=0, min_dloss=1e-7)
        dsgd.fit(X_train_kmeans,y_train_kmeans, add_single_rules=True, single_rules_breaks=3,
                                    column_names=data.columns[:], print_every_epochs=31)
        print("Kmeans DSGD: ", accuracy_score(y_test_kmeans, dsgd.predict(X_test_kmeans)))
        results.append({
            'dataset': dataset['name'],
            "algorithm": "Kmeans DSGD",
            "accuracy": accuracy_score(y_test_kmeans, dsgd.predict(X_test_kmeans))
        }) """
    #debe existir mas de 1 cluster y todos los labels deben ser positivos
    if np.unique(dbscan_labels).shape[0] > 1 and np.all(dbscan_labels >= 0):
        dsgd = DSClassifierMultiQ(num_classes=np.unique(dbscan_labels).shape[0], min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)
        dsgd.fit(X_train_dbscan, y_train_dbscan, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:], print_every_epochs=31)
        print("Dbscan DSGD: ", accuracy_score(y_test_dbscan, dsgd.predict(X_test_dbscan)))
        results.append({
            'dataset': dataset['name'],
            "algorithm": "Dbscan DSGD",
            "accuracy": accuracy_score(y_test_dbscan, dsgd.predict(X_test_dbscan))
        })
    elif np.unique(dbscan_labels).shape[0] > 1:
        # print different labels
        y_train_dbscan = np.where(y_train_dbscan == -1, len(np.unique(dbscan_labels)) - 1, y_train_dbscan)
        y_test_dbscan = np.where(y_test_dbscan == -1, len(np.unique(dbscan_labels)) - 1, y_test_dbscan)
        dsgd = DSClassifierMultiQ(num_classes=np.unique(dbscan_labels).shape[0], min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)
        dsgd.fit(X_train_dbscan, y_train_dbscan, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:], print_every_epochs=31)
        print("Dbscan DSGD: ", accuracy_score(y_test_dbscan, dsgd.predict(X_test_dbscan)))
        results.append({
            'dataset': dataset['name'],
            "algorithm": "Dbscan DSGD",
            "accuracy": accuracy_score(y_test_dbscan, dsgd.predict(X_test_dbscan))
        })
    else:
        results.append({
            'dataset': dataset['name'],
            "algorithm": "Dbscan DSGD",
            "accuracy": 0
        })
    
    """     dsgd = DSClassifierMultiQ(num_classes=n_clusters, min_iter=50, max_iter=400, debug_mode=True, \
                            lossfn="MSE", num_workers=0, min_dloss=1e-7)
        dsgd.fit(X=X_train_agglomerative, y=y_train_agglomerative, add_single_rules=True,
                                single_rules_breaks=3, add_mult_rules=False,
                                    column_names=data.columns[:], print_every_epochs=31)
        print("Agglomerative DSGD: ", accuracy_score(y_test_agglomerative, dsgd.predict(X_test_agglomerative)))
        results.append({
            'dataset': dataset['name'],
            "algorithm": "Agglomerative DSGD",
            "accuracy": accuracy_score(y_test_agglomerative, dsgd.predict(X_test_agglomerative))
        }) """
    
    """ dt_kmeans = DecisionTreeClassifier(max_depth=3)
    dt_kmeans.fit(X_train_kmeans, y_train_kmeans)
    print("Kmeans Dt: ", accuracy_score(y_test_kmeans, dt_kmeans.predict(X_test_kmeans)))
    dt_dbscan = DecisionTreeClassifier(max_depth=3)
    dt_dbscan.fit(X_train_dbscan, y_train_dbscan)
    print("Dbscan Dt: ", accuracy_score(y_test_dbscan, dt_dbscan.predict(X_test_dbscan)))
    dt_agglomerative = DecisionTreeClassifier(max_depth=3)
    dt_agglomerative.fit(X_train_agglomerative, y_train_agglomerative)
    print("Agglomerative Dt: ", accuracy_score(y_test_agglomerative, dt_agglomerative.predict(X_test_agglomerative)))   
     """
    """ #Ripper accuracy
    kmeans_lw_accuracy = []
    for i in range(np.unique(kmeans_labels).shape[0]):
        kmeans_labels_pos = (kmeans_labels == i).astype(int)
        ripper = lw.RIPPER()
        df = pd.concat([data, pd.DataFrame(kmeans_labels_pos, columns=['cluster'])], axis=1)
        ripper.fit(df, class_feat="cluster", pos_class=1)
        kmeans_lw_accuracy.append(accuracy_score(kmeans_labels_pos, ripper.predict(df)))
    print("Kmeans Ripper: ", np.mean(kmeans_lw_accuracy))
    dbscan_lw_accuracy = []
    for i in range(np.unique(dbscan_labels).shape[0]):
        dbscan_labels_pos = (dbscan_labels == i).astype(int)
        ripper = lw.RIPPER()
        df = pd.concat([data, pd.DataFrame(dbscan_labels_pos, columns=['cluster'])], axis=1)
        ripper.fit(df, class_feat="cluster", pos_class=1)
        dbscan_lw_accuracy.append(accuracy_score(dbscan_labels_pos, ripper.predict(df)))
    print("Dbscan Ripper: ", np.mean(dbscan_lw_accuracy))
    agglomerative_lw_accuracy = []
    for i in range(np.unique(agglomerative_labels).shape[0]):
        agglomerative_labels_pos = (agglomerative_labels == i).astype(int)
        ripper = lw.RIPPER()
        df = pd.concat([data, pd.DataFrame(agglomerative_labels_pos, columns=['cluster'])], axis=1)
        ripper.fit(df, class_feat="cluster", pos_class=1)
        agglomerative_lw_accuracy.append(accuracy_score(agglomerative_labels_pos, ripper.predict(df)))
    print("Agglomerative Ripper: ", np.mean(agglomerative_lw_accuracy)) """

""" 

    #CDSDG
    print("CDSDG")
    cdsgd = DSClustering(data=data.copy())
    cdsgd.generate_categorical_rules()
    cdsgd_labels = cdsgd.predict()

    #CDSDG mas votados
    print("CDSDG most voted")
    cdsgd1 = DSClustering(data=data.copy(), most_voted=True)
    cdsgd1.generate_categorical_rules()
    cdsgd1_labels = cdsgd1.predict()

    # CDSDG con numero de clusters
    print("CDSDG with n_clusters")
    cdsgd2 = DSClustering(data=data.copy(), cluster=n_clusters)
    cdsgd2.generate_categorical_rules()
    cdsgd2_labels = cdsgd2.predict()

    # CDSDG con numero de clusters mas votado
    print("CDSDG with n_clusters most voted")
    cdsgd2 = DSClustering(data=data.copy(), cluster=n_clusters, most_voted=True)
    cdsgd2.generate_categorical_rules()
    cdsgd2_labels = cdsgd2.predict() """
    

Dataset: Atom
Optimization started
Processing epoch	94	0.0227	
Training time: 24.90s, epochs: 100

Least training loss reached: 0.020
Dbscan DSGD:  0.75
Dataset: Chainlink
Optimization started
Processing epoch	373	0.0201	
Training time: 99.64s, epochs: 400

Least training loss reached: 0.019
Dbscan DSGD:  0.985
Dataset: EngyTime
Optimization started
Processing epoch	373	0.0036	
Training time: 461.73s, epochs: 400

Least training loss reached: 0.003
Dbscan DSGD:  0.9975609756097561
Dataset: Hepta
Optimization started
Processing epoch	63	0.0629	
Training time: 4.64s, epochs: 83

Least training loss reached: 0.034
Dbscan DSGD:  0.9767441860465116
Dataset: Tetra
Optimization started
Processing epoch	218	0.0162	
Training time: 22.99s, epochs: 225

Least training loss reached: 0.016
Dbscan DSGD:  0.9625
Dataset: Target
Optimization started
Processing epoch	373	0.0162	
Training time: 75.59s, epochs: 400

Least training loss reached: 0.015
Dbscan DSGD:  0.9935064935064936
Dataset: TwoDiamonds


' \n\n    #CDSDG\n    print("CDSDG")\n    cdsgd = DSClustering(data=data.copy())\n    cdsgd.generate_categorical_rules()\n    cdsgd_labels = cdsgd.predict()\n\n    #CDSDG mas votados\n    print("CDSDG most voted")\n    cdsgd1 = DSClustering(data=data.copy(), most_voted=True)\n    cdsgd1.generate_categorical_rules()\n    cdsgd1_labels = cdsgd1.predict()\n\n    # CDSDG con numero de clusters\n    print("CDSDG with n_clusters")\n    cdsgd2 = DSClustering(data=data.copy(), cluster=n_clusters)\n    cdsgd2.generate_categorical_rules()\n    cdsgd2_labels = cdsgd2.predict()\n\n    # CDSDG con numero de clusters mas votado\n    print("CDSDG with n_clusters most voted")\n    cdsgd2 = DSClustering(data=data.copy(), cluster=n_clusters, most_voted=True)\n    cdsgd2.generate_categorical_rules()\n    cdsgd2_labels = cdsgd2.predict() '

In [None]:
results = pd.DataFrame(results)
results.to_csv('results.csv', index=False)