In [2]:
import pandas as pd
import numpy as np

from dsgd.DSClassifierMultiQ import DSClassifierMultiQ



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Clasificadores
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Exploracion de Datos y pre procesamiento

In [6]:
data = pd.read_csv('data/list_attr_celeba.csv')
print("tamaño del dataset: ", data.shape)
data = data.drop('image_id', axis=1)
data = data.replace(-1, 0)
data = data.sample(frac=1).reset_index(drop=True)
print(data.dtypes)

tamaño del dataset:  (202599, 41)
5_o_Clock_Shadow       int64
Arched_Eyebrows        int64
Attractive             int64
Bags_Under_Eyes        int64
Bald                   int64
Bangs                  int64
Big_Lips               int64
Big_Nose               int64
Black_Hair             int64
Blond_Hair             int64
Blurry                 int64
Brown_Hair             int64
Bushy_Eyebrows         int64
Chubby                 int64
Double_Chin            int64
Eyeglasses             int64
Goatee                 int64
Gray_Hair              int64
Heavy_Makeup           int64
High_Cheekbones        int64
Male                   int64
Mouth_Slightly_Open    int64
Mustache               int64
Narrow_Eyes            int64
No_Beard               int64
Oval_Face              int64
Pale_Skin              int64
Pointy_Nose            int64
Receding_Hairline      int64
Rosy_Cheeks            int64
Sideburns              int64
Smiling                int64
Straight_Hair          int64
Wavy_Hair

In [7]:
y = data["Attractive"]
X = data.drop(columns=["Attractive"])

# Testeo de los clasificadores

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [9]:
# Diccionario de modelos
accuracys = {}
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(max_depth=5),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Función para entrenar y evaluar modelos
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{name} Accuracy: {accuracy:.4f}')
        accuracys[name] = accuracy
        print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}')
        print(f'Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}\n')
        print(f'---------------------------------------------------\n')

# Llamada a la función
train_and_evaluate(models, X_train, y_train, X_test, y_test)


Logistic Regression Accuracy: 0.7890
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.80      0.76      0.78     24692
           1       0.78      0.82      0.80     25958

    accuracy                           0.79     50650
   macro avg       0.79      0.79      0.79     50650
weighted avg       0.79      0.79      0.79     50650

Confusion Matrix for Logistic Regression:
[[18758  5934]
 [ 4752 21206]]

---------------------------------------------------

Random Forest Accuracy: 0.7597
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.73      0.81      0.77     24692
           1       0.80      0.71      0.75     25958

    accuracy                           0.76     50650
   macro avg       0.76      0.76      0.76     50650
weighted avg       0.76      0.76      0.76     50650

Confusion Matrix for Random Forest:
[[19953  4739]
 [ 7431 1852

# Testeo del clasificador de DS

In [10]:
Y = y
X = X.apply(pd.to_numeric)

cut = int(0.25 * len(data))

X_train = X.iloc[:-cut].values
X_test = X.iloc[-cut:].values
Y_train = Y.iloc[:-cut].values
Y_test = Y.iloc[-cut:].values


print(len(X_train), len(X_test), len(Y_train), len(Y_test))
print(Y.head())


151950 50649 151950 50649
0    1
1    0
2    0
3    0
4    1
Name: Attractive, dtype: int64


In [11]:
DSC = DSClassifierMultiQ(2, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)

In [12]:
losses, epoch, dt = DSC.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	32	0.1448	
Training time: 3041.61s, epochs: 52

Least training loss reached: 0.145
DSModelMultiQ(
  DS Classifier using 78 rules
  
  Rule 1: 5_o_Clock_Shadow = 0
  	C1: 0.249	C2: 0.000	Unc: 0.751
  
  Rule 2: 5_o_Clock_Shadow = 1
  	C1: 0.002	C2: 0.163	Unc: 0.836
  
  Rule 3: Arched_Eyebrows = 0
  	C1: 0.099	C2: 0.022	Unc: 0.879
  
  Rule 4: Arched_Eyebrows = 1
  	C1: 0.029	C2: 0.102	Unc: 0.869
  
  Rule 5: Attractive = 0
  	C1: 0.104	C2: 0.015	Unc: 0.880
  
  Rule 6: Attractive = 1
  	C1: 0.089	C2: 0.056	Unc: 0.854
  
  Rule 7: Bags_Under_Eyes = 0
  	C1: 0.002	C2: 0.185	Unc: 0.813
  
  Rule 8: Bags_Under_Eyes = 1
  	C1: 0.682	C2: 0.003	Unc: 0.315
  
  Rule 9: Bald = 0
  	C1: 0.045	C2: 0.054	Unc: 0.901
  
  Rule 10: Bald = 1
  	C1: 0.132	C2: 0.000	Unc: 0.868
  
  Rule 11: Bangs = 0
  	C1: 0.069	C2: 0.042	Unc: 0.889
  
  Rule 12: Bangs = 1
  	C1: 0.099	C2: 0.004	Unc: 0.897
  
  Rule 13: Big_Lips = 0
  	C1: 0.002	C2: 0.175	Unc: 0.823
  
  Rule 14: B

In [16]:
DSC.print_most_important_rules(classes=["bad appearance",  "good appearance"])



Most important rules for class bad appearance

	[0.780] R25: Bushy_Eyebrows = 1
			bad: 0.780	goo: 0.000	Unc: 0.220

	[0.687] R19: Blond_Hair = 1
			bad: 0.686	goo: 0.002	Unc: 0.311

	[0.684] R7: Bags_Under_Eyes = 1
			bad: 0.682	goo: 0.003	Unc: 0.315

	[0.671] R33: Goatee = 1
			bad: 0.667	goo: 0.007	Unc: 0.326

	[0.620] R27: Chubby = 1
			bad: 0.620	goo: 0.000	Unc: 0.380

	[0.512] R76: Wearing_Necktie = 0
			bad: 0.512	goo: 0.000	Unc: 0.488

	[0.479] R55: Pointy_Nose = 1
			bad: 0.477	goo: 0.003	Unc: 0.519

	[0.478] R29: Double_Chin = 1
			bad: 0.478	goo: 0.000	Unc: 0.522

	[0.428] R70: Wearing_Hat = 0
			bad: 0.427	goo: 0.001	Unc: 0.572

	[0.411] R69: Wearing_Earrings = 1
			bad: 0.409	goo: 0.005	Unc: 0.586

	[0.411] R34: Gray_Hair = 0
			bad: 0.410	goo: 0.003	Unc: 0.587

	[0.340] R60: Sideburns = 0
			bad: 0.340	goo: 0.000	Unc: 0.660

	[0.287] R22: Brown_Hair = 0
			bad: 0.286	goo: 0.002	Unc: 0.712

	[0.250] R45: Mustache = 1
			bad: 0.247	goo: 0.005	Unc: 0.747

	[0.249] R0: 5_o_

In [13]:
Y_pred = DSC.predict(X_test)
accuracy = accuracy_score(Y_test, Y_pred)
accuracys["DSClassifierMultiQ"] = accuracy
print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report:\n{classification_report(Y_test, Y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y_pred)}\n')

Accuracy: 0.7858
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78     24766
           1       0.79      0.80      0.79     25883

    accuracy                           0.79     50649
   macro avg       0.79      0.79      0.79     50649
weighted avg       0.79      0.79      0.79     50649

Confusion Matrix:
[[19210  5556]
 [ 5295 20588]]



In [14]:
for key in accuracys:
    print(key, accuracys[key])

Logistic Regression 0.7890227048371175
Random Forest 0.7597235932872656
Support Vector Machine 0.7945113524185587
K-Nearest Neighbors 0.7612043435340573
DSClassifierMultiQ 0.7857608244980158
