In [12]:
import pandas as pd
import numpy as np

from dsgd.DSClassifierMultiQ import DSClassifierMultiQ



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Clasificadores
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Exploracion de Datos y pre procesamiento

In [13]:
data = pd.read_csv('data/obesity.csv')
print("tamaño del dataset: ", data.shape)
data["Gender"] = data["Gender"].apply(lambda x: 1 if x =="Male" else 0)
data["CALC"]= data["CALC"].map({'no': 0,'Sometimes': 1,'Frequently': 2,'Always': 3})
data["FAVC"] = data["FAVC"].apply(lambda x: 1 if x =="yes" else 0)
data["SCC"] = data["SCC"].apply(lambda x: 1 if x =="yes" else 0)
data["SMOKE"] = data["SMOKE"].apply(lambda x: 1 if x =="yes" else 0)
data["family_history_with_overweight"] = data["family_history_with_overweight"].apply(lambda x: 1 if x =="yes" else 0)
data["CAEC"]= data["CAEC"].map({'no': 0,'Sometimes': 1,'Frequently': 2,'Always': 3})
data = pd.get_dummies(data, columns=["MTRANS"], dtype=int)
print(data.dtypes)
print(data["NObeyesdad"].unique())
#desordenar
data = data.sample(frac=1).reset_index(drop=True)

tamaño del dataset:  (2111, 17)
Age                               float64
Gender                              int64
Height                            float64
Weight                            float64
CALC                                int64
FAVC                                int64
FCVC                              float64
NCP                               float64
SCC                                 int64
SMOKE                               int64
CH2O                              float64
family_history_with_overweight      int64
FAF                               float64
TUE                               float64
CAEC                                int64
NObeyesdad                         object
MTRANS_Automobile                   int64
MTRANS_Bike                         int64
MTRANS_Motorbike                    int64
MTRANS_Public_Transportation        int64
MTRANS_Walking                      int64
dtype: object
['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Ty

In [14]:
y = data["NObeyesdad"]
X = data.drop(columns=["NObeyesdad"])
print(y.head())

0         Obesity_Type_I
1          Normal_Weight
2    Insufficient_Weight
3       Obesity_Type_III
4       Obesity_Type_III
Name: NObeyesdad, dtype: object


# Testeo de los clasificadores

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [16]:
# Diccionario de modelos
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Función para entrenar y evaluar modelos
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{name} Accuracy: {accuracy:.4f}')
        print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}')
        print(f'Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}\n')
        print(f'---------------------------------------------------\n')

# Llamada a la función
train_and_evaluate(models, X_train, y_train, X_test, y_test)


Logistic Regression Accuracy: 0.7064
Classification Report for Logistic Regression:
                     precision    recall  f1-score   support

Insufficient_Weight       0.76      0.78      0.77        68
      Normal_Weight       0.57      0.58      0.58        72
     Obesity_Type_I       0.59      0.52      0.55        88
    Obesity_Type_II       0.80      0.96      0.87        74
   Obesity_Type_III       0.87      0.99      0.92        81
 Overweight_Level_I       0.68      0.63      0.65        73
Overweight_Level_II       0.61      0.49      0.54        72

           accuracy                           0.71       528
          macro avg       0.70      0.71      0.70       528
       weighted avg       0.70      0.71      0.70       528

Confusion Matrix for Logistic Regression:
[[53 15  0  0  0  0  0]
 [17 42  2  0  0 10  1]
 [ 0  0 46 12 11  3 16]
 [ 0  1  1 71  0  0  1]
 [ 0  0  0  1 80  0  0]
 [ 0  9 11  2  1 46  4]
 [ 0  7 18  3  0  9 35]]

------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.9621
Classification Report for Random Forest:
                     precision    recall  f1-score   support

Insufficient_Weight       0.95      0.93      0.94        68
      Normal_Weight       0.85      0.96      0.90        72
     Obesity_Type_I       0.98      0.99      0.98        88
    Obesity_Type_II       1.00      0.99      0.99        74
   Obesity_Type_III       1.00      0.99      0.99        81
 Overweight_Level_I       0.97      0.96      0.97        73
Overweight_Level_II       0.99      0.92      0.95        72

           accuracy                           0.96       528
          macro avg       0.96      0.96      0.96       528
       weighted avg       0.96      0.96      0.96       528

Confusion Matrix for Random Forest:
[[63  5  0  0  0  0  0]
 [ 3 69  0  0  0  0  0]
 [ 0  0 87  0  0  0  1]
 [ 0  1  0 73  0  0  0]
 [ 0  0  1  0 80  0  0]
 [ 0  3  0  0  0 70  0]
 [ 0  3  1  0  0  2 66]]

------------------------------------------------

# Testeo del clasificador de DS

In [17]:
Y = y.map({'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4, 'Obesity_Type_II': 5, 'Obesity_Type_III': 6})
X = X.apply(pd.to_numeric)

cut = int(0.25 * len(data))

X_train = X.iloc[:-cut].values
X_test = X.iloc[-cut:].values
Y_train = Y.iloc[:-cut].values
Y_test = Y.iloc[-cut:].values


print(len(X_train), len(X_test), len(Y_train), len(Y_test))
print(Y.head())


1584 527 1584 527
0    4
1    1
2    0
3    6
4    6
Name: NObeyesdad, dtype: int64


In [18]:
DSC = DSClassifierMultiQ(7, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)

In [19]:
losses, epoch, dt = DSC.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	94	0.0474	
Training time: 45.50s, epochs: 113

Least training loss reached: 0.040
DSModelMultiQ(
  DS Classifier using 60 rules
  
  Rule 1: Age < 20.084
  	C1: 0.329	C2: 0.228	C3: 0.129	C4: 0.014	C5: 0.220	C6: 0.000	C7: 0.061	Unc: 0.019
  
  Rule 2: 20.084 < Age < 24.401
  	C1: 0.116	C2: 0.211	C3: 0.252	C4: 0.000	C5: 0.323	C6: 0.000	C7: 0.026	Unc: 0.072
  
  Rule 3: 24.401 < Age < 28.718
  	C1: 0.000	C2: 0.052	C3: 0.069	C4: 0.000	C5: 0.000	C6: 0.426	C7: 0.261	Unc: 0.192
  
  Rule 4: Age > 28.718
  	C1: 0.000	C2: 0.031	C3: 0.021	C4: 0.367	C5: 0.133	C6: 0.327	C7: 0.000	Unc: 0.120
  
  Rule 5: Gender = 0.0
  	C1: 0.168	C2: 0.052	C3: 0.090	C4: 0.045	C5: 0.107	C6: 0.000	C7: 0.348	Unc: 0.191
  
  Rule 6: Gender = 1.0
  	C1: 0.193	C2: 0.246	C3: 0.047	C4: 0.109	C5: 0.000	C6: 0.367	C7: 0.000	Unc: 0.039
  
  Rule 7: Height < 1.640
  	C1: 0.000	C2: 0.168	C3: 0.152	C4: 0.000	C5: 0.351	C6: 0.108	C7: 0.153	Unc: 0.068
  
  Rule 8: 1.640 < Height < 1.703
  	C1: 0

In [20]:
Y_pred = DSC.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f'Classification Report:\n{classification_report(Y_test, Y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y_pred)}\n')

Accuracy: 0.7969639468690702
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.84      0.84        68
           1       0.71      0.73      0.72        63
           2       0.64      0.58      0.61        74
           3       0.66      0.67      0.67        70
           4       0.79      0.78      0.78        90
           5       0.86      0.93      0.89        70
           6       1.00      1.00      1.00        92

    accuracy                           0.80       527
   macro avg       0.79      0.79      0.79       527
weighted avg       0.79      0.80      0.80       527

Confusion Matrix:
[[57  7  4  0  0  0  0]
 [ 6 46  8  3  0  0  0]
 [ 1  7 43 15  8  0  0]
 [ 3  5  7 47  6  2  0]
 [ 0  0  5  6 70  9  0]
 [ 0  0  0  0  5 65  0]
 [ 0  0  0  0  0  0 92]]



In [21]:
DSC2 = DSClassifierMultiQ(7, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7, precompute_rules=True)

In [22]:
losses, epoch, dt = DSC2.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=True,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	32	0.1040	
Training time: 24.07s, epochs: 54

Least training loss reached: 0.102
DSModelMultiQ(
  DS Classifier using 440 rules
  
  Rule 1: Age < 20.084
  	C1: 0.062	C2: 0.046	C3: 0.074	C4: 0.054	C5: 0.045	C6: 0.000	C7: 0.010	Unc: 0.709
  
  Rule 2: 20.084 < Age < 24.401
  	C1: 0.002	C2: 0.052	C3: 0.031	C4: 0.022	C5: 0.035	C6: 0.000	C7: 0.076	Unc: 0.782
  
  Rule 3: 24.401 < Age < 28.718
  	C1: 0.000	C2: 0.000	C3: 0.000	C4: 0.004	C5: 0.012	C6: 0.088	C7: 0.068	Unc: 0.828
  
  Rule 4: Age > 28.718
  	C1: 0.000	C2: 0.043	C3: 0.015	C4: 0.063	C5: 0.022	C6: 0.140	C7: 0.000	Unc: 0.717
  
  Rule 5: Gender = 0.0
  	C1: 0.047	C2: 0.004	C3: 0.000	C4: 0.000	C5: 0.019	C6: 0.000	C7: 0.106	Unc: 0.824
  
  Rule 6: Gender = 1.0
  	C1: 0.003	C2: 0.048	C3: 0.038	C4: 0.049	C5: 0.024	C6: 0.165	C7: 0.000	Unc: 0.672
  
  Rule 7: Height < 1.640
  	C1: 0.074	C2: 0.060	C3: 0.057	C4: 0.000	C5: 0.010	C6: 0.000	C7: 0.051	Unc: 0.749
  
  Rule 8: 1.640 < Height < 1.703
  	C1: 0

In [23]:
Y2_pred = DSC2.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y2_pred)}")
print(f'Classification Report:\n{classification_report(Y_test, Y2_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y2_pred)}\n')

Accuracy: 0.3225806451612903
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.12      0.21        68
           1       0.00      0.00      0.00        63
           2       0.00      0.00      0.00        74
           3       0.00      0.00      0.00        70
           4       0.00      0.00      0.00        90
           5       0.17      1.00      0.30        70
           6       0.78      1.00      0.88        92

    accuracy                           0.32       527
   macro avg       0.26      0.30      0.20       527
weighted avg       0.27      0.32      0.22       527

Confusion Matrix:
[[ 8  0  0  0  0 54  6]
 [ 1  0  0  0  0 61  1]
 [ 0  0  0  0  0 72  2]
 [ 0  0  0  0  0 66  4]
 [ 0  0  0  0  0 77 13]
 [ 0  0  0  0  0 70  0]
 [ 0  0  0  0  0  0 92]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
DSC3 = DSClassifierMultiQ(7, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7, precompute_rules=True)

In [25]:
losses, epoch, dt = DSC3.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=5, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	125	0.0292	
Training time: 47.03s, epochs: 132

Least training loss reached: 0.028
DSModelMultiQ(
  DS Classifier using 76 rules
  
  Rule 1: Age < 18.209
  	C1: 0.287	C2: 0.218	C3: 0.178	C4: 0.024	C5: 0.156	C6: 0.000	C7: 0.135	Unc: 0.000
  
  Rule 2: 18.209 < Age < 21.644
  	C1: 0.383	C2: 0.220	C3: 0.180	C4: 0.000	C5: 0.081	C6: 0.001	C7: 0.115	Unc: 0.020
  
  Rule 3: 21.644 < Age < 24.401
  	C1: 0.201	C2: 0.301	C3: 0.210	C4: 0.002	C5: 0.154	C6: 0.059	C7: 0.025	Unc: 0.049
  
  Rule 4: 24.401 < Age < 27.158
  	C1: 0.000	C2: 0.096	C3: 0.115	C4: 0.000	C5: 0.077	C6: 0.170	C7: 0.222	Unc: 0.320
  
  Rule 5: 27.158 < Age < 30.593
  	C1: 0.000	C2: 0.000	C3: 0.000	C4: 0.168	C5: 0.006	C6: 0.419	C7: 0.000	Unc: 0.406
  
  Rule 6: Age > 30.593
  	C1: 0.000	C2: 0.050	C3: 0.024	C4: 0.461	C5: 0.000	C6: 0.282	C7: 0.000	Unc: 0.183
  
  Rule 7: Gender = 0.0
  	C1: 0.101	C2: 0.126	C3: 0.178	C4: 0.047	C5: 0.127	C6: 0.000	C7: 0.358	Unc: 0.063
  
  Rule 8: Gender = 1.0
 

In [29]:
DSC3.print_most_important_rules(classes=["Insufficient_Weight", "Normal_Weight", "Overweight_Level_I", "Overweight_Level_II", "Obesity_Type_I", "Obesity_Type_II", "Obesity_Type_III"])



Most important rules for class Insufficient_Weight

	[0.706] R14: Weight < 61.293
			Ins: 0.512	Nor: 0.460	Ove: 0.000	Ove: 0.000	Obe: 0.000	Obe: 0.000	Obe: 0.000	Unc: 0.028

	[0.613] R1: 18.209 < Age < 21.644
			Ins: 0.383	Nor: 0.220	Ove: 0.180	Ove: 0.000	Obe: 0.081	Obe: 0.001	Obe: 0.115	Unc: 0.020

	[0.592] R53: 1.011 < FAF < 1.378
			Ins: 0.351	Nor: 0.000	Ove: 0.170	Ove: 0.089	Obe: 0.096	Obe: 0.207	Obe: 0.088	Unc: 0.000

	[0.585] R64: CAEC = 2.0
			Ins: 0.446	Nor: 0.136	Ove: 0.000	Ove: 0.175	Obe: 0.007	Obe: 0.000	Obe: 0.002	Unc: 0.233

	[0.574] R36: 3.023 < NCP < 3.438
			Ins: 0.356	Nor: 0.000	Ove: 0.511	Ove: 0.040	Obe: 0.000	Obe: 0.017	Obe: 0.000	Unc: 0.075

	[0.536] R11: 1.703 < Height < 1.743
			Ins: 0.319	Nor: 0.001	Ove: 0.137	Ove: 0.308	Obe: 0.000	Obe: 0.000	Obe: 0.137	Unc: 0.098

	[0.536] R0: Age < 18.209
			Ins: 0.287	Nor: 0.218	Ove: 0.178	Ove: 0.024	Obe: 0.156	Obe: 0.000	Obe: 0.135	Unc: 0.000

	[0.535] R57: 0.067 < TUE < 0.398
			Ins: 0.286	Nor: 0.000	Ove: 0.233	Ove: 0.088	

In [26]:
Y3_pred = DSC3.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y3_pred)}")
print(f'Classification Report:\n{classification_report(Y_test, Y3_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y3_pred)}\n')

Accuracy: 0.8425047438330171
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.90        68
           1       0.68      0.79      0.73        63
           2       0.75      0.76      0.75        74
           3       0.78      0.60      0.68        70
           4       0.83      0.84      0.84        90
           5       0.93      0.97      0.95        70
           6       0.99      1.00      0.99        92

    accuracy                           0.84       527
   macro avg       0.84      0.84      0.83       527
weighted avg       0.84      0.84      0.84       527

Confusion Matrix:
[[60  8  0  0  0  0  0]
 [ 3 50  7  1  2  0  0]
 [ 2  8 56  5  2  1  0]
 [ 1  6  7 42 11  3  0]
 [ 0  1  5  6 76  1  1]
 [ 0  1  0  0  1 68  0]
 [ 0  0  0  0  0  0 92]]



In [27]:
DSC4 = DSClassifierMultiQ(7, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7, precompute_rules=True)

In [31]:
losses, epoch, dt = DSC4.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=2, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	94	0.0468	
Training time: 38.98s, epochs: 113

Least training loss reached: 0.043
DSModelMultiQ(
  DS Classifier using 52 rules
  
  Rule 1: Age < 21.644
  	C1: 0.328	C2: 0.267	C3: 0.131	C4: 0.034	C5: 0.107	C6: 0.000	C7: 0.113	Unc: 0.021
  
  Rule 2: 21.644 < Age < 27.158
  	C1: 0.067	C2: 0.093	C3: 0.047	C4: 0.000	C5: 0.097	C6: 0.263	C7: 0.214	Unc: 0.219
  
  Rule 3: Age > 27.158
  	C1: 0.000	C2: 0.007	C3: 0.060	C4: 0.451	C5: 0.000	C6: 0.340	C7: 0.000	Unc: 0.142
  
  Rule 4: Gender = 0.0
  	C1: 0.178	C2: 0.048	C3: 0.096	C4: 0.000	C5: 0.000	C6: 0.000	C7: 0.352	Unc: 0.326
  
  Rule 5: Gender = 1.0
  	C1: 0.116	C2: 0.238	C3: 0.066	C4: 0.202	C5: 0.030	C6: 0.338	C7: 0.000	Unc: 0.010
  
  Rule 6: Height < 1.663
  	C1: 0.000	C2: 0.111	C3: 0.034	C4: 0.041	C5: 0.388	C6: 0.099	C7: 0.301	Unc: 0.025
  
  Rule 7: 1.663 < Height < 1.743
  	C1: 0.102	C2: 0.016	C3: 0.031	C4: 0.251	C5: 0.140	C6: 0.000	C7: 0.312	Unc: 0.149
  
  Rule 8: Height > 1.743
  	C1: 0.321	C2

In [32]:
Y4_pred = DSC4.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y4_pred)}")
print(f'Classification Report:\n{classification_report(Y_test, Y4_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y4_pred)}\n')

Accuracy: 0.7817836812144212
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.81      0.80        68
           1       0.69      0.67      0.68        63
           2       0.68      0.69      0.68        74
           3       0.64      0.61      0.63        70
           4       0.81      0.72      0.76        90
           5       0.83      0.91      0.87        70
           6       0.95      1.00      0.97        92

    accuracy                           0.78       527
   macro avg       0.77      0.77      0.77       527
weighted avg       0.78      0.78      0.78       527

Confusion Matrix:
[[55  9  4  0  0  0  0]
 [ 9 42 10  2  0  0  0]
 [ 2  7 51  9  3  0  2]
 [ 4  3 10 43  6  2  2]
 [ 0  0  0 13 65 11  1]
 [ 0  0  0  0  6 64  0]
 [ 0  0  0  0  0  0 92]]

