In [91]:
import pandas as pd
import numpy as np

from dsgd.DSClassifierMultiQ import DSClassifierMultiQ



from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt

# Clasificadores
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Exploracion de Datos y pre procesamiento

In [92]:
data = pd.read_csv('data/obesity.csv')
print("tamaño del dataset: ", data.shape)
data["Gender"] = data["Gender"].apply(lambda x: 1 if x =="Male" else 0)
data["CALC"]= data["CALC"].map({'no': 0,'Sometimes': 1,'Frequently': 2,'Always': 3})
data["FAVC"] = data["FAVC"].apply(lambda x: 1 if x =="yes" else 0)
data["SCC"] = data["SCC"].apply(lambda x: 1 if x =="yes" else 0)
data["SMOKE"] = data["SMOKE"].apply(lambda x: 1 if x =="yes" else 0)
data["family_history_with_overweight"] = data["family_history_with_overweight"].apply(lambda x: 1 if x =="yes" else 0)
data["CAEC"]= data["CAEC"].map({'no': 0,'Sometimes': 1,'Frequently': 2,'Always': 3})
data = pd.get_dummies(data, columns=["MTRANS"], dtype=int)
print(data.dtypes)
print(data["NObeyesdad"].unique())
#desordenar
data = data.sample(frac=1).reset_index(drop=True)

tamaño del dataset:  (2111, 17)
Age                               float64
Gender                              int64
Height                            float64
Weight                            float64
CALC                                int64
FAVC                                int64
FCVC                              float64
NCP                               float64
SCC                                 int64
SMOKE                               int64
CH2O                              float64
family_history_with_overweight      int64
FAF                               float64
TUE                               float64
CAEC                                int64
NObeyesdad                         object
MTRANS_Automobile                   int64
MTRANS_Bike                         int64
MTRANS_Motorbike                    int64
MTRANS_Public_Transportation        int64
MTRANS_Walking                      int64
dtype: object
['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Ty

In [93]:
y = data["NObeyesdad"]
X = data.drop(columns=["NObeyesdad"])
print(y.head())

0     Overweight_Level_I
1        Obesity_Type_II
2     Overweight_Level_I
3    Insufficient_Weight
4       Obesity_Type_III
Name: NObeyesdad, dtype: object


# Testeo de los clasificadores

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

In [95]:
# Diccionario de modelos
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

# Función para entrenar y evaluar modelos
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{name} Accuracy: {accuracy:.4f}')
        print(f'Classification Report for {name}:\n{classification_report(y_test, y_pred)}')
        print(f'Confusion Matrix for {name}:\n{confusion_matrix(y_test, y_pred)}\n')
        print(f'---------------------------------------------------\n')

# Llamada a la función
train_and_evaluate(models, X_train, y_train, X_test, y_test)


Logistic Regression Accuracy: 0.6420
Classification Report for Logistic Regression:
                     precision    recall  f1-score   support

Insufficient_Weight       0.83      0.72      0.77        68
      Normal_Weight       0.53      0.58      0.55        72
     Obesity_Type_I       0.42      0.38      0.40        88
    Obesity_Type_II       0.85      0.91      0.88        74
   Obesity_Type_III       0.79      0.99      0.88        81
 Overweight_Level_I       0.60      0.62      0.61        73
Overweight_Level_II       0.42      0.32      0.36        72

           accuracy                           0.64       528
          macro avg       0.63      0.64      0.63       528
       weighted avg       0.63      0.64      0.63       528

Confusion Matrix for Logistic Regression:
[[49 19  0  0  0  0  0]
 [ 9 42  4  0  2 11  4]
 [ 0  1 33 11 18  6 19]
 [ 0  1  6 67  0  0  0]
 [ 0  0  0  1 80  0  0]
 [ 1 11  6  0  1 45  9]
 [ 0  6 30  0  0 13 23]]

------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Accuracy: 0.9451
Classification Report for Random Forest:
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.91      0.95        68
      Normal_Weight       0.77      0.96      0.85        72
     Obesity_Type_I       0.98      0.95      0.97        88
    Obesity_Type_II       0.99      0.99      0.99        74
   Obesity_Type_III       1.00      0.99      0.99        81
 Overweight_Level_I       0.95      0.86      0.91        73
Overweight_Level_II       0.97      0.94      0.96        72

           accuracy                           0.95       528
          macro avg       0.95      0.94      0.95       528
       weighted avg       0.95      0.95      0.95       528

Confusion Matrix for Random Forest:
[[62  6  0  0  0  0  0]
 [ 0 69  0  0  0  2  1]
 [ 0  4 84  0  0  0  0]
 [ 0  1  0 73  0  0  0]
 [ 0  0  0  1 80  0  0]
 [ 0  9  0  0  0 63  1]
 [ 0  1  2  0  0  1 68]]

------------------------------------------------

# Testeo del clasificador de DS

In [96]:
Y = y.map({'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Overweight_Level_I': 2, 'Overweight_Level_II': 3, 'Obesity_Type_I': 4, 'Obesity_Type_II': 5, 'Obesity_Type_III': 6})
X = X.apply(pd.to_numeric)

cut = int(0.25 * len(data))

X_train = X.iloc[:-cut].values
X_test = X.iloc[-cut:].values
Y_train = Y.iloc[:-cut].values
Y_test = Y.iloc[-cut:].values


print(len(X_train), len(X_test), len(Y_train), len(Y_test))
print(Y.head())


1584 527 1584 527
0    2
1    5
2    2
3    0
4    6
Name: NObeyesdad, dtype: int64


In [97]:
DSC = DSClassifierMultiQ(7, min_iter=50, max_iter=400, debug_mode=True, \
                         lossfn="MSE", num_workers=0, min_dloss=1e-7)

In [98]:
losses, epoch, dt = DSC.fit(X_train, Y_train, add_single_rules=True,
                            single_rules_breaks=3, add_mult_rules=False,
                                column_names=data.columns[:-1], print_every_epochs=31, print_final_model=True)

Optimization started
Processing epoch	94	0.0483	
Training time: 60.13s, epochs: 113

Least training loss reached: 0.040
DSModelMultiQ(
  DS Classifier using 60 rules
  
  Rule 1: Age < 20.038
  	C1: 0.339	C2: 0.167	C3: 0.129	C4: 0.044	C5: 0.208	C6: 0.000	C7: 0.107	Unc: 0.006
  
  Rule 2: 20.038 < Age < 24.388
  	C1: 0.062	C2: 0.143	C3: 0.328	C4: 0.000	C5: 0.287	C6: 0.000	C7: 0.021	Unc: 0.160
  
  Rule 3: 24.388 < Age < 28.737
  	C1: 0.000	C2: 0.122	C3: 0.015	C4: 0.000	C5: 0.000	C6: 0.480	C7: 0.263	Unc: 0.120
  
  Rule 4: Age > 28.737
  	C1: 0.000	C2: 0.042	C3: 0.153	C4: 0.347	C5: 0.098	C6: 0.323	C7: 0.000	Unc: 0.037
  
  Rule 5: Gender = 0.0
  	C1: 0.205	C2: 0.035	C3: 0.128	C4: 0.010	C5: 0.143	C6: 0.000	C7: 0.320	Unc: 0.160
  
  Rule 6: Gender = 1.0
  	C1: 0.138	C2: 0.224	C3: 0.044	C4: 0.118	C5: 0.000	C6: 0.377	C7: 0.000	Unc: 0.099
  
  Rule 7: Height < 1.641
  	C1: 0.000	C2: 0.209	C3: 0.101	C4: 0.032	C5: 0.361	C6: 0.089	C7: 0.152	Unc: 0.056
  
  Rule 8: 1.641 < Height < 1.703
  	C1: 0

In [99]:
Y_pred = DSC.predict(X_test)

print(f"Accuracy: {accuracy_score(Y_test, Y_pred)}")
print(f'Classification Report:\n{classification_report(Y_test, Y_pred)}')
print(f'Confusion Matrix:\n{confusion_matrix(Y_test, Y_pred)}\n')

Accuracy: 0.7722960151802657
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.85        77
           1       0.65      0.71      0.68        66
           2       0.68      0.61      0.64        71
           3       0.73      0.56      0.64        73
           4       0.68      0.78      0.72        86
           5       0.83      0.86      0.84        69
           6       0.97      0.98      0.97        85

    accuracy                           0.77       527
   macro avg       0.77      0.77      0.76       527
weighted avg       0.77      0.77      0.77       527

Confusion Matrix:
[[67  9  1  0  0  0  0]
 [ 9 47  7  2  1  0  0]
 [ 3  8 43  7  9  0  1]
 [ 0  7 11 41 13  1  0]
 [ 1  1  1  6 67  9  1]
 [ 0  0  0  0  9 59  1]
 [ 0  0  0  0  0  2 83]]

