In [3]:
import mlflow
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import mlflow.sklearn
from urllib.parse import urlparse

In [4]:
df = pd.read_csv('C:/Users/franc/OneDrive/Documentos/ISTEA/1er cuatrimestre 2025 - 4to de carrera/Laboratorio de mineria de datos/1erparcial/train.csv')

In [5]:
total_faltantes = df.isna().sum().sum()
print("Cantidad total de valores faltantes:", total_faltantes)

Cantidad total de valores faltantes: 0


In [None]:
proporcion_A = (df["label"] == "A").mean()
resultado = (round(proporcion_A, 2))
print("Proporcion de la clase A:", resultado)

Proporción de la clase A: 0.33


In [7]:
print(df)

     feature_1  feature_2  feature_3  feature_4 label
0     1.041272  -0.897334   0.744985   0.920314     C
1    -1.012247  -0.741571  -1.424422  -0.818969     A
2     2.783979  -1.352007  -0.546231   1.903337     C
3    -1.817238  -1.433484   0.316078   0.079265     A
4     3.103991  -0.715809  -1.612968   1.791170     C
..         ...        ...        ...        ...   ...
295  -1.580501  -2.499941   0.075922   0.064212     B
296  -1.299354  -0.470641  -2.356597  -0.862707     A
297   1.088015  -0.407751   1.223643  -1.212280     C
298   1.210784  -1.186221  -1.973701   1.253640     C
299  -1.042827  -0.994664  -0.537058  -2.428352     A

[300 rows x 5 columns]


In [8]:
def eval_metrics(actual, pred):
    acc = accuracy_score(actual, pred)
    f1 = f1_score(actual, pred, average = 'weighted')
    return acc, f1

In [9]:
train, test = train_test_split(df, test_size=0.25)

In [10]:
x_train = train.drop("label", axis = 1)
y_train = train["label"]

In [11]:
x_test = test.drop("label", axis = 1)
y_test = test["label"]

In [12]:
max_depth = 3
min_samples_split = 4
random_state = 42

In [13]:
with mlflow.start_run():
    dtc = DecisionTreeClassifier(max_depth = max_depth, min_samples_split = min_samples_split, random_state = random_state)
    dtc.fit(x_train, y_train) #Entrenamos el modelo con los datos de entrenamiento
    predicted_label = dtc.predict(x_test) #Predecimos el label con los datos de test
    (acc, f1) = eval_metrics(y_test, predicted_label) #Evaluamos el modelo con las metricas
    
    #Ahora registramos esta informacion 
    mlflow.log_param("max_depth", max_depth) #Registramos el hiperparametro max_depth
    mlflow.log_param("min_samples_split", min_samples_split) #Registramos el hiperparametro min_samples_split
    mlflow.log_metric("acc", acc) #Registramos la metrica de accuracy
    mlflow.log_metric("f1", f1) #Registramos la metrica f1
    
    print("Decision Tree Classifier (max_depth = %f, min_samples_split = %f):" % (max_depth, min_samples_split))
    print("acc:", acc)
    print("f1:", f1)
    
    tracking_url = urlparse(mlflow.get_tracking_uri()).scheme
    
    if tracking_url != "file":
        mlflow.sklearn.log_model(dtc, "model", registered_model_name = "model_1.pkl")
    else:
        mlflow.sklearn.log_model(dtc, "model")

Decision Tree Classifier (max_depth = 3.000000, min_samples_split = 4.000000):
acc: 0.8133333333333334
f1: 0.8166770436988354




A partir de ahora voy a hacer el experimento con los otros hiperparametros solicitados

In [14]:
max_depth2 = 5
min_samples_split2 = 2
random_state2 = 42

In [19]:
with mlflow.start_run():
    dtc2 = DecisionTreeClassifier(max_depth = max_depth2, min_samples_split = min_samples_split2, random_state = random_state2)
    dtc2.fit(x_train, y_train) #Entrenamos el modelo con los datos de entrenamiento
    predicted_label = dtc2.predict(x_test) #Predecimos el label con los datos de test
    (acc, f1) = eval_metrics(y_test, predicted_label) #Evaluamos el modelo con las metricas
    
    #Ahora registramos esta informacion 
    mlflow.log_param("max_depth", max_depth2) #Registramos el hiperparametro max_depth
    mlflow.log_param("min_samples_split", min_samples_split2) #Registramos el hiperparametro min_samples_split
    mlflow.log_metric("acc", acc) #Registramos la metrica de accuracy
    mlflow.log_metric("f1", f1) #Registramos la metrica f1
    
    print("Decision Tree Classifier (max_depth = %f, min_samples_split = %f):" % (max_depth2, min_samples_split2))
    print("acc2:", acc)
    print("f12:", f1)
    
    tracking_url = urlparse(mlflow.get_tracking_uri()).scheme
    
    if tracking_url != "file":
        mlflow.sklearn.log_model(dtc2, "model2", registered_model_name = "model_2.pkl")
    else:
        mlflow.sklearn.log_model(dtc2, "model2")

Decision Tree Classifier (max_depth = 5.000000, min_samples_split = 2.000000):
acc2: 0.8666666666666667
f12: 0.8662309368191721


