# Práctica Final: Clasificación con Scikit-learn y MLflow

In [6]:
#  Exploración de Datos: Analiza el conjunto de datos proporcionado para comprender su estructura y contenido.
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report

In [9]:
from sklearn.datasets import load_wine

# Load the wine dataset that ships with scikit-learn
wine = load_wine()

# Build a DataFrame from the feature matrix and append the labels as a
# trailing "target" column
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

# Show how many samples each class has
print("\nDistribución de clases:\n", df["target"].value_counts())



Distribución de clases:
 target
1    71
0    59
2    48
Name: count, dtype: int64


In [10]:
# Preview the first five rows (features plus the target column)
df.head(n=5)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [22]:
# Train/test split: hold out 20% of the rows, stratified on the label and
# seeded so the partition is reproducible
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=["target"]),
    df["target"],
    test_size=0.2,
    random_state=42,
    stratify=df["target"],
)

In [31]:
# Preprocessing stage: a sub-pipeline holding only the standard scaler
preprocessor = Pipeline(steps=[("scaler", StandardScaler())])

# Random Forest classifier with a fixed seed
clf = RandomForestClassifier(
    n_estimators=4,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
)

# Full model: preprocessing followed by the classifier
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("RandomForestClassifier", clf),
    ]
)

In [32]:
# Fit on the training split and predict the held-out rows in one chain
# (Pipeline.fit returns the fitted pipeline itself)
y_pred = model.fit(x_train, y_train).predict(x_test)

# Report overall accuracy and the per-class precision/recall/F1 breakdown
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nReporte de Clasificación:\n", classification_report(y_test, y_pred))


Accuracy: 1.0

Reporte de Clasificación:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [35]:
# Mean accuracy of the fitted pipeline on each partition
accuracy_train, accuracy_test = (
    model.score(x_train, y_train),
    model.score(x_test, y_test),
)
# One value per line: train first, then test
print(accuracy_train, accuracy_test, sep="\n")

0.971830985915493
1.0


# Evaluación del Modelo y Obtención de Parámetros

In [40]:
import mlflow
# Record the fitted pipeline and its scores in MLflow.
mlflow.set_experiment('Wine Classifier with Acuracy BD14')
with mlflow.start_run(run_name='First training with random state 42'):
    # Read the seed from the fitted pipeline instead of typing it by hand, so
    # the logged parameter cannot drift from the value the model really used.
    mlflow.log_param(
        'random_state',
        model.named_steps['RandomForestClassifier'].random_state,
    )
    mlflow.log_metric('accuracy_train', accuracy_train)
    # Log the measured test accuracy (previously a hard-coded '0.1' string).
    mlflow.log_metric('accuracy_test', accuracy_test)
    mlflow.sklearn.log_model(model, 'clf')

2025/02/21 12:10:52 INFO mlflow.tracking.fluent: Experiment with name 'Wine Classifier with Acuracy BD14' does not exist. Creating a new experiment.


In [41]:
import pandas as pd
import numpy as np
import mlflow
import time
import mlflow.sklearn
import subprocess
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_wine

def load_dataset():
  """Return the scikit-learn wine dataset as a DataFrame.

  The frame has one column per feature (named after the dataset's feature
  names) followed by a 'target' column holding the class labels.
  """
  bunch = load_wine()
  features = pd.DataFrame(bunch['data'], columns=bunch['feature_names'])
  return features.assign(target=bunch['target'])

def data_treatment(df):
  """Split the dataset into train/test partitions and export the test partition.

  The split holds out 20% of the rows, is stratified on the 'target' column
  and uses a fixed seed, so it is reproducible across runs.

  Side effects: writes two CSV files to the current working directory,
  'test.csv' (test features) and 'test-target.csv' (test labels). Both come
  from the same held-out rows that are returned, so the exported files never
  contain rows that end up in the training partition.

  Args:
    df: DataFrame with the feature columns plus a 'target' label column.

  Returns:
    Tuple (x_train, x_test, y_train, y_test).
  """
  x_train, x_test, y_train, y_test = train_test_split(df.drop(columns="target"), df["target"],
                                                      test_size=0.2,
                                                      random_state=42,
                                                      stratify=df["target"])

  # Export exactly the held-out partition. A separate, unseeded split here
  # would produce different files on every run and could leak test rows into
  # the training data.
  y_test.to_frame(name='target').to_csv('test-target.csv', index=False)
  x_test.to_csv('test.csv', index=False)

  return x_train, x_test, y_train, y_test

def mlflow_tracking(x_train, x_test, y_train, y_test):
  """Train one Random Forest pipeline per forest size and log each to MLflow.

  For every value in the forest-size grid a scaler + RandomForestClassifier
  pipeline is fitted on the training data inside its own MLflow run, under
  the 'Pruebas desde Script' experiment. Each run records the number of
  estimators, the train and test accuracy, and the fitted pipeline.

  Args:
    x_train: Training features.
    x_test: Test features.
    y_train: Training labels.
    y_test: Test labels.
  """
  n_estimators_grid = [2,10,20,30,50,80,100,130]

  mlflow.set_experiment('Pruebas desde Script')
  for n_estimators in n_estimators_grid:
    with mlflow.start_run():
      clf = RandomForestClassifier(n_estimators=n_estimators,
                                  min_samples_leaf=2,
                                  class_weight='balanced',
                                  random_state=123)

      preprocessor = Pipeline(steps=[('scaler', StandardScaler())])

      model = Pipeline(steps=[('preprocessor', preprocessor),
                                ('RandomForestClassifier', clf)])
      model.fit(x_train, y_train)
      accuracy_train = model.score(x_train, y_train)
      # Keep the test score: it was previously computed and thrown away, so
      # runs could only be compared on training accuracy.
      accuracy_test = model.score(x_test, y_test)

      mlflow.log_param('n_estimators', n_estimators)
      mlflow.log_metric('accuracy_train', accuracy_train)
      mlflow.log_metric('accuracy_test', accuracy_test)
      mlflow.sklearn.log_model(model, 'clf-modellll')
  print("Se ha acabado de entrenar el modelo correctamente! \n")

In [39]:
def main():
    """Run the whole flow: load the data, split it, then train and track models."""
    print("Ejecutemos el main.")
    # data_treatment returns (x_train, x_test, y_train, y_test), which is the
    # positional order mlflow_tracking expects, so the tuple is unpacked as-is.
    splits = data_treatment(load_dataset())
    mlflow_tracking(*splits)

In [45]:
# Call the entry point. A bare `main` only displays the function object and
# never runs the pipeline.
main()