In [99]:


import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier



In [100]:
# Cargar los datos
data_path = r'C:\Users\Usuario\OneDrive\Documents\Portfolio\Portfolio\clasic_models\data\german.data'
df = pd.read_csv(data_path, header=None, delim_whitespace=True)

column_names = [
    "Status of Existing Checking Account", "Duration in Months", "Credit History",
    "Purpose", "Credit Amount", "Savings Account/Bonds", "Present Employment Since",
    "Installment Rate in Percentage of Disposable Income", "Personal Status and Sex",
    "Other Debtors/Guarantors", "Present Residence Since", "Property", "Age in Years",
    "Other Installment Plans", "Housing", "Number of Existing Credits at This Bank",
    "Job", "Number of People Liable to Provide Maintenance For", "Telephone",
    "Foreign Worker","Target"
]

# Assign column names to the DataFrame
df.columns = column_names

# Exploración básica
print(df.head())
print(df.describe())


  Status of Existing Checking Account  Duration in Months Credit History  \
0                                 A11                   6            A34   
1                                 A12                  48            A32   
2                                 A14                  12            A34   
3                                 A11                  42            A32   
4                                 A11                  24            A33   

  Purpose  Credit Amount Savings Account/Bonds Present Employment Since  \
0     A43           1169                   A65                      A75   
1     A43           5951                   A61                      A73   
2     A46           2096                   A61                      A74   
3     A42           7882                   A61                      A74   
4     A40           4870                   A61                      A73   

   Installment Rate in Percentage of Disposable Income  \
0                                 

In [101]:
# Encoding categorical columns

categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in categorical_columns:
    num_unique_values = df[col].nunique()
    if num_unique_values == 2:
        # Boolean numerical encoding
        df[col] = df[col].astype('category').cat.codes
    elif num_unique_values > 2:
        # Ordinal enco
        df[col] = label_encoder.fit_transform(df[col])

df.dtypes

Status of Existing Checking Account                    int32
Duration in Months                                     int64
Credit History                                         int32
Purpose                                                int32
Credit Amount                                          int64
Savings Account/Bonds                                  int32
Present Employment Since                               int32
Installment Rate in Percentage of Disposable Income    int64
Personal Status and Sex                                int32
Other Debtors/Guarantors                               int32
Present Residence Since                                int64
Property                                               int32
Age in Years                                           int64
Other Installment Plans                                int32
Housing                                                int32
Number of Existing Credits at This Bank                int64
Job                     

In [102]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Preprocesamiento (suponiendo un conjunto de datos básico)
X = df.drop('Target', axis=1)  # Reemplaza 'target' con el nombre de la columna objetivo
y = df['Target']

sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X, y = sm.fit_resample(X, y)

# Dividir el dataset en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Escalado de características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Definir los modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "Neural Network": MLPClassifier(max_iter=1000)
}


In [103]:
import mlflow


# Configura el URI de tracking para conectarte al servidor MLflow
mlflow.set_tracking_uri("http://localhost:5000")

# Establecer el experimento (puedes usar el "Default" o crear uno nuevo)
mlflow.set_experiment("Base")  # O el nombre del experimento que desees usar



<Experiment: artifact_location='file:///C:/Users/Usuario/mlruns/2', creation_time=1723497374173, experiment_id='2', last_update_time=1723497374173, lifecycle_stage='active', name='Base', tags={}>

In [104]:
import warnings
import mlflow.sklearn
# Ignorar solo las advertencias específicas de distutils y setuptools
warnings.filterwarnings("ignore", message="Setuptools is replacing distutils.")
warnings.filterwarnings("ignore", message="Distutils was imported before Setuptools")

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Entrenar el modelo
        model.fit(X_train, y_train)
        
        # Predicciones
        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]
        
        # Calcular métricas
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_probs)
        loss = log_loss(y_test, y_probs)
        
        # Loguear parámetros y métricas en MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("auc_roc", auc)
        mlflow.log_metric("log_loss", loss)
        
        
        
        # Guardar el modelo
        input_example = X_test[0:1]
        mlflow.sklearn.log_model(model, "model",input_example=input_example)


2024/08/12 15:21:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/2/runs/4ae90e36fd2349108e1a17402a44b435.
2024/08/12 15:21:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/12 15:21:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: http://localhost:5000/#/experiments/2/runs/e653bf0e9aff4f9b940bfeef3ca35f71.
2024/08/12 15:21:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/12 15:21:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/2/runs/336f6cd738a640fe8cb05129d7cd019a.
2024/08/12 15:21:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/12 15:21:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVM at: http:

In [105]:
# Configura el URI de tracking para conectarte al servidor MLflow
mlflow.set_tracking_uri("http://localhost:5000")

# Establecer el experimento
mlflow.set_experiment("GridSearch_Portfolio")

<Experiment: artifact_location='file:///C:/Users/Usuario/mlruns/4', creation_time=1723497384990, experiment_id='4', last_update_time=1723497384990, lifecycle_stage='active', name='GridSearch_Portfolio', tags={}>

In [106]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV




# Definir los modelos y sus parámetros para Grid Search
model_param_grids = {
    "Logistic Regression": (LogisticRegression(), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    }),
    "Decision Tree": (DecisionTreeClassifier(), {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    "Random Forest": (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    "SVM": (SVC(probability=True), {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    }),
    "Neural Network": (MLPClassifier(max_iter=300), {
        'hidden_layer_sizes': [(50,50,50), (100,), (100,100,100)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive'],
    })
}

# Iterar sobre cada modelo y sus hiperparámetros
for model_name, (model, param_grid) in model_param_grids.items():
    with mlflow.start_run(run_name=model_name):
        # Configurar GridSearchCV
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='precision', n_jobs=-1)
        
        # Entrenar el modelo
        grid_search.fit(X_train, y_train)
        
        # Obtener el mejor modelo encontrado por GridSearchCV
        best_model = grid_search.best_estimator_
        
        # Realizar predicciones
        y_pred = best_model.predict(X_test)
        y_probs = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
        
        # Calcular métricas
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # Registrar parámetros y métricas en MLflow
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
        # Registrar el mejor modelo en MLflow
        mlflow.sklearn.log_model(best_model, "best_model")
        
        print(f"Mejor modelo para {model_name} registrado con accuracy: {acc}, precision: {precision}, recall: {recall}, F1 Score: {f1}")


2024/08/12 15:22:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/4/runs/e15bc14f0b33474e8e6173188b759a9d.
2024/08/12 15:22:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/4.


Mejor modelo para Logistic Regression registrado con accuracy: 0.8142857142857143, precision: 0.821043964488813, recall: 0.8142857142857143, F1 Score: 0.8142857142857143


2024/08/12 15:22:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: http://localhost:5000/#/experiments/4/runs/bd60fd21cc904df0bc241496cedb3205.
2024/08/12 15:22:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/4.


Mejor modelo para Decision Tree registrado con accuracy: 0.7464285714285714, precision: 0.7465580028080029, recall: 0.7464285714285714, F1 Score: 0.7464837584609447


2024/08/12 15:22:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/4/runs/ba2b0a1c4d3b4055b27dc5563cecc877.
2024/08/12 15:22:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/4.


Mejor modelo para Random Forest registrado con accuracy: 0.8357142857142857, precision: 0.8401331873745668, recall: 0.8357142857142857, F1 Score: 0.8358316566063044


2024/08/12 15:22:18 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVM at: http://localhost:5000/#/experiments/4/runs/07e09c6e6e8f4f92971999bee193cb25.
2024/08/12 15:22:18 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/4.


Mejor modelo para SVM registrado con accuracy: 0.85, precision: 0.8556876886092782, recall: 0.85, F1 Score: 0.8500612276136538


2024/08/12 15:22:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run Neural Network at: http://localhost:5000/#/experiments/4/runs/5f71f91c29bf40ff8c25eab4657556a0.
2024/08/12 15:22:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/4.


Mejor modelo para Neural Network registrado con accuracy: 0.8357142857142857, precision: 0.8367840189008072, recall: 0.8357142857142857, F1 Score: 0.8358654379764582
