In [46]:


import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier



In [47]:
data_path = r'C:\Users\Usuario\OneDrive\Documents\Portfolio\Portfolio\clasic_models\data\german.data'
df = pd.read_csv(data_path, header=None, delim_whitespace=True)

column_names = [
    "Status of Existing Checking Account", "Duration in Months", "Credit History",
    "Purpose", "Credit Amount", "Savings Account/Bonds", "Present Employment Since",
    "Installment Rate in Percentage of Disposable Income", "Personal Status and Sex",
    "Other Debtors/Guarantors", "Present Residence Since", "Property", "Age in Years",
    "Other Installment Plans", "Housing", "Number of Existing Credits at This Bank",
    "Job", "Number of People Liable to Provide Maintenance For", "Telephone",
    "Foreign Worker","Target"
]

# Assign column names to the DataFrame
df.columns = column_names

# Displaying the first few rows of the dataset to ensure correct loading
df.head()

Unnamed: 0,Status of Existing Checking Account,Duration in Months,Credit History,Purpose,Credit Amount,Savings Account/Bonds,Present Employment Since,Installment Rate in Percentage of Disposable Income,Personal Status and Sex,Other Debtors/Guarantors,...,Property,Age in Years,Other Installment Plans,Housing,Number of Existing Credits at This Bank,Job,Number of People Liable to Provide Maintenance For,Telephone,Foreign Worker,Target
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [48]:

# Encoding categorical columns

categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in categorical_columns:
    num_unique_values = df[col].nunique()
    if num_unique_values == 2:
        # Boolean numerical encoding
        df[col] = df[col].astype('category').cat.codes
    elif num_unique_values > 2:
        # Ordinal enco
        df[col] = label_encoder.fit_transform(df[col])

df.dtypes


Status of Existing Checking Account                    int32
Duration in Months                                     int64
Credit History                                         int32
Purpose                                                int32
Credit Amount                                          int64
Savings Account/Bonds                                  int32
Present Employment Since                               int32
Installment Rate in Percentage of Disposable Income    int64
Personal Status and Sex                                int32
Other Debtors/Guarantors                               int32
Present Residence Since                                int64
Property                                               int32
Age in Years                                           int64
Other Installment Plans                                int32
Housing                                                int32
Number of Existing Credits at This Bank                int64
Job                     

In [49]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Preprocesamiento (suponiendo un conjunto de datos básico)
X = df.drop('Target', axis=1)  # Reemplaza 'target' con el nombre de la columna objetivo
y = df['Target']

sm = SMOTE(sampling_strategy=1, k_neighbors=5, random_state=1)
X, y = sm.fit_resample(X, y)

# Dividir el dataset en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Escalado de características
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



# Definir los modelos
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "Neural Network": MLPClassifier(max_iter=1000)
}


In [50]:
import mlflow


# Configura el URI de tracking para conectarte al servidor MLflow
mlflow.set_tracking_uri("http://localhost:5000")

# Establecer el experimento (puedes usar el "Default" o crear uno nuevo)
mlflow.set_experiment("Base")  # O el nombre del experimento que desees usar



<Experiment: artifact_location='file:///C:/Users/Usuario/AppData/Local/Programs/Python/Python311/Scripts/1', creation_time=1723566247708, experiment_id='1', last_update_time=1723566247708, lifecycle_stage='active', name='Base', tags={}>

In [51]:
import warnings
import mlflow.sklearn
import pandas as pd

# Ignorar solo las advertencias específicas de distutils y setuptools
warnings.filterwarnings("ignore", message="Setuptools is replacing distutils.")
warnings.filterwarnings("ignore", message="Distutils was imported before Setuptools")

# Crear una lista para almacenar los resultados
results = []

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Entrenar el modelo
        model.fit(X_train, y_train)
        
        # Predicciones
        y_pred = model.predict(X_test)
        y_probs = model.predict_proba(X_test)[:, 1]
        
        # Calcular métricas
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_probs)
        loss = log_loss(y_test, y_probs)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        # Guardar los resultados en la lista
        results.append({
            "Model": model_name,
            "Accuracy": acc,
            "AUC ROC": auc,
            "Log Loss": loss,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        })
        
        # Loguear parámetros y métricas en MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("auc_roc", auc)
        mlflow.log_metric("log_loss", loss)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
        # Guardar el modelo
        input_example = X_test[0:1]
        mlflow.sklearn.log_model(model, "model", input_example=input_example)

# Crear un DataFrame con los resultados
results_df = pd.DataFrame(results)



# Seleccionar el mejor modelo basado en una métrica (por ejemplo, F1 Score)
best_model_row = results_df.loc[results_df["F1 Score"].idxmax()]
best_model_name = best_model_row["Model"]
best_acc = best_model_row["Accuracy"]
best_precision = best_model_row["Precision"]
best_recall = best_model_row["Recall"]
best_f1 = best_model_row["F1 Score"]

print(f"Mejor modelo: {best_model_name} registrado con Accuracy: {best_acc}, Precision: {best_precision}, Recall: {best_recall}, F1 Score: {best_f1}")


2024/08/13 12:29:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/1/runs/ec9ec0c69e574f708fc1bf33dcfb8e32.
2024/08/13 12:29:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2024/08/13 12:29:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: http://localhost:5000/#/experiments/1/runs/df5b106929e046139e8cbea26cd163c5.
2024/08/13 12:29:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2024/08/13 12:29:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/1/runs/597498b535b04637b8b5521268b095c4.
2024/08/13 12:29:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/1.
2024/08/13 12:29:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVM at: http:

Mejor modelo: SVM registrado con Accuracy: 0.85, Precision: 0.8524746160518394, Recall: 0.85, F1 Score: 0.8501531862745098


In [52]:
# Configura el URI de tracking para conectarte al servidor MLflow
mlflow.set_tracking_uri("http://localhost:5000")

# Establecer el experimento
mlflow.set_experiment("GridSearch_Portfolio")

<Experiment: artifact_location='file:///C:/Users/Usuario/AppData/Local/Programs/Python/Python311/Scripts/2', creation_time=1723566259823, experiment_id='2', last_update_time=1723566259823, lifecycle_stage='active', name='GridSearch_Portfolio', tags={}>

In [53]:

from sklearn.model_selection import GridSearchCV

results_grid = []


# Definir los modelos y sus parámetros para Grid Search
model_param_grids = {
    "Logistic Regression Grid": (LogisticRegression(), {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear']
    }),
    "Decision Tree Grid": (DecisionTreeClassifier(), {
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    "Random Forest Grid": (RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }),
    "SVM Grid": (SVC(probability=True), {
        'C': [0.1, 1, 10, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['rbf']
    }),
    "Neural Network Grid": (MLPClassifier(max_iter=1000), {
        'hidden_layer_sizes': [(50,50,50), (100,100,100,), (150,150,150)],
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        'alpha': [0.0001, 0.05],
        'learning_rate': ['constant','adaptive'],
    })
}

# Iterar sobre cada modelo y sus hiperparámetros
for model_name, (model, param_grid) in model_param_grids.items():
    with mlflow.start_run(run_name=model_name):
        # Configurar GridSearchCV
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='precision', n_jobs=-1)
        
        # Entrenar el modelo
        grid_search.fit(X_train, y_train)
        
        # Obtener el mejor modelo encontrado por GridSearchCV
        best_model = grid_search.best_estimator_
        
        # Realizar predicciones
        y_pred = best_model.predict(X_test)
        y_probs = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
        
        # Calcular métricas
        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_probs)
        loss = log_loss(y_test, y_probs)
        acc = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        
                # Guardar los resultados en la lista
        results_grid.append({
            "Model": model_name,
            "Accuracy": acc,
            "AUC ROC": auc,
            "Log Loss": loss,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        })
        
        # Registrar parámetros y métricas en MLflow
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("auc_roc", auc)
        mlflow.log_metric("log_loss", loss)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
   


results_grid = pd.DataFrame(results_grid)

# Seleccionar el mejor modelo basado en una métrica (por ejemplo, F1 Score)
best_model_row = results_grid.loc[results_grid["F1 Score"].idxmax()]
best_model_name = best_model_row["Model"]
best_acc = best_model_row["Accuracy"]
best_precision = best_model_row["Precision"]
best_recall = best_model_row["Recall"]
best_f1 = best_model_row["F1 Score"]
        
print(f"Mejor modelo: {best_model_name} registrado con Accuracy: {best_acc}, Precision: {best_precision}, Recall: {best_recall}, F1 Score: {best_f1}")


2024/08/13 12:29:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression Grid at: http://localhost:5000/#/experiments/2/runs/7b2282a9e12040729e2f53e27e80ff56.
2024/08/13 12:29:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/13 12:29:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree Grid at: http://localhost:5000/#/experiments/2/runs/0f218750cc804783b13400e49fda27f1.
2024/08/13 12:29:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/13 12:30:05 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest Grid at: http://localhost:5000/#/experiments/2/runs/9a21480eb693477badd23090a91faee4.
2024/08/13 12:30:05 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/2.
2024/08/13 12:30:07 INFO mlflow.tracking._tracking_service.client: 🏃 View ru

Mejor modelo: SVM Grid registrado con Accuracy: 0.85, Precision: 0.8556876886092782, Recall: 0.85, F1 Score: 0.8500612276136538


In [54]:
models_master = pd.concat([results_df,results_grid])
models_master.sort_values(by=["F1 Score"],ascending=False)

Unnamed: 0,Model,Accuracy,AUC ROC,Log Loss,Precision,Recall,F1 Score
3,SVM,0.85,0.910856,0.387212,0.852475,0.85,0.850153
3,SVM Grid,0.85,0.897997,0.406287,0.855688,0.85,0.850061
2,Random Forest Grid,0.839286,0.922819,0.392574,0.840651,0.839286,0.839444
2,Random Forest,0.828571,0.917388,0.395397,0.830274,0.828571,0.828747
4,Neural Network,0.817857,0.876428,1.012577,0.819923,0.817857,0.818046
0,Logistic Regression Grid,0.814286,0.884472,0.456048,0.821044,0.814286,0.814286
0,Logistic Regression,0.803571,0.884369,0.431577,0.808418,0.803571,0.803684
4,Neural Network Grid,0.796429,0.863005,1.384534,0.796846,0.796429,0.796546
1,Decision Tree,0.75,0.749424,9.010913,0.750281,0.75,0.750102
1,Decision Tree Grid,0.739286,0.778933,5.374331,0.74019,0.739286,0.739502


In [55]:


# Cargar el modelo registrado por su Run ID y nombre
model_uri = "runs:/daa2a6f82a974d09b08f17ba7eea27d1/model"

loaded_model = mlflow.sklearn.load_model(model_uri)

# Usar el modelo para hacer predicciones
predictions = loaded_model.predict(X_test)


In [56]:
import requests
import json

# La URL donde el servidor está corriendo
url = 'http://127.0.0.1:5001/invocations'

# Encabezados para la solicitud
headers = {"Content-Type": "application/json"}

# Ajustar el formato de los datos de entrada
data = {
    "instances": X_test[0:5].tolist()  # Conviertes los primeros 5 registros a una lista
}

# Envías la solicitud POST
response = requests.post(url, headers=headers, data=json.dumps(data))

# Imprimes las predicciones
print("Predicciones:", response.json())


Predicciones: {'predictions': [2, 2, 1, 1, 1]}
