In [118]:
import pickle
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
import mlflow 

In [101]:
# Read the training and test CSV files from the processed-data folder.
data_train, data_test = map(
    pd.read_csv,
    (
        '../data/processed/features_for_model.csv',
        '../data/processed/test_dataset.csv',
    ),
)

In [102]:
# Separate the 'Default' label column from the predictors in both splits.
y_target = data_train['Default']
x_features = data_train.drop(columns=['Default'])

y_target_test = data_test['Default']
x_features_test = data_test.drop(columns=['Default'])

In [103]:
# Restore the previously saved pipeline object from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file,
# so only load artifacts that come from a trusted source.
with open('../Artifacts/pipeline.pkl', mode='rb') as pipeline_file:
    credit_default_model_pipeline = pickle.load(pipeline_file)

In [104]:
# Run the test predictors through the loaded pipeline, then wrap the result
# in a DataFrame that reuses the original column names.
x_features_test_arr = credit_default_model_pipeline.transform(x_features_test)
df_features_test = pd.DataFrame(
    data=x_features_test_arr,
    columns=x_features_test.columns,
)
df_features_test

Unnamed: 0,Prct_uso_tc,Edad,Nro_prestao_retrasados,Prct_deuda_vs_ingresos,Mto_ingreso_mensual,Nro_prod_financieros_deuda,Nro_retraso_60dias,Nro_creditos_hipotecarios,Nro_retraso_ultm3anios,Nro_dependiente
0,0.310211,-0.286772,-0.099806,-0.691093,0.689499,-0.068067,-0.064097,-0.017477,-0.058353,0.216263
1,1.351092,-0.083727,-0.099806,1.811477,0.171510,0.354444,-0.064097,-0.017477,-0.058353,2.024714
2,1.900000,-1.369679,-0.099806,-0.137763,-1.517463,-0.913090,-0.064097,-0.907701,-0.058353,-0.687963
3,1.841611,-0.828225,0.826895,1.811477,0.171510,-0.279323,-0.064097,-0.017477,-0.058353,2.024714
4,-0.205455,-0.828225,-0.099806,-0.336876,0.132749,0.988211,-0.064097,0.872746,-0.058353,-0.687963
...,...,...,...,...,...,...,...,...,...,...
31495,-0.848637,1.472951,-0.099806,-0.931702,1.438730,0.565700,-0.064097,-0.017477,-0.058353,-0.687963
31496,1.156834,-1.437360,-0.099806,-0.659442,-0.630605,-0.490579,-0.064097,-0.907701,-0.058353,-0.687963
31497,0.733603,-0.286772,-0.099806,-0.146622,1.757670,1.621979,-0.064097,0.872746,-0.058353,0.216263
31498,-0.793920,-0.895907,-0.099806,-0.858712,0.371216,0.143188,-0.064097,-0.907701,-0.058353,-0.687963


In [119]:
# Point MLflow at the tracking server and select the experiment that the
# runs in this notebook are recorded under.
mlflow.set_tracking_uri(uri='http://127.0.0.1:8080')
mlflow.set_experiment(experiment_name='Credit Default Predict Model')

2024/12/19 22:01:25 INFO mlflow.tracking.fluent: Experiment with name 'Credit Default Predict Model' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/772824320481412301', creation_time=1734667285124, experiment_id='772824320481412301', last_update_time=1734667285124, lifecycle_stage='active', name='Credit Default Predict Model', tags={}>

### Modelo Naive Bayes

In [120]:
with mlflow.start_run():
    # Single source of truth for the hyperparameters, so the values logged
    # to MLflow are exactly the ones the model is built with.
    params_vals = dict(var_smoothing=0.5)

    model_nb = GaussianNB(**params_vals)
    model_nb.fit(x_features, y_target)
    y_pred_nb = model_nb.predict(df_features_test)

    acc_nb = accuracy_score(y_target_test, y_pred_nb)

    # log_params (plural) accepts a dict; log_param (singular) needs a
    # separate key and value and fails when given only a dict.
    mlflow.log_params(params_vals)

    # Log metrics
    mlflow.log_metric('accuracy_score', acc_nb)

    # Log the trained model
    mlflow.sklearn.log_model(model_nb, "Naive Bayes")


KeyboardInterrupt: 

### Modelo Random Forest

In [106]:
# Random forest: fit on the training features, then measure accuracy on the
# transformed test features.
rf_classifier = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=10,
    random_state=42,
).fit(x_features, y_target)

y_pred_rf = rf_classifier.predict(df_features_test)
acc_rf = accuracy_score(y_true=y_target_test, y_pred=y_pred_rf)


### Modelo Decision Tree

In [107]:
# Decision tree: fit on the training features, then measure accuracy on the
# transformed test features.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=5,
).fit(x_features, y_target)

y_pred_dt = clf.predict(df_features_test)
acc_dt = accuracy_score(y_true=y_target_test, y_pred=y_pred_dt)

### Modelo de Regresión Logística

In [108]:
# Logistic regression: fit on the training features, then measure accuracy on
# the transformed test features.
model_rl = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
).fit(x_features, y_target)

y_pred_rl = model_rl.predict(df_features_test)
acc_rl = accuracy_score(y_true=y_target_test, y_pred=y_pred_rl)

### Modelo de Red Neuronal

In [109]:
# Small feed-forward binary classifier: 10 inputs -> 5 ReLU units -> 1 sigmoid unit.
model_rn = Sequential()
model_rn.add(Dense(5, activation='relu', input_shape=(10,)))  # hidden layer with 5 neurons
model_rn.add(Dense(1, activation='sigmoid'))

model_rn.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model_rn.fit(x_features, y_target, epochs=10)

# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold
# before computing accuracy.
y_pred_rn1 = model_rn.predict(df_features_test)
y_pred_rn = (y_pred_rn1 > 0.5).astype(int)
acc_rn = accuracy_score(y_target_test, y_pred_rn)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 630us/step - accuracy: 0.8066 - loss: 0.4323
Epoch 2/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 620us/step - accuracy: 0.9329 - loss: 0.2022
Epoch 3/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 614us/step - accuracy: 0.9332 - loss: 0.1917
Epoch 4/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608us/step - accuracy: 0.9355 - loss: 0.1841
Epoch 5/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 656us/step - accuracy: 0.9342 - loss: 0.1821
Epoch 6/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 612us/step - accuracy: 0.9351 - loss: 0.1842
Epoch 7/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 608us/step - accuracy: 0.9341 - loss: 0.1849
Epoch 8/10
[1m2297/2297[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 611us/step - accuracy: 0.9355 - loss: 0.1822
Epoch 9/10
[1m2297

In [110]:
# Build the neural network
def create_neural_network():
    """Build and compile the binary-classification network.

    Returns:
        A compiled Keras ``Sequential`` model: a 5-unit ReLU hidden layer fed
        by 10 inputs, followed by a single sigmoid output unit. It is compiled
        with the Adam optimizer, binary cross-entropy loss and accuracy as the
        tracked metric.
    """
    network = Sequential([
        Dense(5, activation='relu', input_shape=(10,)),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Wrapper exposing the neural network through a fit/predict interface
class NeuralNetworkStep:
    """Thin fit/predict adapter around the network built by ``create_neural_network``.

    The wrapped Keras model is stored in ``self.model``.
    """

    def __init__(self):
        self.model = create_neural_network()

    def fit(self, X, y):
        """Train the wrapped model on ``X``/``y`` for 10 epochs.

        Args:
            X: Training features passed straight to the Keras model.
            y: Training targets passed straight to the Keras model.

        Returns:
            ``self``, so the call can be chained like other estimators.
        """
        # Train on the data handed in by the caller, not on notebook globals,
        # so the step learns from whatever the surrounding pipeline feeds it.
        self.model.fit(X, y, epochs=10)
        return self

    def predict(self, X):
        """Return the wrapped model's raw predictions for ``X``.

        The output is whatever ``self.model.predict`` returns (the sigmoid
        outputs of the network); no thresholding is applied here.
        """
        return self.model.predict(X)

In [111]:
# Map each model key to the accuracy it reached on the test set
modelos = dict(nb=acc_nb, rf=acc_rf, dt=acc_dt, rl=acc_rl, rn=acc_rn)

# Pick the key with the highest accuracy (the first one wins on ties)
mejor_modelo = max(modelos, key=lambda nombre: modelos[nombre])

print("El mejor modelo es:", mejor_modelo)


El mejor modelo es: rf


In [112]:
# Step name and estimator factory for every candidate model. Each factory
# uses the same hyperparameters the candidate was evaluated with, so the
# estimator added to the pipeline is the configuration that actually won the
# comparison rather than a default-configured one. Factories (instead of
# instances) avoid building estimators that are not going to be used.
candidate_steps = {
    "nb": ('modelo_naive_bayes',
           lambda: GaussianNB(var_smoothing=0.5)),
    "rf": ('modelo_random_forest',
           lambda: RandomForestClassifier(n_estimators=50, random_state=42, min_samples_split=10)),
    "dt": ('modelo_decision_tree',
           lambda: DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_split=5)),
    "rl": ('modelo_regresion_lineal',
           lambda: LogisticRegression(C=10, solver='liblinear', penalty='l2')),
    "rn": ('modelo_red_neuronal',
           lambda: NeuralNetworkStep()),
}

# An unknown key leaves the pipeline untouched, as the if/elif chain did.
if mejor_modelo in candidate_steps:
    step_name, build_estimator = candidate_steps[mejor_modelo]
    model_step_names = {name for name, _factory in candidate_steps.values()}

    # Drop any model step left over from a previous run of this cell so that
    # re-running it does not stack several estimators at the end.
    credit_default_model_pipeline.steps = [
        step for step in credit_default_model_pipeline.steps
        if step[0] not in model_step_names
    ]
    credit_default_model_pipeline.steps.append((step_name, build_estimator()))

In [113]:
# Display the pipeline so its steps can be inspected.
credit_default_model_pipeline

In [114]:
# Load the raw training data, discard the 'ID' column, and split off the
# 'Default' label from the remaining columns.
train_dataset = pd.read_csv("../data/raw/train.csv").drop(columns=['ID'])
train_dataset_target = train_dataset['Default']
train_dataset_features = train_dataset.drop(columns='Default')

In [115]:
# Fit the whole pipeline on the raw training features and labels.
credit_default_model_pipeline.fit(X=train_dataset_features, y=train_dataset_target)

In [116]:
# Persist the fitted pipeline. The directory casing matches the
# '../Artifacts/' path that pipeline.pkl is loaded from in this notebook, so
# the write also succeeds on case-sensitive filesystems.
with open('../Artifacts/pipeline_winner_models.pkl', 'wb') as f:
    pickle.dump(credit_default_model_pipeline, f)