# Scripts del Proyecto

### Script 1: Preparación de datos para el entrenamiento

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the raw training table and key every row by its loan identifier.
df = pd.read_csv("../data/raw/defaultloan.csv")
df = df.set_index('Loan_ID')

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-encode the categorical columns as integer codes.
# One encoder instance is re-fitted on each column in turn, so after this
# cell `lb` holds the classes of the last column processed.
lb = LabelEncoder()
for column in ('Gender', 'Married', 'Education', 'Self_Employed',
               'Property_Area', 'Loan_Status'):
    df[column] = lb.fit_transform(df[column])


In [None]:
# Cast the numeric columns to 64-bit integers.
# `astype` converts each column in one vectorized step instead of calling
# np.int64 once per element through `apply`.
for column in ('LoanAmount', 'CoapplicantIncome', 'Loan_Amount_Term',
               'Credit_History'):
    df[column] = df[column].astype(np.int64)

In [None]:
# Remove the 'Dependents' column.
df = df.drop(columns=['Dependents'])

In [None]:
# Drop further columns regarded as redundant. `df` still keeps 'Married';
# the final table `df_f` omits it as well.
df = df.drop(columns=['Loan_Amount_Term', 'Gender', 'Education'])
df_f = df.drop(columns=['Married'])

In [None]:
# Persist the processed training table; the Loan_ID index is written too.
df_f.to_csv(path_or_buf="../data/processed/loan_train.csv", index=True)

### Script 2: Código de Entrenamiento

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle

In [None]:
# Load the processed table and separate the target from the features.
df = pd.read_csv("../data/processed/loan_train.csv")
df = df.set_index('Loan_ID')
y_train = df[['Loan_Status']]
X_train = df.drop(columns=['Loan_Status'])

In [None]:
# Fit the model on the whole sample.
lr_mod = LogisticRegression()
# scikit-learn expects a 1-D target; flattening the single-column table
# avoids the DataConversionWarning raised for a column-vector y.
lr_mod.fit(X_train, y_train.values.ravel())

In [None]:
# Save the trained model so it can be used in production.
filename = '../models/best_model.pkl'
# The context manager flushes and closes the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(lr_mod, model_file)

### Script 3: Preparación de Datos de Validación

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw validation table and key every row by its loan identifier.
df = pd.read_csv("../data/raw/defaultloan_new.csv")
df = df.set_index('Loan_ID')

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-encode the categorical columns as integer codes.
# NOTE(review): the encoder is fitted again on this dataset instead of being
# reused from the training script -- confirm both datasets contain the same
# category values so the integer codes line up.
lb = LabelEncoder()
for column in ('Gender', 'Married', 'Education', 'Self_Employed',
               'Property_Area', 'Loan_Status'):
    df[column] = lb.fit_transform(df[column])


In [None]:
# Cast the numeric columns to 64-bit integers.
# `astype` converts each column in one vectorized step instead of calling
# np.int64 once per element through `apply`.
for column in ('LoanAmount', 'CoapplicantIncome', 'Loan_Amount_Term',
               'Credit_History'):
    df[column] = df[column].astype(np.int64)

In [None]:
# Remove the 'Dependents' column.
df = df.drop(columns=['Dependents'])

In [None]:
# Drop further columns regarded as redundant. `df` still keeps 'Married';
# the final table `df_f` omits it as well.
df = df.drop(columns=['Loan_Amount_Term', 'Gender', 'Education'])
df_f = df.drop(columns=['Married'])

In [None]:
# Persist the processed validation table; the Loan_ID index is written too.
df_f.to_csv(path_or_buf="../data/processed/loan_val.csv", index=True)

### Script 4: Código de Validación

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.metrics import *

In [None]:
# Load the processed validation table and separate target from features.
df = pd.read_csv("../data/processed/loan_val.csv")
df = df.set_index('Loan_ID')
y_test = df[['Loan_Status']]
X_test = df.drop(columns=['Loan_Status'])

In [None]:
# Load the trained model.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
filename = '../models/best_model.pkl'
# The context manager closes the file once the model has been read.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict with the trained model on the validation features.
# X_test already holds `df` without the 'Loan_Status' column.
y_pred_test = model.predict(X_test)

In [None]:
## Validation metrics
def calc_metrics(y_test, y_pred_test):
    """Print the confusion matrix, accuracy, precision and recall.

    Args:
        y_test: True labels.
        y_pred_test: Labels predicted by the model.

    Returns:
        None. The metrics are only printed.
    """
    confusion = confusion_matrix(y_test, y_pred_test)
    print("Matriz de confusion: ")
    print(confusion)
    # Each score is computed right before it is printed, in this order.
    scorers = (
        ("Accuracy: ", accuracy_score),
        ("Precision: ", precision_score),
        ("Recall: ", recall_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred_test))

In [None]:
def save_plot(title):
    """Title the current matplotlib figure, save it to disk and clear it.

    The output name is `title` lower-cased with spaces replaced by
    underscores; this function does not add a file extension itself.

    Args:
        title: Text used as the plot title and as the basis of the file name.
    """
    plt.title(title)
    out_name = title.replace(" ", "_").lower()
    plt.gcf().savefig(out_name, dpi=500)
    # Clear the figure so the next plot starts from an empty canvas.
    plt.clf()

In [None]:
# Plot the confusion matrix of the model on the validation set and save it.
# plot_confusion_matrix was removed in scikit-learn 1.2, so use the
# ConfusionMatrixDisplay API when this scikit-learn version provides it and
# fall back to the legacy helper otherwise.
if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
else:
    plot_confusion_matrix(model, X_test, y_test)
save_plot('Confusion Matrix')

In [None]:
# Plot the ROC curve of the model on the validation set and save it.
# plot_roc_curve was removed in scikit-learn 1.2, so use the RocCurveDisplay
# API when this scikit-learn version provides it and fall back to the legacy
# helper otherwise.
if hasattr(RocCurveDisplay, 'from_estimator'):
    RocCurveDisplay.from_estimator(model, X_test, y_test)
else:
    plot_roc_curve(model, X_test, y_test)
save_plot('ROC Curve')

### Script 5: Preparación de Datos de Score (Automatización)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the raw table to be scored and key every row by its loan identifier.
df = pd.read_csv("../data/raw/defaultloan_score.csv")
df = df.set_index('Loan_ID')

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(axis=0, how='any')

In [None]:
# Re-encode the categorical columns as integer codes.
# NOTE(review): the encoder is fitted again on this dataset instead of being
# reused from the training script -- confirm both datasets contain the same
# category values so the integer codes line up.
lb = LabelEncoder()
for column in ('Gender', 'Married', 'Education', 'Self_Employed',
               'Property_Area'):
    df[column] = lb.fit_transform(df[column])
# The scoring script passes every column of the processed file to the model,
# so the target is not needed here. Encode it only when the raw file
# provides it instead of failing with a KeyError.
if 'Loan_Status' in df.columns:
    df['Loan_Status'] = lb.fit_transform(df['Loan_Status'])


In [None]:
# Cast the numeric columns to 64-bit integers.
# `astype` converts each column in one vectorized step instead of calling
# np.int64 once per element through `apply`.
for column in ('LoanAmount', 'CoapplicantIncome', 'Loan_Amount_Term',
               'Credit_History'):
    df[column] = df[column].astype(np.int64)

In [None]:
# Remove the 'Dependents' column.
df = df.drop(columns=['Dependents'])

In [None]:
# Drop further columns regarded as redundant. `df` still keeps 'Married';
# the final table `df_f` omits it as well.
df = df.drop(columns=['Loan_Amount_Term', 'Gender', 'Education'])
df_f = df.drop(columns=['Married'])

In [None]:
# Persist the processed scoring table; the Loan_ID index is written too.
df_f.to_csv(path_or_buf="../data/processed/loan_score.csv", index=True)

### Script 6: Código de Scoring (Automatización)

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle

In [None]:
# Load the processed scoring table, indexed by loan identifier.
df = pd.read_csv("../data/processed/loan_score.csv")
df = df.set_index('Loan_ID')

In [None]:
# Load the trained model.
# NOTE: unpickling can execute arbitrary code, so only load model files
# that come from a trusted source.
filename = '../models/best_model.pkl'
# The context manager closes the file once the model has been read.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict on the scoring data with the trained model.
# The model was trained without 'Loan_Status', so drop that column when the
# processed file still carries it; errors='ignore' makes this a no-op when
# the column is absent.
features = df.drop(columns=['Loan_Status'], errors='ignore')
scores = model.predict(features).reshape(-1, 1)

In [None]:
# Export the model output so it can be loaded into the Feature Store or the
# model Data Mart.
# Reuse the Loan_ID index of the scored table so every prediction stays
# linked to its loan; a positional 0..n-1 index cannot be mapped back to the
# raw file once rows with missing values have been dropped.
df_score = pd.DataFrame(scores, columns=['PREDICT'], index=df.index)
# Write the result.
df_score.to_csv('../data/scores/final_score_loan.csv')