In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Métricas
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [2]:
# Read the raw table from the CSV file in the working directory.
ruta_csv = "heloc.csv"
df = pd.read_csv(ruta_csv)

In [3]:
# Show the loaded frame (last expression of the cell, so it is rendered as a table).
df

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,Bad,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,Bad,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,Bad,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
3,Bad,66,169,1,73,28,1,1,93,76,...,57,0,5,4,72,83,6,4,3,91
4,Bad,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10454,Good,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10455,Bad,65,147,39,68,11,0,0,92,28,...,42,1,1,1,86,53,2,2,1,80
10456,Bad,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,Bad,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [4]:
# Encode the text labels as booleans: "Good" -> True, "Bad" -> False.
# The replacement is applied frame-wide; re-running the cell is a no-op
# because no "Good"/"Bad" strings remain afterwards.
mapeo = {"Good": True, "Bad": False}
df = df.replace(to_replace=mapeo)

In [5]:
# Show every row that has at least one negative value.
# Vectorized: compare all numeric columns at once instead of calling a Python
# lambda per row. Non-numeric columns (e.g. the boolean target) can never be
# negative, so leaving them out of the comparison selects the same rows.
df[(df.select_dtypes(include="number") < 0).any(axis=1)]

Unnamed: 0,RiskPerformance,ExternalRiskEstimate,MSinceOldestTradeOpen,MSinceMostRecentTradeOpen,AverageMInFile,NumSatisfactoryTrades,NumTrades60Ever2DerogPubRec,NumTrades90Ever2DerogPubRec,PercentTradesNeverDelq,MSinceMostRecentDelq,...,PercentInstallTrades,MSinceMostRecentInqexcl7days,NumInqLast6M,NumInqLast6Mexcl7days,NetFractionRevolvingBurden,NetFractionInstallBurden,NumRevolvingTradesWBalance,NumInstallTradesWBalance,NumBank2NatlTradesWHighUtilization,PercentTradesWBalance
0,False,55,144,4,84,20,3,0,83,2,...,43,0,0,0,33,-8,8,1,1,69
1,False,61,58,15,41,2,4,4,100,-7,...,67,0,0,0,0,-8,0,-8,-8,0
2,False,67,66,5,24,9,0,0,100,-7,...,44,0,4,4,53,66,4,2,1,86
4,False,81,333,27,132,12,0,0,100,-7,...,25,0,1,1,51,89,3,1,0,80
8,False,59,324,2,138,24,0,0,85,5,...,26,0,1,1,68,-8,7,1,3,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10453,False,75,410,2,121,37,1,1,83,10,...,7,1,1,1,34,-8,7,1,1,53
10454,True,73,131,5,57,21,0,0,95,80,...,19,7,0,0,26,-8,5,2,0,100
10456,False,74,129,6,64,18,1,1,100,-7,...,33,3,4,4,6,-8,5,-8,0,56
10457,False,72,234,12,113,42,2,2,96,35,...,20,6,0,0,19,-8,4,1,0,38


In [6]:
# -9, -8 and -7 are treated as missing-value codes: turn them into NaN so
# they can be imputed later instead of being read as real measurements.
codigos_faltantes = [-9, -8, -7]
df_con_nans = df.replace(to_replace=codigos_faltantes, value=np.nan)

In [7]:
%%time

best_results = []

for neighbors in range(1,21):
    imputer = KNNImputer(n_neighbors= neighbors)
    df_imputed = pd.DataFrame(imputer.fit_transform(df_con_nans), columns=df_con_nans.columns)
    #df_imputed = df_imputed[columnas_relevantes]
    X = df_imputed.drop(["RiskPerformance"], axis= 1)
    y= df_imputed["RiskPerformance"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify = y)
    
    best_model = RandomForestClassifier()

    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Metricas:
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)

    # Optimizar para, si sale mejor precision que el anterior, sustituir!
    best_results.append([str(best_model), best_model, accuracy, precision, neighbors])

df_best_results = pd.DataFrame(best_results, columns= ["Nombre", "Modelo", "Accuracy", "Precision", "neighbors"])

CPU times: total: 1min 24s
Wall time: 2min


In [8]:
# Rank the runs from highest to lowest precision.
df_best_results.sort_values(by=["Precision"], ascending=[False])

Unnamed: 0,Nombre,Modelo,Accuracy,Precision,neighbors
16,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.74761,0.744813,17
5,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.748088,0.744571,6
6,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.746176,0.74351,7
18,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.748088,0.742564,19
19,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.74761,0.7423,20
4,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.74761,0.7423,5
0,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.744742,0.741701,1
2,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.74522,0.741468,3
7,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.744742,0.741201,8
12,RandomForestClassifier(),"(DecisionTreeClassifier(max_features='sqrt', r...",0.747132,0.741044,13
