# Librerías

In [12]:
# Plotting libraries
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
# Scikit learn
import sklearn.preprocessing as skp
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
# Scikit Metrics
from sklearn.metrics import log_loss, classification_report, accuracy_score, f1_score,roc_auc_score
# Data manipulation libraries
import pandas as pd
import numpy as np
# General use
from tqdm.notebook import tqdm
from joblib import dump, load
import warnings
warnings.filterwarnings("ignore")

# Funciones

In [13]:
def numeric_columns(df):
  numeric_columns = []
  for column in df.columns.tolist():
    if df.dtypes[column] != 'object':
      numeric_columns.append(column)
  return numeric_columns

def plot_numeric_data(df, nrows=3, ncols=4, exclude_columns=['SK_ID_BUREAU', 'SK_ID_CURR']):
  fig, ax = plt.subplots(figsize=(16, 18), nrows=nrows, ncols=ncols)
  cont = 0
  for idx, column in enumerate(tqdm(numeric_columns(df))):
    if df.dtypes[column] != 'object' and column not in exclude_columns:
      df[column].plot(kind='kde',
                      xlabel=column,
                      ax=ax[(idx + cont)//4, (idx + cont)%4])
      ax[(idx + cont)//4, (idx + cont)%4].set_xlabel(column, fontsize=8)
    else:
      cont -= 1

  plt.subplots_adjust(bottom=.25, left=.25)
  plt.tight_layout()
  plt.show()

# Descripción de las columnas
def dataframe_columns_description(df):
  print('-' * 160)
  for index, row in description_df[description_df['Row'].isin(df)].drop('Table', axis=1).iterrows():
    print(f"|{row['Row']}: {row['Description']} ({row['Special']})" if type(row['Special']) != float else f"|{row['Row']}: {row['Description']}")
    print('-' * 160)

# Datos
Se cargan los datos para explorarlos.

In [14]:
data_pth = '../Data/'

description_df = pd.read_csv(data_pth + 'columns_description.csv', index_col=0)
loan_history_df = pd.read_csv(data_pth + 'historial_prestamos.csv', index_col=0) #.drop(['Unnamed: 0'], axis=1)
loan_history_movements_df = pd.read_csv(data_pth + 'historial_prestamos_movimientos.csv', index_col=0) #.drop(['Unnamed: 0'], axis=1)
request_test_df = pd.read_csv(data_pth + 'solicitudes_test.csv', index_col=0)
request_train_df = pd.read_csv(data_pth + 'solicitudes_train.csv', index_col=0)
credit_card_movements_df = pd.read_csv(data_pth + 'tarjeta_credito_movimientos.csv', index_col=0) #.drop(['Unnamed: 0'], axis=1)

loan_history_df.to_csv('historial_prestamos.csv', index=False)
loan_history_movements_df.to_csv('historial_prestamos_movimientos.csv', index=False)
credit_card_movements_df.to_csv('tarjeta_credito_movimientos.csv', index=False)
request_train_df.to_csv('solicitudes_train.csv', index=False)
request_test_df.to_csv('solicitudes_test.csv', index=False)

Se hace 'display' para ver la forma que tienen los data frames

In [22]:
display(description_df.head())
print(description_df.shape)
display(loan_history_df.head())
print(loan_history_df.shape)
display(loan_history_movements_df.head())
print(loan_history_movements_df.shape)
display(credit_card_movements_df.head())
print(credit_card_movements_df.shape)
display(request_train_df.head())
print(request_train_df.shape)

Unnamed: 0,Table,Row,Description,Special
0,solicitud_{train/test}.csv,SK_ID_CURR,ID of loan in our sample,
1,solicitud_{train/test}.csv,TARGET,Target variable (1 - client with payment diffi...,
2,solicitud_{train/test}.csv,NAME_CONTRACT_TYPE,Identification if loan is cash or revolving,
3,solicitud_{train/test}.csv,CODE_GENDER,Gender of the client,
4,solicitud_{train/test}.csv,FLAG_OWN_CAR,Flag if the client owns a car,


(165, 4)


Unnamed: 0_level_0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


(1025651, 16)


Unnamed: 0_level_0,MONTHS_BALANCE,STATUS
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1
5715448,0,C
5715448,-1,C
5715448,-2,C
5715448,-3,C
5715448,-4,C


(10314567, 2)


Unnamed: 0_level_0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
SK_ID_PREV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2582071,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,...,64875.555,64875.555,1.0,1,0.0,0.0,69.0,Active,0,0
1389973,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,...,233048.97,233048.97,1.0,1,0.0,0.0,10.0,Active,0,0
1891521,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,...,453919.455,453919.455,0.0,1,0.0,1.0,101.0,Active,0,0
2181852,367360,-4,291543.075,292500,90000.0,289339.425,0.0,199339.425,130.5,4093.515,...,286831.575,286831.575,3.0,8,0.0,5.0,3.0,Active,0,0
1235299,203885,-5,201261.195,225000,76500.0,111026.7,0.0,34526.7,6338.34,45000.0,...,197224.695,197224.695,3.0,9,0.0,6.0,38.0,Active,0,0


(2267051, 22)


Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
331475,0,Cash loans,F,Y,Y,0,135000.0,835380.0,30955.5,675000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
366488,0,Cash loans,F,N,Y,0,103500.0,659533.5,26284.5,589500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
444791,0,Cash loans,F,N,Y,0,135000.0,219042.0,23130.0,193500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
112960,0,Revolving loans,F,N,Y,0,67500.0,675000.0,33750.0,675000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,9.0
418735,0,Revolving loans,F,N,Y,0,315000.0,900000.0,45000.0,900000.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,1.0


(150679, 121)


In [16]:
nans_cols = []
for idx, row in pd.DataFrame(request_train_df.isna().sum()).iterrows():
  if row[0] == 0:
    nans_cols.append(idx)

# Importante la utilización de copy para que python explicite un nuevo espacio de memoria para nuevo dataframe
request_train_filtered_df = request_train_df[nans_cols].copy()
request_train_filtered_df

Unnamed: 0_level_0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
331475,0,Cash loans,F,Y,Y,0,135000.0,835380.0,Pensioner,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
366488,0,Cash loans,F,N,Y,0,103500.0,659533.5,Pensioner,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
444791,0,Cash loans,F,N,Y,0,135000.0,219042.0,Working,Incomplete higher,...,0,0,0,0,0,0,0,0,0,0
112960,0,Revolving loans,F,N,Y,0,67500.0,675000.0,Working,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
418735,0,Revolving loans,F,N,Y,0,315000.0,900000.0,Pensioner,Higher education,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244356,0,Cash loans,M,N,Y,0,112500.0,112068.0,Working,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
213955,0,Cash loans,F,N,Y,0,135000.0,781920.0,State servant,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
376877,0,Revolving loans,F,N,Y,0,135000.0,202500.0,Pensioner,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
246747,0,Cash loans,F,N,N,1,135000.0,508495.5,Working,Higher education,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# utilización del descriptor de columnas
print('\nloan_history_df:\n')
dataframe_columns_description(loan_history_df)
print('\nloan_history_movements_df:\n')
dataframe_columns_description(loan_history_movements_df)


loan_history_df:

----------------------------------------------------------------------------------------------------------------------------------------------------------------
|AMT_ANNUITY: Loan annuity
----------------------------------------------------------------------------------------------------------------------------------------------------------------
|CREDIT_ACTIVE: Status of the Credit Bureau (CB) reported credits
----------------------------------------------------------------------------------------------------------------------------------------------------------------
|CREDIT_CURRENCY: Recoded currency of the Credit Bureau credit (recoded)
----------------------------------------------------------------------------------------------------------------------------------------------------------------
|DAYS_CREDIT: How many days before current application did client apply for Credit Bureau credit (time only relative to the application)
----------------------------------

In [18]:
print(f"Percentage of Closed Loan credits: {len(loan_history_df[loan_history_df['CREDIT_ACTIVE'] == 'Closed']) / len(loan_history_df)}")
print(f"Percentage of Active Loan credits: {len(loan_history_df[loan_history_df['CREDIT_ACTIVE'] == 'Active']) / len(loan_history_df)}")
print(f'Forma del dataframe {loan_history_df.shape}')
loan_history_df.head()

Percentage of Closed Loan credits: 0.6257976641177164
Percentage of Active Loan credits: 0.3703715981362081
Forma del dataframe (1025651, 16)


Unnamed: 0_level_0,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,


In [19]:
plot_numeric_data(loan_history_df[loan_history_df['CREDIT_ACTIVE'] == 'Active'].groupby(['SK_ID_CURR']).mean())

TypeError: Could not convert ActiveActive to numeric