# DATOS EXTERNOS SHINY

# Creación de dataframe con datos de clientes de Bureau

El objetivo de este notebook es crear una base de datos con la información que ya se conoce, o que se puede obtener de antemano, sobre el historial crediticio de los ciudadanos.

In [2]:
import pandas as pd
import numpy as np
import pickle

In [8]:
# Relative path to the application_train.csv file
data_path = "../data/application_train.csv"

In [9]:
# Load the CSV at data_path into the working DataFrame
datos = pd.read_csv(data_path)

### Ingeniería de variables llevada a cabo

Eliminamos las 4 filas cuyo valor de CODE_GENDER es 'XNA'.

In [13]:
# Drop the rows whose CODE_GENDER is the literal 'XNA'. The explicit .copy()
# makes the result an independent frame, so the column assignments performed
# on it afterwards do not raise SettingWithCopyWarning.
datos = datos[datos['CODE_GENDER'] != 'XNA'].copy()

In [14]:
# Two-category features: replace each label with the integer code assigned by
# pd.factorize.
for bin_feature in ('CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    codes, uniques = pd.factorize(datos[bin_feature])
    datos[bin_feature] = codes

In [15]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every text-like column of ``df``.

    Args:
        df: DataFrame to encode. The caller's object is not modified.
        nan_as_category: Passed to ``pd.get_dummies`` as ``dummy_na``; when
            True each encoded feature gets an extra indicator column for
            missing values.

    Returns:
        A ``(encoded_df, new_columns)`` tuple: the encoded DataFrame and the
        list of its column names that were not present in the input.
    """
    original_columns = set(df.columns)
    # Check dtypes through pandas.api.types rather than ``== 'object'`` so that
    # columns stored with a dedicated string dtype are encoded too instead of
    # being silently skipped.
    categorical_columns = [
        col
        for col, dtype in df.dtypes.items()
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
    ]
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [16]:
# One-hot encode the remaining text columns; missing values get their own
# indicator column.
datos, cat_cols = one_hot_encoder(datos, nan_as_category=True)

In [17]:
# Show how often each DAYS_EMPLOYED value occurs
datos['DAYS_EMPLOYED'].value_counts()

 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-11866         1
-9817          1
-13659         1
-9561          1
-16314         1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

In [18]:
# DAYS_EMPLOYED: the value 365243 is treated as missing -> NaN.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained in-place call acts on a temporary object
# and leaves `datos` unchanged when pandas copy-on-write is active.
datos['DAYS_EMPLOYED'] = datos['DAYS_EMPLOYED'].replace(365243, np.nan)

In [19]:
# Derived ratio features
# Days employed relative to days since birth
datos['DAYS_EMPLOYED_PERC'] = datos['DAYS_EMPLOYED'].div(datos['DAYS_BIRTH'])
# Total income relative to the credit amount
datos['INCOME_CREDIT_PERC'] = datos['AMT_INCOME_TOTAL'].div(datos['AMT_CREDIT'])
# Total income per family member
datos['INCOME_PER_PERSON'] = datos['AMT_INCOME_TOTAL'].div(datos['CNT_FAM_MEMBERS'])
# Annuity relative to total income
datos['ANNUITY_INCOME_PERC'] = datos['AMT_ANNUITY'].div(datos['AMT_INCOME_TOTAL'])
# Annuity relative to the credit amount
datos['PAYMENT_RATE'] = datos['AMT_ANNUITY'].div(datos['AMT_CREDIT'])

In [20]:
# Display the AMT_ANNUITY column
datos['AMT_ANNUITY']

0         24700.5
1         35698.5
2          6750.0
3         29686.5
4         21865.5
           ...   
307506    27558.0
307507    12001.5
307508    29979.0
307509    20205.0
307510    49117.5
Name: AMT_ANNUITY, Length: 307507, dtype: float64

## Preprocesado juntando los dataframes complementarios

### Preprocess bureau.csv and bureau_balance.csv

In [21]:
# Read bureau.csv and bureau_balance.csv in full (no row limit)
bureau = pd.read_csv('../data/bureau.csv')
bb = pd.read_csv('../data/bureau_balance.csv')

In [22]:
# One-hot encode the text columns of both tables, keeping the names of the
# generated columns; nan_as_category is left at its default (True).
bureau, bureau_cat = one_hot_encoder(bureau)
bb, bb_cat = one_hot_encoder(bb)

In [23]:
# Bureau balance: aggregate per SK_ID_BUREAU and attach the result to bureau
bb_aggregations = {
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    **{col: ['mean'] for col in bb_cat},
}
bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
# Flatten the (column, statistic) pairs into COLUMN_STATISTIC names
bb_agg.columns = pd.Index([f"{name}_{stat.upper()}" for name, stat in bb_agg.columns])
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
# The join key is no longer needed once the balances are attached
bureau = bureau.drop(columns=['SK_ID_BUREAU'])
del bb, bb_agg

In [24]:
# Numeric aggregations for bureau, including the bureau_balance summaries
num_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'DAYS_CREDIT_UPDATE': ['mean'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'AMT_ANNUITY': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
}

# One-hot columns are averaged: bureau's own dummies first, then the
# per-credit means computed from bureau_balance.
cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
cat_aggregations.update({f"{cat}_MEAN": ['mean'] for cat in bb_cat})

bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
bureau_agg.columns = pd.Index(
    [f"BURO_{name}_{stat.upper()}" for name, stat in bureau_agg.columns]
)

# Active credits only: numeric aggregations, prefixed with ACTIVE_
active = bureau.loc[bureau['CREDIT_ACTIVE_Active'] == 1]
active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
active_agg.columns = pd.Index(
    [f"ACTIVE_{name}_{stat.upper()}" for name, stat in active_agg.columns]
)
bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
del active, active_agg

In [25]:
# Closed credits only: numeric aggregations, prefixed with CLOSED_
closed = bureau.loc[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
closed_agg.columns = pd.Index(
    [f"CLOSED_{name}_{stat.upper()}" for name, stat in closed_agg.columns]
)
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
del closed, closed_agg, bureau

In [26]:
# bureau_agg

### Preprocess previous_application.csv

In [27]:
prev = pd.read_csv('../data/previous_application.csv', nrows = None)
prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
# Days columns: the value 365243 is treated as missing -> NaN.
# Each column is assigned back instead of calling replace(..., inplace=True)
# on the selection; that chained in-place call acts on a temporary object and
# leaves `prev` unchanged when pandas copy-on-write is active.
for day_col in ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                'DAYS_LAST_DUE', 'DAYS_TERMINATION']:
    prev[day_col] = prev[day_col].replace(365243, np.nan)
# Add feature: amount applied for relative to the credit amount
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
# Previous applications numeric features
num_aggregations = {
    'AMT_ANNUITY': ['min', 'max', 'mean'],
    'AMT_APPLICATION': ['min', 'max', 'mean'],
    'AMT_CREDIT': ['min', 'max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
}
# Previous applications categorical features: average every one-hot column
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']

prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
# Flatten the (column, statistic) pairs into PREV_COLUMN_STATISTIC names
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
# Previous Applications: Approved Applications - only numerical features
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
del refused, refused_agg, approved, approved_agg, prev

In [28]:
# prev_agg

### Preprocess POS_CASH_balance.csv

In [29]:
pos = pd.read_csv('../data/POS_CASH_balance.csv')
pos, cat_cols = one_hot_encoder(pos)
# Statistics per column; every one-hot column is averaged
aggregations = {
    'MONTHS_BALANCE': ['max', 'mean', 'size'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean'],
    **{cat: ['mean'] for cat in cat_cols},
}

pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
# Flatten the (column, statistic) pairs into POS_COLUMN_STATISTIC names
pos_agg.columns = pd.Index([f"POS_{name}_{stat.upper()}" for name, stat in pos_agg.columns])
# Number of POS_CASH_balance rows per SK_ID_CURR
pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
del pos

In [30]:
# pos_agg

### Preprocess installments_payments.csv

In [31]:
ins = pd.read_csv('../data/installments_payments.csv', nrows = None)
ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
# Ratio and difference between the amount paid and the installment amount
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
# Days past due and days before due (no negative values)
ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
# Keep strictly positive values and set everything else (including NaN) to 0.
# Series.where does this in one vectorised pass instead of calling a Python
# lambda once per row.
ins['DPD'] = ins['DPD'].where(ins['DPD'] > 0, 0)
ins['DBD'] = ins['DBD'].where(ins['DBD'] > 0, 0)
# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
}
# Every one-hot column is averaged
for cat in cat_cols:
    aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
# Flatten the (column, statistic) pairs into INSTAL_COLUMN_STATISTIC names
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Number of installments_payments rows per SK_ID_CURR
ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
del ins

In [32]:
# ins_agg

### Preprocess credit_card_balance.csv

In [33]:
cc = pd.read_csv('../data/credit_card_balance.csv')
cc, cat_cols = one_hot_encoder(cc)
# SK_ID_PREV is dropped so that it is not aggregated with the other columns
cc = cc.drop(columns=['SK_ID_PREV'])
# Apply the same five statistics to every remaining column
cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
cc_agg.columns = pd.Index([f"CC_{name}_{stat.upper()}" for name, stat in cc_agg.columns])
# Number of credit_card_balance rows per SK_ID_CURR
cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
del cc

In [34]:
# cc_agg

In [35]:
# Columns taken directly from the application table: the SK_ID_CURR key, the
# EXT_SOURCE_* columns and the AMT_REQ_CREDIT_BUREAU_* columns
datos_bureau = datos.loc[
    :,
    [
        'SK_ID_CURR',
        'EXT_SOURCE_1',
        'EXT_SOURCE_2',
        'EXT_SOURCE_3',
        'AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_DAY',
        'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
    ],
]

datos_bureau

Unnamed: 0,SK_ID_CURR,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,0.083037,0.262949,0.139376,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0.311267,0.622246,,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,,0.555912,0.729567,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,,0.650442,,,,,,,
4,100007,,0.322738,,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.145570,0.681632,,,,,,,
307507,456252,,0.115992,,,,,,,
307508,456253,0.744026,0.535722,0.218859,1.0,0.0,0.0,1.0,0.0,1.0
307509,456254,,0.514163,0.661024,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Left-join every aggregated table onto the application columns by SK_ID_CURR,
# so each row of datos_bureau is kept
variables_externas = (
    datos_bureau
    .join(bureau_agg, how='left', on='SK_ID_CURR')
    .join(prev_agg, how='left', on='SK_ID_CURR')
    .join(pos_agg, how='left', on='SK_ID_CURR')
    .join(ins_agg, how='left', on='SK_ID_CURR')
    .join(cc_agg, how='left', on='SK_ID_CURR')
)

In [37]:
# Display the joined frame
variables_externas

Unnamed: 0,SK_ID_CURR,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,0.083037,0.262949,0.139376,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
1,100003,0.311267,0.622246,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,100004,,0.555912,0.729567,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,100006,,0.650442,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,,0.322738,,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,456251,0.145570,0.681632,,,,,,,,...,,,,,,,,,,
307507,456252,,0.115992,,,,,,,,...,,,,,,,,,,
307508,456253,0.744026,0.535722,0.218859,1.0,0.0,0.0,1.0,0.0,1.0,...,,,,,,,,,,
307509,456254,,0.514163,0.661024,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [38]:
# Remove ':' and ',' from every column name
variables_externas = variables_externas.rename(
    columns={
        name: name.replace(':', '').replace(',', '')
        for name in variables_externas.columns
    }
)

In [49]:
# Use SK_ID_CURR as the row index, then display the frame
variables_externas = variables_externas.set_index('SK_ID_CURR')
variables_externas

Unnamed: 0_level_0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,BURO_DAYS_CREDIT_MIN,...,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,0.083037,0.262949,0.139376,0.0,0.0,0.0,0.0,0.0,1.0,-1437.0,...,,,,,,,,,,
100003,0.311267,0.622246,,0.0,0.0,0.0,0.0,0.0,0.0,-2586.0,...,,,,,,,,,,
100004,,0.555912,0.729567,0.0,0.0,0.0,0.0,0.0,0.0,-1326.0,...,,,,,,,,,,
100006,,0.650442,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
100007,,0.322738,,0.0,0.0,0.0,0.0,0.0,0.0,-1149.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0.145570,0.681632,,,,,,,,,...,,,,,,,,,,
456252,,0.115992,,,,,,,,,...,,,,,,,,,,
456253,0.744026,0.535722,0.218859,1.0,0.0,0.0,1.0,0.0,1.0,-919.0,...,,,,,,,,,,
456254,,0.514163,0.661024,0.0,0.0,0.0,0.0,0.0,0.0,-1104.0,...,,,,,,,,,,


Cargamos la base de datos resultante del EDAspy para poder obtener las variables que se encuentren en dicho DataFrame, ya que son las que se incluyen en el modelo de clasificación.

In [39]:
# Read datos_edaspy.csv in full (no row limit)
datos_Edaspy = pd.read_csv('../data/datos_edaspy.csv')

In [40]:
# Display the loaded frame
datos_Edaspy

Unnamed: 0,CODE_GENDER,AMT_INCOME_TOTAL,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_BIRTH,DAYS_ID_PUBLISH,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_EMAIL,REGION_RATING_CLIENT,...,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Sent proposal_MIN,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MIN,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_SUM
0,0,202500.0,24700.5,351000.0,-9461,-2120,0,1,0,2,...,,,,,,,,,,
1,1,270000.0,35698.5,1129500.0,-16765,-291,0,1,0,1,...,,,,,,,,,,
2,0,67500.0,6750.0,135000.0,-19046,-2531,1,1,0,2,...,,,,,,,,,,
3,1,135000.0,29686.5,297000.0,-19005,-2437,0,1,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,121500.0,21865.5,513000.0,-19932,-3458,0,1,0,2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307502,0,157500.0,27558.0,225000.0,-9327,-1982,0,1,0,1,...,,,,,,,,,,
307503,1,72000.0,12001.5,225000.0,-20775,-4090,0,1,0,2,...,,,,,,,,,,
307504,1,153000.0,29979.0,585000.0,-14966,-5150,0,1,1,3,...,,,,,,,,,,
307505,1,171000.0,20205.0,319500.0,-11961,-931,0,1,0,2,...,,,,,,,,,,


In [41]:
# Column labels of the EDAspy frame
EDA_features = datos_Edaspy.columns

In [42]:
# Turn the column labels into a plain Python list
EDA_features = EDA_features.tolist()

Generamos una lista con las variables que se encuentran en los datos externos y en la base de datos resultante del EDAspy.

In [43]:
# Columns of variables_externas that also appear in EDA_features, kept in the
# order of variables_externas. The set gives constant-time membership tests
# instead of scanning the EDA_features list once per column.
eda_feature_set = set(EDA_features)
lista = [col for col in variables_externas.columns if col in eda_feature_set]

In [44]:
# Display the selected column names
lista

['EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'BURO_DAYS_CREDIT_MIN',
 'BURO_DAYS_CREDIT_MAX',
 'BURO_DAYS_CREDIT_VAR',
 'BURO_DAYS_CREDIT_ENDDATE_MIN',
 'BURO_DAYS_CREDIT_UPDATE_MEAN',
 'BURO_CREDIT_DAY_OVERDUE_MAX',
 'BURO_CREDIT_DAY_OVERDUE_MEAN',
 'BURO_AMT_CREDIT_SUM_MEAN',
 'BURO_AMT_CREDIT_SUM_DEBT_MAX',
 'BURO_AMT_ANNUITY_MAX',
 'BURO_AMT_ANNUITY_MEAN',
 'BURO_CNT_CREDIT_PROLONG_SUM',
 'BURO_MONTHS_BALANCE_MIN_MIN',
 'BURO_MONTHS_BALANCE_SIZE_SUM',
 'BURO_CREDIT_ACTIVE_Active_MEAN',
 'BURO_CREDIT_ACTIVE_Closed_MEAN',
 'BURO_CREDIT_ACTIVE_Sold_MEAN',
 'BURO_CREDIT_ACTIVE_nan_MEAN',
 'BURO_CREDIT_CURRENCY_currency 1_MEAN',
 'BURO_CREDIT_CURRENCY_currency 3_MEAN',
 'BURO_CREDIT_CURRENCY_currency 4_MEAN',
 'BURO_CREDIT_TYPE_Car loan_MEAN',
 'BURO_CREDIT_TYPE_Microloan_MEAN',
 'BURO_STATUS_1_MEAN_MEAN',
 'BURO_STATUS_3_MEAN_MEAN',
 'BURO_STATUS_5_MEAN_MEAN',
 'BURO_STATUS_nan_MEAN_MEAN',
 'ACTIVE_DAYS_CREDIT_MEAN',
 

In [50]:
# Keep only the columns listed in `lista`, then display the result
datos_externos = variables_externas[lista]
datos_externos

Unnamed: 0_level_0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,BURO_DAYS_CREDIT_MIN,BURO_DAYS_CREDIT_MAX,BURO_DAYS_CREDIT_VAR,BURO_DAYS_CREDIT_ENDDATE_MIN,BURO_DAYS_CREDIT_UPDATE_MEAN,...,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Sent proposal_MIN,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MIN,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_nan_MIN,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_SUM
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100002,0.083037,0.262949,0.139376,0.0,0.0,-1437.0,-103.0,186150.000000,-1072.0,-499.875000,...,,,,,,,,,,
100003,0.311267,0.622246,,0.0,0.0,-2586.0,-606.0,827783.583333,-2434.0,-816.000000,...,,,,,,,,,,
100004,,0.555912,0.729567,0.0,0.0,-1326.0,-408.0,421362.000000,-595.0,-532.000000,...,,,,,,,,,,
100006,,0.650442,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100007,,0.322738,,0.0,0.0,-1149.0,-1149.0,,-783.0,-783.000000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456251,0.145570,0.681632,,,,,,,,,...,,,,,,,,,,
456252,,0.115992,,,,,,,,,...,,,,,,,,,,
456253,0.744026,0.535722,0.218859,0.0,1.0,-919.0,-713.0,10609.000000,-189.0,-253.250000,...,,,,,,,,,,
456254,,0.514163,0.661024,0.0,0.0,-1104.0,-1104.0,,-859.0,-401.000000,...,,,,,,,,,,


In [51]:
# Write the selected columns to CSV; the row index is written as well
datos_externos.to_csv('../data/datos_externos.csv', index=True)