In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from feature_engineering import discretization as dc
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Leer dataset

In [55]:
data = pd.read_csv('./data/bank-additional-full.csv', sep=';')

In [57]:
data.head(3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [58]:
X_train, X_test, y_train, y_test = train_test_split(data, data.y, test_size=0.3,
                                                    random_state=0)
X_train.shape, X_test.shape

((28831, 21), (12357, 21))

In [62]:
# Discretización 3 intervalos de igual tamaño

In [63]:
from sklearn.preprocessing import KBinsDiscretizer
enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['duration']])

In [64]:
enc_equal_width.bin_edges_

array([array([   0.        , 1399.66666667, 2799.33333333, 4199.        ])],
      dtype=object)

In [65]:
result = enc_equal_width.transform(X_train[['duration']])
pd.DataFrame(result)[0].value_counts()

0
0.0    28638
1.0      178
2.0       15
Name: count, dtype: int64

In [66]:
X_train_copy = X_train.copy(deep=True)
X_train_copy['duration_equal_width'] = enc_equal_width.transform(X_train[['duration']])
print(X_train_copy.head(10))

       age           job   marital            education  default housing loan  \
31880   37    management   married    university.degree  unknown      no  yes   
38177   54    management  divorced    university.degree  unknown     yes   no   
2459    49   blue-collar   married             basic.9y  unknown      no   no   
756     30      services   married    university.degree       no     yes   no   
11275   23   blue-collar    single             basic.9y       no      no   no   
29677   49  entrepreneur   married    university.degree       no     yes   no   
13016   27        admin.    single  professional.course       no     yes   no   
1518    38    management   married    university.degree  unknown      no   no   
34983   46        admin.   married          high.school       no     yes   no   
24965   47      services   married          high.school       no     yes   no   

         contact month day_of_week  ...  pdays  previous     poutcome  \
31880   cellular   may         thu 

In [67]:
# Discretización 3 grupos con la misma cantidad de observaciones 

In [68]:
enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(X_train[['duration']])

In [69]:
enc_equal_freq.bin_edges_

array([array([   0.,  126.,  258., 4199.])], dtype=object)

In [70]:
result = enc_equal_freq.transform(X_train[['duration']])
pd.DataFrame(result)[0].value_counts()

0
1.0    9620
2.0    9616
0.0    9595
Name: count, dtype: int64

In [71]:
X_train_copy = X_train.copy(deep=True)
X_train_copy['duration_equal_freq'] = enc_equal_freq.transform(X_train[['duration']])
print(X_train_copy.head(10))

       age           job   marital            education  default housing loan  \
31880   37    management   married    university.degree  unknown      no  yes   
38177   54    management  divorced    university.degree  unknown     yes   no   
2459    49   blue-collar   married             basic.9y  unknown      no   no   
756     30      services   married    university.degree       no     yes   no   
11275   23   blue-collar    single             basic.9y       no      no   no   
29677   49  entrepreneur   married    university.degree       no     yes   no   
13016   27        admin.    single  professional.course       no     yes   no   
1518    38    management   married    university.degree  unknown      no   no   
34983   46        admin.   married          high.school       no     yes   no   
24965   47      services   married          high.school       no     yes   no   

         contact month day_of_week  ...  pdays  previous     poutcome  \
31880   cellular   may         thu 

In [72]:
# Discretización agrupación en 3 grupos

In [73]:
enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['duration']])

In [74]:
enc_kmeans.bin_edges_

array([array([   0.        ,  309.54072312,  819.81365142, 4199.        ])],
      dtype=object)

In [75]:
result = enc_kmeans.transform(X_train[['duration']])
pd.DataFrame(result)[0].value_counts()

0
0.0    21306
1.0     6376
2.0     1149
Name: count, dtype: int64

In [76]:
X_train_copy = X_train.copy(deep=True)
X_train_copy['duration_kmeans'] = enc_kmeans.transform(X_train[['duration']])
print(X_train_copy.head(10))

       age           job   marital            education  default housing loan  \
31880   37    management   married    university.degree  unknown      no  yes   
38177   54    management  divorced    university.degree  unknown     yes   no   
2459    49   blue-collar   married             basic.9y  unknown      no   no   
756     30      services   married    university.degree       no     yes   no   
11275   23   blue-collar    single             basic.9y       no      no   no   
29677   49  entrepreneur   married    university.degree       no     yes   no   
13016   27        admin.    single  professional.course       no     yes   no   
1518    38    management   married    university.degree  unknown      no   no   
34983   46        admin.   married          high.school       no     yes   no   
24965   47      services   married          high.school       no     yes   no   

         contact month day_of_week  ...  pdays  previous     poutcome  \
31880   cellular   may         thu 

In [77]:
# Discretización con árbol de decisiones

In [78]:
enc1 = dc.DiscretizeByDecisionTree(col='duration',max_depth=2).fit(X=X_train,y=y_train)

In [79]:
enc1.tree_model

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,2
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [80]:
data1 = enc1.transform(data)

In [81]:
print(data1.head(5))
print(data1.duration_tree_discret.unique())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  pdays  previous     poutcome  emp.var.rate  \
0   may         mon  ...    999         0  nonexistent           1.1   
1   may         mon  ...    999         0  nonexistent           1.1   
2   may         mon  ...    999         0  nonexistent           1.1   
3   may         mon  ...    999         0  nonexistent           1.1   
4   may         mon  ...    999         0  nonexistent           1.1   

  cons.price.idx  cons.conf.idx  euribor3m  nr.employed  y  \
0         93.994          -36.4      4.857      

In [82]:
col='duration'
bins = pd.concat([data1.groupby([col+'_tree_discret'])[col].min(),
                  data1.groupby([col+'_tree_discret'])[col].max()], axis=1)
print(bins)

                       duration  duration
duration_tree_discret                    
0.033354                      0       205
0.141218                    206       524
0.363194                    525       835
0.583794                    836      4918


In [83]:
# Discretización con ChiMerge

In [85]:
enc3 = dc.ChiMerge(col='duration',num_of_bins=5).fit(X=X_train,y='y')

Interval for variable duration
   variable interval  flag_0  flag_1
0  duration  -inf,94    6332      39
1  duration   94,168    6919     289
2  duration  168,508   10430    1503
3  duration  508,835    1446     787
4  duration     835+     452     634


In [86]:
enc3.bins

[-0.1, 94, 168, 508, 835, 4199]

In [87]:
data3 = enc3.transform(data)

In [88]:
print(data3.head(5))

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  pdays  previous     poutcome  emp.var.rate  \
0   may         mon  ...    999         0  nonexistent           1.1   
1   may         mon  ...    999         0  nonexistent           1.1   
2   may         mon  ...    999         0  nonexistent           1.1   
3   may         mon  ...    999         0  nonexistent           1.1   
4   may         mon  ...    999         0  nonexistent           1.1   

  cons.price.idx  cons.conf.idx  euribor3m  nr.employed  y  duration_chimerge  
0         93.994          -36.

In [90]:
data3.duration_chimerge.unique()

[(168.0, 508.0], (94.0, 168.0], (-0.101, 94.0], (835.0, 4199.0], (508.0, 835.0], NaN]
Categories (5, interval[float64, right]): [(-0.101, 94.0] < (94.0, 168.0] < (168.0, 508.0] < (508.0, 835.0] < (835.0, 4199.0]]