In [1]:
import pandas as pd

In [2]:
# Load the customer dataset (a comma-separated file, pandas' default delimiter)
# and display it.
data = pd.read_csv("../data/customer_dataset.csv")

data

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressSimplified,sameCity
0,18.0,pending,True,card,JCB 16 digit,Citizens First Banks,18,False,False,yahoo,com,only_letters,yes
1,26.0,fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,26,False,True,yahoo,com,only_letters,no
2,45.0,fulfilled,False,card,VISA 16 digit,Vertex Bancorp,45,False,False,yahoo,com,digits_and_letters,no
3,23.0,fulfilled,False,card,Diners Club / Carte Blanche,His Majesty Bank Corp.,23,False,False,yahoo,com,digits_and_letters,no
4,43.0,fulfilled,False,card,Mastercard,Vertex Bancorp,43,True,True,other,com,only_letters,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...
618,,,,,,,25,True,,weird,weird,only_letters,unknown
619,,,,,,,25,True,,weird,weird,only_letters,unknown
620,,,,,,,25,False,,weird,weird,only_letters,unknown
621,,,,,,,19,False,,weird,weird,only_letters,unknown


Revisamos la columna 'orderAmount' y, a continuación, copiamos el dataframe para conservar una versión sin categorizar:

In [3]:
# Peek at the raw numeric 'orderAmount' column before it is binned.
data['orderAmount']

0      18.0
1      26.0
2      45.0
3      23.0
4      43.0
       ... 
618     NaN
619     NaN
620     NaN
621     NaN
622     NaN
Name: orderAmount, Length: 623, dtype: float64

In [4]:
# Keep an independent copy of the freshly loaded frame: `data` is binned in place
# by the following cells, while `test` is later binned from the reloaded cut points.
test = data.copy(deep=True)

Usamos qcut para CATEGORIZAR la variable 'orderAmount'. Con q=5 establecemos la cantidad de categorías.

Nota: con categorizar nos referimos a crear categorías (en este caso, rangos que van desde un valor hasta otro) para que, en vez de decir "el valor de 'orderAmount' es X", digamos "el valor de 'orderAmount' para este registro se encuentra en la categoría Y". Pasamos de tener múltiples valores a tener solo 5 valores posibles.

En 'saved_bins_order' guardamos los puntos de corte, es decir, cuáles son los valores que limitan cada categoría.


In [5]:
# Bin 'orderAmount' into five quantile-based groups and keep the cut points.
# duplicates='drop' discards repeated bin edges, should there be any, and
# retbins=True makes qcut return a (categories, bin_edges) tuple, which is
# unpacked straight into the column and `saved_bins_order`.
# NOTE: the numeric column is replaced by its categories, so this cell reads a
# different input if it is executed a second time without reloading `data`.
data['orderAmount'], saved_bins_order = pd.qcut(
    x=data['orderAmount'],
    q=5,
    retbins=True,
    duplicates='drop',
)

In [6]:
# Display 'orderAmount' after the quantile binning: it now holds interval categories.
data['orderAmount']

0      (9.999, 18.4]
1       (18.4, 30.0]
2       (39.0, 47.0]
3       (18.4, 30.0]
4       (39.0, 47.0]
           ...      
618              NaN
619              NaN
620              NaN
621              NaN
622              NaN
Name: orderAmount, Length: 623, dtype: category
Categories (5, interval[float64, right]): [(9.999, 18.4] < (18.4, 30.0] < (30.0, 39.0] < (39.0, 47.0] < (47.0, 353.0]]

In [7]:
# Show the bin edges that qcut returned for 'orderAmount'.
saved_bins_order

array([ 10. ,  18.4,  30. ,  39. ,  47. , 353. ])

In [10]:
import pickle

In [11]:
# Persist the 'orderAmount' bin edges so the same cut points can be reused later.
with open('../data/saved_bins_order.pickle', mode='wb') as handle:
    pickle.dump(saved_bins_order, handle, pickle.HIGHEST_PROTOCOL)

Realizamos el mismo proceso para 'transactionAmount', esta vez con q=4 (cuatro categorías):

In [12]:
# Bin 'transactionAmount' into four quantile-based groups, again keeping the
# bin edges (retbins=True) and dropping repeated edges (duplicates='drop').
# NOTE: as with 'orderAmount', the numeric column is overwritten by its categories.
data['transactionAmount'], saved_bins_order_transactions_amount = pd.qcut(
    x=data['transactionAmount'],
    q=4,
    retbins=True,
    duplicates='drop',
)

In [13]:
# Persist the 'transactionAmount' bin edges next to the 'orderAmount' ones.
with open('../data/saved_bins_transaction.pickle', mode='wb') as handle:
    pickle.dump(saved_bins_order_transactions_amount, handle, pickle.HIGHEST_PROTOCOL)

In [15]:
# Read the 'orderAmount' bin edges back from disk.
# The file was written by this notebook a few cells earlier; pickle.load must
# not be pointed at files from untrusted sources.
with open('../data/saved_bins_order.pickle', mode='rb') as handle:
    new_saved_bins_order = pickle.load(handle)

In [16]:
# Read the 'transactionAmount' bin edges back from disk.
# The file was written by this notebook a few cells earlier; pickle.load must
# not be pointed at files from untrusted sources.
with open('../data/saved_bins_transaction.pickle', mode='rb') as handle:
    new_saved_bins_transaction = pickle.load(handle)

In [17]:
# Apply the reloaded 'orderAmount' cut points to the copy that was not binned.
# include_lowest=True makes the first interval include its left edge, so values
# equal to the smallest bin edge get a category instead of NaN; this is needed
# for the result to match what qcut produced on `data`.
test["orderAmount"] = pd.cut(
    x=test['orderAmount'],
    bins=new_saved_bins_order,
    include_lowest=True,
)

In [18]:
# Apply the reloaded 'transactionAmount' cut points to the copy.
# include_lowest=True makes the first interval include its left edge, so values
# equal to the smallest bin edge get a category instead of NaN; this is needed
# for the result to match what qcut produced on `data`.
test["transactionAmount"] = pd.cut(
    x=test['transactionAmount'],
    bins=new_saved_bins_transaction,
    include_lowest=True,
)

In [20]:
# First rows of the copy that was binned with the reloaded cut points.
test.head(3)

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressSimplified,sameCity
0,"(9.999, 18.4]",pending,True,card,JCB 16 digit,Citizens First Banks,"(9.999, 21.0]",False,False,yahoo,com,only_letters,yes
1,"(18.4, 30.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,"(21.0, 34.0]",False,True,yahoo,com,only_letters,no
2,"(39.0, 47.0]",fulfilled,False,card,VISA 16 digit,Vertex Bancorp,"(34.0, 45.0]",False,False,yahoo,com,digits_and_letters,no


In [21]:
# First rows of the frame binned directly with qcut, for side-by-side comparison.
data.head(3)

Unnamed: 0,orderAmount,orderState,paymentMethodRegistrationFailure,paymentMethodType,paymentMethodProvider,paymentMethodIssuer,transactionAmount,transactionFailed,fraudulent,emailProvider,emailDomain,customerIPAddressSimplified,sameCity
0,"(9.999, 18.4]",pending,True,card,JCB 16 digit,Citizens First Banks,"(9.999, 21.0]",False,False,yahoo,com,only_letters,yes
1,"(18.4, 30.0]",fulfilled,True,bitcoin,VISA 16 digit,Solace Banks,"(21.0, 34.0]",False,True,yahoo,com,only_letters,no
2,"(39.0, 47.0]",fulfilled,False,card,VISA 16 digit,Vertex Bancorp,"(34.0, 45.0]",False,False,yahoo,com,digits_and_letters,no


In [22]:
from funpymodeling import freq_tbl

# Frequency table of the 'orderAmount' categories assigned via pd.cut with the reloaded bins.
freq_tbl(test['orderAmount'])

Unnamed: 0,orderAmount,frequency,percentage,cumulative_perc
0,"(30.0, 39.0]",98,0.157303,0.205021
1,"(39.0, 47.0]",97,0.155698,0.40795
2,"(9.999, 18.4]",96,0.154093,0.608787
3,"(18.4, 30.0]",96,0.154093,0.809623
4,"(47.0, 353.0]",91,0.146067,1.0


In [23]:
# Same frequency table for the qcut-binned column; it is expected to match the one for `test`.
freq_tbl(data['orderAmount'])

Unnamed: 0,orderAmount,frequency,percentage,cumulative_perc
0,"(30.0, 39.0]",98,0.157303,0.205021
1,"(39.0, 47.0]",97,0.155698,0.40795
2,"(9.999, 18.4]",96,0.154093,0.608787
3,"(18.4, 30.0]",96,0.154093,0.809623
4,"(47.0, 353.0]",91,0.146067,1.0
