# Conversión de categóricas a numéricas

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

In [2]:
import os
import sys


def find_project_root(cwd, notebooks_dirname='notebooks'):
    """Return the directory that should be put on ``sys.path``.

    Parameters
    ----------
    cwd : str
        Directory the notebook is running from.
    notebooks_dirname : str, default 'notebooks'
        Name of the folder that holds the notebooks. Change it if your
        notebooks live in a differently named folder.

    Returns
    -------
    str
        The parent of ``cwd`` when the last path component of ``cwd`` is
        exactly ``notebooks_dirname``; otherwise ``cwd`` unchanged.
    """
    # Compare only the final path component. A plain str.replace('/notebooks', '')
    # would also mangle unrelated components such as '/notebooks_old/' and only
    # works with '/' separators.
    if os.path.basename(cwd) == notebooks_dirname:
        return os.path.dirname(cwd)
    return cwd


PROJECT_ROOT = find_project_root(os.getcwd())

# Guarded append so re-running the cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [3]:
from utils.data import Datasets
from utils.eda import column_explore

In [4]:
# Build the project's dataset helper; `ds` is reused by later cells for feature helpers.
ds = Datasets()
# get_datasets() yields two objects, unpacked here as the cash-request table and the fees table.
cash, fees = ds.get_datasets()

In [5]:
# Null audit of the cash-request table.
missing_values = cash.isna()            # boolean mask: True where a value is absent
missing_counts = missing_values.sum()   # per-column number of True entries
print('Missing Values in Each Column:', missing_counts, sep='\n')

Missing Values in Each Column:
cash_request_id                   0
amount                            0
status                            0
created_at                        0
updated_at                        0
user_id                        2103
moderated_at                   7935
deleted_account_id            21866
reimbursement_date                0
cash_request_received_date     7681
money_back_date                7427
transfer_type                     0
send_at                        7329
recovery_status               20640
reco_creation                 20640
reco_last_update              20640
id_usuario                        0
dtype: int64


In [6]:
# Null audit of the fees table.
# NOTE: this rebinds `missing_values` / `missing_counts`, which previously held the cash results.
missing_values = fees.isna()            # boolean mask: True where a value is absent
missing_counts = missing_values.sum()   # per-column number of True entries
print('Missing Values in Each Column:', missing_counts, sep='\n')

Missing Values in Each Column:
id                     0
cash_request_id        0
type                   0
status                 0
category           18865
total_amount           0
reason                 0
created_at             0
updated_at             0
paid_at             5530
from_date          13295
to_date            13295
charge_moment          0
dtype: int64


In [7]:
# List the column labels of the cash-request table (displayed as the cell's output).
cash.columns

Index(['cash_request_id', 'amount', 'status', 'created_at', 'updated_at',
       'user_id', 'moderated_at', 'deleted_account_id', 'reimbursement_date',
       'cash_request_received_date', 'money_back_date', 'transfer_type',
       'send_at', 'recovery_status', 'reco_creation', 'reco_last_update',
       'id_usuario'],
      dtype='object')

In [8]:
# Break `created_at` down into calendar parts via the project helper.
# Judging by the displayed result, the returned frame carries the extra columns
# created_year, created_month, created_year_month, created_dayofweek and created_hour.
# NOTE(review): whether the helper also modifies `cash` in place is not visible here - confirm.
cash_extra = ds.desglose_created_at(cash)
# Bare expression: renders the (truncated) frame as the cell output.
cash_extra

Unnamed: 0,cash_request_id,amount,status,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,...,send_at,recovery_status,reco_creation,reco_last_update,id_usuario,created_year,created_month,created_year_month,created_dayofweek,created_hour
0,5,100.0,rejected,2019-12-10 19:05:21.596873+00:00,2019-12-11 16:47:42.407830+00:00,804.0,2019-12-11 16:47:42.405646+00:00,,2020-01-09 19:05:21.596363+00:00,NaT,...,NaT,,NaT,NaT,804,2019,12,2019-12,2,19
1,70,100.0,rejected,2019-12-10 19:50:12.347780+00:00,2019-12-11 14:24:22.900054+00:00,231.0,2019-12-11 14:24:22.897988+00:00,,2020-01-09 19:50:12.347780+00:00,NaT,...,NaT,,NaT,NaT,231,2019,12,2019-12,2,19
2,7,100.0,rejected,2019-12-10 19:13:35.825460+00:00,2019-12-11 09:46:59.779773+00:00,191.0,2019-12-11 09:46:59.777728+00:00,,2020-01-09 19:13:35.825041+00:00,NaT,...,NaT,,NaT,NaT,191,2019,12,2019-12,2,19
3,10,99.0,rejected,2019-12-10 19:16:10.880172+00:00,2019-12-18 14:26:18.136163+00:00,761.0,2019-12-18 14:26:18.128407+00:00,,2020-01-09 19:16:10.879606+00:00,NaT,...,NaT,,NaT,NaT,761,2019,12,2019-12,2,19
4,1594,100.0,rejected,2020-05-06 09:59:38.877376+00:00,2020-05-07 09:21:55.340080+00:00,7686.0,2020-05-07 09:21:55.320193+00:00,,2020-06-05 22:00:00+00:00,NaT,...,NaT,,NaT,NaT,7686,2020,5,2020-5,3,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23965,20616,100.0,money_back,2020-10-12 13:54:11.686225+00:00,2021-02-06 20:17:49.292493+00:00,13681.0,NaT,,2021-02-06 11:00:00+00:00,2020-10-13,...,2020-10-12 13:54:24.352856+00:00,,NaT,NaT,13681,2020,10,2020-10,1,13
23966,25243,50.0,money_back,2020-10-27 14:41:25.734910+00:00,2020-12-18 13:15:40.843946+00:00,,NaT,30367.0,2020-11-03 22:00:00+00:00,2020-10-28,...,2020-10-27 14:41:57.901946+00:00,completed,2020-11-12 23:20:41.928788+00:00,2020-12-01 13:26:53.815504+00:00,30367,2020,10,2020-10,2,14
23967,22357,100.0,money_back,2020-10-20 07:58:04.006937+00:00,2021-02-05 12:19:30.656816+00:00,82122.0,NaT,,2021-02-05 11:00:00+00:00,2020-10-21,...,2020-10-20 07:58:14.171553+00:00,,NaT,NaT,82122,2020,10,2020-10,2,7
23968,20256,100.0,money_back,2020-10-10 05:40:55.700422+00:00,2021-02-05 13:14:19.707627+00:00,64517.0,NaT,,2021-02-05 11:00:00+00:00,2020-10-12,...,2020-10-10 05:41:23.368363+00:00,,NaT,NaT,64517,2020,10,2020-10,6,5


In [10]:
# Encode the categorical cash columns as indicator (dummy) columns via the project helper.
# Judging by the displayed result, `status` no longer appears and indicator columns
# prefixed cstatus_, ctranstype_ and crecostatus_ are present.
# NOTE(review): whether the helper also modifies `cash` in place is not visible here - confirm.
cash_dummies = ds.get_dummies_and_drop_cols(cash)
# Bare expression: renders the (truncated) frame as the cell output.
cash_dummies

Unnamed: 0,cash_request_id,amount,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,money_back_date,...,cstatus_canceled,cstatus_direct_debit_rejected,cstatus_direct_debit_sent,cstatus_money_back,cstatus_rejected,cstatus_transaction_declined,ctranstype_regular,crecostatus_completed,crecostatus_pending,crecostatus_pending_direct_debit
0,5,100.0,2019-12-10 19:05:21.596873+00:00,2019-12-11 16:47:42.407830+00:00,804.0,2019-12-11 16:47:42.405646+00:00,,2020-01-09 19:05:21.596363+00:00,NaT,NaT,...,0,0,0,0,1,0,1,0,0,0
1,70,100.0,2019-12-10 19:50:12.347780+00:00,2019-12-11 14:24:22.900054+00:00,231.0,2019-12-11 14:24:22.897988+00:00,,2020-01-09 19:50:12.347780+00:00,NaT,NaT,...,0,0,0,0,1,0,1,0,0,0
2,7,100.0,2019-12-10 19:13:35.825460+00:00,2019-12-11 09:46:59.779773+00:00,191.0,2019-12-11 09:46:59.777728+00:00,,2020-01-09 19:13:35.825041+00:00,NaT,NaT,...,0,0,0,0,1,0,1,0,0,0
3,10,99.0,2019-12-10 19:16:10.880172+00:00,2019-12-18 14:26:18.136163+00:00,761.0,2019-12-18 14:26:18.128407+00:00,,2020-01-09 19:16:10.879606+00:00,NaT,NaT,...,0,0,0,0,1,0,1,0,0,0
4,1594,100.0,2020-05-06 09:59:38.877376+00:00,2020-05-07 09:21:55.340080+00:00,7686.0,2020-05-07 09:21:55.320193+00:00,,2020-06-05 22:00:00+00:00,NaT,NaT,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23965,20616,100.0,2020-10-12 13:54:11.686225+00:00,2021-02-06 20:17:49.292493+00:00,13681.0,NaT,,2021-02-06 11:00:00+00:00,2020-10-13,2021-02-06 20:17:49.257521+00:00,...,0,0,0,1,0,0,0,0,0,0
23966,25243,50.0,2020-10-27 14:41:25.734910+00:00,2020-12-18 13:15:40.843946+00:00,,NaT,30367.0,2020-11-03 22:00:00+00:00,2020-10-28,2020-12-01 13:26:53.787672+00:00,...,0,0,0,1,0,0,0,1,0,0
23967,22357,100.0,2020-10-20 07:58:04.006937+00:00,2021-02-05 12:19:30.656816+00:00,82122.0,NaT,,2021-02-05 11:00:00+00:00,2020-10-21,2021-02-05 12:19:30.626289+00:00,...,0,0,0,1,0,0,0,0,0,0
23968,20256,100.0,2020-10-10 05:40:55.700422+00:00,2021-02-05 13:14:19.707627+00:00,64517.0,NaT,,2021-02-05 11:00:00+00:00,2020-10-12,2021-02-05 13:14:19.689906+00:00,...,0,0,0,1,0,0,0,0,0,0


In [11]:
# Encode the categorical fee columns as indicator (dummy) columns via the same project helper.
# Judging by the displayed result, type, status, category and charge_moment no longer appear
# and indicator columns prefixed ftype_, fstatus_, fcategory_ and fchargemoment_ are present.
# NOTE(review): whether the helper also modifies `fees` in place is not visible here - confirm.
fees_dummies = ds.get_dummies_and_drop_cols(fees)
# Bare expression: renders the (truncated) frame as the cell output.
fees_dummies

Unnamed: 0,id,cash_request_id,total_amount,reason,created_at,updated_at,paid_at,from_date,to_date,ftype_instant_payment,ftype_postpone,fstatus_cancelled,fstatus_confirmed,fstatus_rejected,fcategory_ninguna,fcategory_rejected_direct_debit,fchargemoment_before
0,6537,14941,5.0,Instant Payment Cash Request 14941,2020-09-07 10:47:27.423150+00:00,2020-10-13 14:25:09.396112+00:00,2020-12-17 14:50:07.470110+00:00,NaT,NaT,1,0,0,0,1,1,0,0
1,6961,11714,5.0,rejected direct debit,2020-09-09 20:51:17.998653+00:00,2020-10-13 14:25:15.537063+00:00,2020-12-08 17:13:10.459080+00:00,NaT,NaT,0,0,0,0,0,0,1,0
2,16296,23371,5.0,Instant Payment Cash Request 23371,2020-10-23 10:10:58.352972+00:00,2020-10-23 10:10:58.352994+00:00,2020-11-04 19:34:37.432910+00:00,NaT,NaT,1,0,0,0,0,1,0,0
3,20775,26772,5.0,Instant Payment Cash Request 26772,2020-10-31 15:46:53.643958+00:00,2020-10-31 15:46:53.643982+00:00,2020-11-19 05:09:22.500223+00:00,NaT,NaT,1,0,0,0,0,1,0,0
4,11242,19350,5.0,Instant Payment Cash Request 19350,2020-10-06 08:20:17.170432+00:00,2020-10-13 14:25:03.267983+00:00,2020-11-02 14:45:20.355598+00:00,NaT,NaT,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21056,12372,20262,5.0,Instant Payment Cash Request 20262,2020-10-10 06:42:22.822743+00:00,2020-10-13 14:25:04.180490+00:00,2020-11-17 05:14:00.080854+00:00,NaT,NaT,1,0,0,0,1,1,0,0
21057,20768,26764,5.0,Instant Payment Cash Request 26764,2020-10-31 15:24:18.680694+00:00,2020-10-31 15:24:18.680715+00:00,2020-12-16 07:10:54.697639+00:00,NaT,NaT,1,0,0,0,1,1,0,0
21058,18779,25331,5.0,Instant Payment Cash Request 25331,2020-10-27 17:28:51.749177+00:00,2020-10-27 17:28:51.749200+00:00,2020-11-18 04:35:42.915511+00:00,NaT,NaT,1,0,0,0,1,1,0,0
21059,16542,23628,5.0,Instant Payment Cash Request 23628,2020-10-23 16:27:52.047457+00:00,2020-10-23 16:27:52.047486+00:00,2020-12-18 05:18:01.465317+00:00,NaT,NaT,1,0,0,0,1,1,0,0


In [9]:
# Give every fee column a 'fee_' prefix so its name cannot clash with a cash column after the join.
fees_prefixed = fees.add_prefix('fee_')

# Outer join on the cash-request id: rows without a match on either side are kept (filled with NaN).
# Author's note: the result has 32098 rows.
# NOTE(review): this cell has execution count 9 but is placed after cells 10 and 11. If
# get_dummies_and_drop_cols changes `cash`/`fees` in place, a top-to-bottom run may not
# reproduce this result - confirm.
merged = cash.merge(
    fees_prefixed,
    how='outer',
    left_on='cash_request_id',
    right_on='fee_cash_request_id',
)

In [12]:
# Fee records per cash request, restricted to requests whose status is 'money_back'.
money_back_rows = merged.loc[merged['status'].eq('money_back')]
# count() skips missing fee_id values, so requests without any fee come out as 0.
incidents = money_back_rows.groupby('cash_request_id')['fee_id'].count()
# Display the requests with the most fee records first.
incidents.sort_values(ascending=False)

cash_request_id
12225.0    35
5006.0     28
4410.0     24
12452.0    23
11376.0    21
           ..
7249.0      0
7248.0      0
7246.0      0
7243.0      0
4.0         0
Name: fee_id, Length: 16397, dtype: int64

In [13]:
# Show every merged row for cash request 12225, the id with the highest fee count (35)
# in the ranking displayed by the previous cell. The id is a float in `merged`.
merged.loc[merged['cash_request_id'].eq(12225.0)]

Unnamed: 0,cash_request_id,amount,status,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,...,fee_status,fee_category,fee_total_amount,fee_reason,fee_created_at,fee_updated_at,fee_paid_at,fee_from_date,fee_to_date,fee_charge_moment
13466,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,accepted,,5.0,Postpone Cash Request 12225,2020-08-29 13:54:26.955268+00:00,2020-10-13 14:25:07.874678+00:00,2020-08-29 13:54:31.574311+00:00,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,before
13467,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,accepted,,5.0,Instant Payment Cash Request 12225,2020-08-21 12:42:29.814857+00:00,2020-10-13 14:25:01.925030+00:00,2020-10-02 11:27:24.787458+00:00,NaT,NaT,after
13468,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:20.603563+00:00,2020-10-13 14:25:06.217143+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 17:47:16.318000+00:00,after
13469,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:46.627533+00:00,2020-10-13 14:25:06.286355+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 22:00:00+00:00,after
13470,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:59.117769+00:00,2020-10-13 14:25:06.367464+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-13 22:00:00+00:00,after
13471,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:48:13.216002+00:00,2020-10-13 14:25:06.441736+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-19 22:00:00+00:00,after
13472,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:48:26.285615+00:00,2020-10-13 14:25:06.499704+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 17:48:21.881000+00:00,after
13473,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-27 16:53:09.919637+00:00,2020-10-13 14:25:14.379708+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-10 22:00:00+00:00,after
13474,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-08-27 20:50:23.440630+00:00,2020-10-13 14:25:17.256643+00:00,NaT,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,after
13475,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-08-27 20:48:23.860125+00:00,2020-10-13 14:25:17.268587+00:00,NaT,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,after
