# Conversión de categóricas a numéricas

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd

In [2]:
import sys, os
sys.path.append(os.getcwd().replace('/notebooks',''))  # Replace with current directory name

In [3]:
from utils.data import Datasets
from utils.eda import column_explore

In [4]:
ds = Datasets()
cash, fees = ds.get_datasets()

In [12]:
# Check for missing values in the DataFrame
missing_values = pd.isnull(cash)
# Count missing values in each column
missing_counts = missing_values.sum()
print('Missing Values in Each Column:')
print(missing_counts)

Missing Values in Each Column:
cash_request_id                   0
amount                            0
status                            0
created_at                        0
updated_at                        0
user_id                        2103
moderated_at                   7935
deleted_account_id            21866
reimbursement_date                0
cash_request_received_date     7681
money_back_date                7427
transfer_type                     0
send_at                        7329
recovery_status               20640
reco_creation                 20640
reco_last_update              20640
id_usuario                        0
dtype: int64


In [13]:
# Check for missing values in the DataFrame
missing_values = pd.isnull(fees)
# Count missing values in each column
missing_counts = missing_values.sum()
print('Missing Values in Each Column:')
print(missing_counts)

Missing Values in Each Column:
id                     0
cash_request_id        0
type                   0
status                 0
category           18865
total_amount           0
reason                 0
created_at             0
updated_at             0
paid_at             5530
from_date          13295
to_date            13295
charge_moment          0
dtype: int64


In [7]:
cash.columns

Index(['cash_request_id', 'amount', 'status', 'created_at', 'updated_at',
       'user_id', 'moderated_at', 'deleted_account_id', 'reimbursement_date',
       'cash_request_received_date', 'money_back_date', 'transfer_type',
       'send_at', 'recovery_status', 'reco_creation', 'reco_last_update',
       'id_usuario'],
      dtype='object')

In [14]:
fees.columns

Index(['id', 'cash_request_id', 'type', 'status', 'category', 'total_amount',
       'reason', 'created_at', 'updated_at', 'paid_at', 'from_date', 'to_date',
       'charge_moment'],
      dtype='object')

In [38]:
cash = ds.desglose_created_at(cash)

In [39]:
subset_cash=['cash_request_id','amount','status','transfer_type','recovery_status','id_usuario','created_year_month']
cash_ss = cash[subset_cash]
cash_ss

Unnamed: 0,cash_request_id,amount,status,transfer_type,recovery_status,id_usuario,created_year_month
0,5,100.0,rejected,regular,,804,2019-12
1,70,100.0,rejected,regular,,231,2019-12
2,7,100.0,rejected,regular,,191,2019-12
3,10,99.0,rejected,regular,,761,2019-12
4,1594,100.0,rejected,regular,,7686,2020-5
...,...,...,...,...,...,...,...
23965,20616,100.0,money_back,instant,,13681,2020-10
23966,25243,50.0,money_back,instant,completed,30367,2020-10
23967,22357,100.0,money_back,instant,,82122,2020-10
23968,20256,100.0,money_back,instant,,64517,2020-10


In [40]:
subset_fees=['cash_request_id','type', 'status', 'category', 'total_amount','charge_moment']
fees_ss = fees[subset_fees]
fees_ss

Unnamed: 0,cash_request_id,type,status,category,total_amount,charge_moment
0,14941,instant_payment,rejected,,5.0,after
1,11714,incident,accepted,rejected_direct_debit,5.0,after
2,23371,instant_payment,accepted,,5.0,after
3,26772,instant_payment,accepted,,5.0,after
4,19350,instant_payment,accepted,,5.0,after
...,...,...,...,...,...,...
21056,20262,instant_payment,rejected,,5.0,after
21057,26764,instant_payment,rejected,,5.0,after
21058,25331,instant_payment,rejected,,5.0,after
21059,23628,instant_payment,rejected,,5.0,after


In [41]:
merged = pd.merge(cash_ss, fees_ss.add_prefix('fee_'), left_on='cash_request_id', right_on='fee_cash_request_id', how='outer') # 32098 rows

In [78]:
# Beneficio por cliente
metricas_usuario = merged.groupby(['created_year_month','id_usuario','status','fee_charge_moment']).agg({
    'cash_request_id': 'count', 
    'amount': 'sum',
    'fee_total_amount': 'sum'})
metricas_usuario


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cash_request_id,amount,fee_total_amount
created_year_month,id_usuario,status,fee_charge_moment,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10,47.0,money_back,after,2,6.0,10.0
2020-10,53.0,money_back,after,1,100.0,5.0
2020-10,73.0,money_back,after,1,100.0,5.0
2020-10,78.0,money_back,after,1,50.0,5.0
2020-10,87.0,money_back,after,1,50.0,5.0
...,...,...,...,...,...,...
2020-9,57130.0,money_back,before,1,100.0,5.0
2020-9,57178.0,money_back,after,1,50.0,5.0
2020-9,57231.0,money_back,after,1,100.0,5.0
2020-9,57289.0,money_back,after,1,100.0,5.0


In [75]:
# top10 = metricas_usuario.sort_values(by='amount', ascending=False).iloc[:10]
# top10
#metricas_usuario[metricas_usuario['id_usuario'].isin(top10['id_usuario'])].sort_values(by='created_year_month', ascending=True)
metricas_usuario.sort_values(by='cash_request_id', ascending=False).iloc[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cash_request_id,amount,fee_total_amount
created_year_month,id_usuario,status,fee_charge_moment,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-8,16391.0,money_back,after,34,3400.0,170.0
2020-6,15593.0,money_back,after,25,2500.0,125.0
2020-6,7731.0,money_back,after,23,2300.0,115.0
2020-8,23823.0,money_back,after,21,1050.0,105.0
2020-8,528.0,money_back,after,19,1900.0,95.0
2020-5,11648.0,money_back,after,18,1800.0,90.0
2020-8,21934.0,money_back,after,16,1600.0,80.0
2020-7,21706.0,money_back,after,15,1500.0,75.0
2020-8,20871.0,money_back,after,15,1500.0,75.0
2020-6,4558.0,direct_debit_rejected,after,15,750.0,75.0


In [76]:
metricas_usuario.sort_values(by='amount', ascending=False).iloc[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cash_request_id,amount,fee_total_amount
created_year_month,id_usuario,status,fee_charge_moment,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-8,16391.0,money_back,after,34,3400.0,170.0
2020-6,15593.0,money_back,after,25,2500.0,125.0
2020-6,7731.0,money_back,after,23,2300.0,115.0
2020-8,528.0,money_back,after,19,1900.0,95.0
2020-5,11648.0,money_back,after,18,1800.0,90.0
2020-8,21934.0,money_back,after,16,1600.0,80.0
2020-7,21706.0,money_back,after,15,1500.0,75.0
2020-8,20871.0,money_back,after,15,1500.0,75.0
2020-9,54879.0,direct_debit_rejected,after,14,1400.0,70.0
2020-9,50026.0,money_back,after,14,1400.0,70.0


In [77]:
metricas_usuario.sort_values(by='fee_total_amount', ascending=False).iloc[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,cash_request_id,amount,fee_total_amount
created_year_month,id_usuario,status,fee_charge_moment,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-8,16391.0,money_back,after,34,3400.0,170.0
2020-6,15593.0,money_back,after,25,2500.0,125.0
2020-6,7731.0,money_back,after,23,2300.0,115.0
2020-8,23823.0,money_back,after,21,1050.0,105.0
2020-8,528.0,money_back,after,19,1900.0,95.0
2020-5,11648.0,money_back,after,18,1800.0,90.0
2020-8,21934.0,money_back,after,16,1600.0,80.0
2020-7,21706.0,money_back,after,15,1500.0,75.0
2020-8,20871.0,money_back,after,15,1500.0,75.0
2020-6,4558.0,direct_debit_rejected,after,15,750.0,75.0


In [25]:
cash_dummies = ds.get_dummies_and_drop_cols(cash)
fees_dummies = ds.get_dummies_and_drop_cols(fees_ss)

In [26]:
merged = pd.merge(cash_dummies, fees.add_prefix('fee_'), left_on='cash_request_id', right_on='fee_cash_request_id', how='outer') # 32098 rows

In [None]:
# incidents = merged[merged['status']=='money_back'].groupby('cash_request_id')['fee_id'].count()
# incidents.sort_values(ascending=False)

cash_request_id
12225.0    35
5006.0     28
4410.0     24
12452.0    23
11376.0    21
           ..
7249.0      0
7248.0      0
7246.0      0
7243.0      0
4.0         0
Name: fee_id, Length: 16397, dtype: int64

In [None]:
# merged[merged['cash_request_id']==12225.0]

Unnamed: 0,cash_request_id,amount,status,created_at,updated_at,user_id,moderated_at,deleted_account_id,reimbursement_date,cash_request_received_date,...,fee_status,fee_category,fee_total_amount,fee_reason,fee_created_at,fee_updated_at,fee_paid_at,fee_from_date,fee_to_date,fee_charge_moment
13466,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,accepted,,5.0,Postpone Cash Request 12225,2020-08-29 13:54:26.955268+00:00,2020-10-13 14:25:07.874678+00:00,2020-08-29 13:54:31.574311+00:00,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,before
13467,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,accepted,,5.0,Instant Payment Cash Request 12225,2020-08-21 12:42:29.814857+00:00,2020-10-13 14:25:01.925030+00:00,2020-10-02 11:27:24.787458+00:00,NaT,NaT,after
13468,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:20.603563+00:00,2020-10-13 14:25:06.217143+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 17:47:16.318000+00:00,after
13469,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:46.627533+00:00,2020-10-13 14:25:06.286355+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 22:00:00+00:00,after
13470,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:47:59.117769+00:00,2020-10-13 14:25:06.367464+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-13 22:00:00+00:00,after
13471,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:48:13.216002+00:00,2020-10-13 14:25:06.441736+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-19 22:00:00+00:00,after
13472,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-28 17:48:26.285615+00:00,2020-10-13 14:25:06.499704+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-12 17:48:21.881000+00:00,after
13473,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-09-27 16:53:09.919637+00:00,2020-10-13 14:25:14.379708+00:00,NaT,2020-09-25 22:00:00+00:00,2020-10-10 22:00:00+00:00,after
13474,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-08-27 20:50:23.440630+00:00,2020-10-13 14:25:17.256643+00:00,NaT,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,after
13475,12225.0,100.0,money_back,2020-08-12 15:38:54.262170+00:00,2020-12-18 13:09:47.053230+00:00,16391.0,2020-08-12 21:13:52.196320+00:00,,2020-09-25 22:00:00+00:00,2020-08-18,...,cancelled,,5.0,Postpone Cash Request 12225,2020-08-27 20:48:23.860125+00:00,2020-10-13 14:25:17.268587+00:00,NaT,2020-09-02 22:00:00+00:00,2020-09-25 22:00:00+00:00,after
