# In this Notebook

I'll make some initial EDA, trying to understand the tables we got, create some visualizations and metrics that could help us.

In [2]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

pd.options.display.float_format = '{:.6f}'.format

# Reading the Tables

In this section, before even making sanity checks on the table, I'll see what each table is, if it is imported correctly and ask myself some questions about it.

### Accounts

In [3]:
accounts = pd.read_csv('files/processed_accounts.csv')
accounts.head()

Unnamed: 0,id,owner_id,owner_document,owner_type,inserted_at
0,ac8a1faa-3748-4ef7-b140-8fe818cdc974,d2e3e3b7-e98d-4ef3-8a51-83db807b9a94,5e998c51-53cc-48f0-aef4-45ec12a466be,user,2017-03-16 16:33:43.603803
1,b2ccbb65-19b0-43fb-891f-faf785d00ade,e01a38b9-5e4c-43fb-9aef-d866054b00bd,4ca9177f-35cc-4067-9d0b-af621aaa2e65,user,2017-03-25 14:01:25.919929
2,c890a24f-f539-4dfe-a3d3-fe2c11c0aed6,c66561a4-66cc-4e2c-8379-3fd060b0e1ba,7df16b0b-0cda-40b8-91b5-5ed4d371a834,user,2017-03-25 14:04:11.917901
3,678269b6-7b71-48a2-82fb-19018432abf9,49a50a56-3f7a-435d-80a1-6e459c3948e8,d74146d2-af1c-4396-86f7-c98ccc177411,user,2017-03-27 13:59:43.513024
4,c6f2982a-4c2a-4dc2-b237-2c683235c3d8,596160cd-c41d-4689-bd4d-a055db788931,23d532c1-855b-42c2-a011-2e64b7078d4a,user,2017-03-27 14:00:58.316963


In [4]:
accounts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 293002 entries, 0 to 293001
Data columns (total 5 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   id              293002 non-null  object
 1   owner_id        293002 non-null  object
 2   owner_document  293002 non-null  object
 3   owner_type      293002 non-null  object
 4   inserted_at     293002 non-null  object
dtypes: object(5)
memory usage: 11.2+ MB


In [5]:
## Curious to see what the owner_type is about

accounts.owner_type.value_counts(dropna=False)

organization    197777
user             95225
Name: owner_type, dtype: int64

Seems to be only an table talking about account creation and what is the type of this instance.

### Labeled Transactions

In [6]:
labeled_transactions = pd.read_csv('files/processed_labeled_transactions.csv')
labeled_transactions.head()

Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,balance,requested_at,processed_at,is_fraud
0,37e105f1-003f-465b-8179-e7705b12a24d,c1820b88-ac54-444f-b9a0-70de30124d95,a7b4f041-ef6a-4e17-baf9-cd471ef0f484,ef1756b6-354c-4ea4-b175-4eba835b60a5,38e6f7c9-e935-4c7c-8915-ef923a5ca914,cash_out_type_1,9.9e-05,0.000167,2018-11-21 15:41:23,2018-11-21 15:41:23,
1,72d66c19-130b-4d51-8ed7-e3b6c0b65034,9400a9f4-6ab8-40cd-b806-1569b693a88e,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.3e-05,0.000533,,2018-10-22 23:43:38,
2,31abd386-09f7-4f18-9b01-d2682e3c2c65,80adf85b-56cc-4730-903d-e63c75ae839a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.6e-05,0.000347,2018-11-19 16:11:57,2018-11-19 16:12:00,
3,27543733-e8a8-4593-88fb-921e205a6e0c,6edfea5c-4125-49fa-bbe7-b39433f6e49a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,e573e2d3-39af-4f71-97df-bf400e854a8a,cash_out_type_3,1e-06,0.000572,,2018-10-29 15:20:49,
4,6aff8f2d-0730-4f94-846b-14c059550ab2,88ddff46-6714-4a47-9407-a7d54996dde1,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,88f94e8a-0145-489f-9f1d-d4755456e965,cash_out_type_3,4.5e-05,0.000346,,2018-10-31 12:06:08,


In [7]:
labeled_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 940935 entries, 0 to 940934
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   operation_id           940766 non-null  object 
 1   request_id             940935 non-null  object 
 2   account_id             940935 non-null  object 
 3   device_id              940935 non-null  object 
 4   counterparty_document  940935 non-null  object 
 5   operation_type         940935 non-null  object 
 6   amount                 940935 non-null  float64
 7   balance                886975 non-null  float64
 8   requested_at           396444 non-null  object 
 9   processed_at           940935 non-null  object 
 10  is_fraud               1535 non-null    object 
dtypes: float64(2), object(9)
memory usage: 79.0+ MB


In [8]:
## Operation Types

labeled_transactions.operation_type.value_counts(dropna=False)

cash_out_type_2    665133
cash_out_type_3    260343
cash_out_type_1     15449
cash_out_type_6        10
Name: operation_type, dtype: int64

In [9]:
## frauds

labeled_transactions.is_fraud.value_counts(dropna=False)

NaN     939400
True      1535
Name: is_fraud, dtype: int64

In [10]:
## Curious to see if the cashouttype is related to default

In [11]:
lt2 = labeled_transactions.copy(deep=True)

lt2['is_fraud'] = lt2['is_fraud'].fillna(False).astype(int)

In [12]:
lt2['is_fraud'].sum()

1535

In [13]:
lt2.groupby('operation_type').is_fraud.mean()

operation_type
cash_out_type_1   0.003301
cash_out_type_2   0.002097
cash_out_type_3   0.000342
cash_out_type_6   0.000000
Name: is_fraud, dtype: float64

There seems to be a relation between frauds and operation type, but since types 3 and 6 are so rare, we should use better metrics to understand this relation. Lets put a pin in that and keep exploring the table.

In [14]:
lt2.shape

(940935, 11)

In [15]:
lt2.amount.mean()

1.74328478053833e-05

In [16]:
lt2.describe()

Unnamed: 0,amount,balance,is_fraud
count,940935.0,886975.0,940935.0
mean,1.7e-05,0.000117,0.001631
std,0.001566,0.000592,0.040357
min,0.0,0.0,0.0
25%,1e-06,5e-06,0.0
50%,4e-06,1.4e-05,0.0
75%,9e-06,4.3e-05,0.0
max,1.0,0.076443,1.0


In [17]:
# Histplots would give me an better understanding but they are tooking quite a tool on the SageMakers CPU. I'll keep a pin on it and after the feature engineering step, I'll evaluate things with better strategies.

lt2.groupby('is_fraud').agg({'amount':'mean', 'balance':'mean'})

Unnamed: 0_level_0,amount,balance
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.7e-05,0.000117
1,1.9e-05,7.8e-05


There seems to be a weak relation between balance and frauds, and the amount seems to had undergone some kind of min-max normalization method.

In [18]:
print("dates are equal: ", (lt2.requested_at == lt2.processed_at).sum())
print("Requests before processes: ", (lt2.requested_at < lt2.processed_at).sum())
print("Requests after processes: ", (lt2.requested_at > lt2.processed_at).sum())
print("Some date is missing: ", (lt2[['requested_at', 'processed_at']].isna().max(axis=1).sum()))

dates are equal:  118766
Requests before processes:  277675
Requests after processes:  3
Some date is missing:  544491


In [19]:
118766+277675+3+544491 == lt2.shape[0]

True

In [20]:
lt2[lt2.requested_at > lt2.processed_at]

Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,balance,requested_at,processed_at,is_fraud
351364,80345c1a-8185-4a10-a348-69d671b5dc4a,f8279123-4369-4176-acbb-c6117fb90030,a2205575-7eee-4134-a44d-51c1b488b1f3,0e190681-dc83-438b-ac6c-4b348f6febf3,9d4b9857-3bfe-44bc-90e7-4203f86926ac,cash_out_type_3,0.0,2.1e-05,2018-11-16 08:14:49,2018-11-16 08:14:30,0
488996,1da5656c-9339-4de0-b4f9-ee147a56790f,1f9a7462-48bc-412a-87ca-e46848fd74f1,90d722b7-2e94-4207-84fb-5df9fc1a3666,7ef8ca6f-3711-483d-b3f0-2c1dc7b80611,5266218d-af4f-4141-b774-477a171b188d,cash_out_type_3,2e-06,6e-06,2018-11-16 07:31:50,2018-11-16 07:31:35,0
557011,6f6dd45b-ed5d-453f-903d-8e849130233f,262ced8d-697e-4a41-a632-49c295c91259,89b7744e-b7a7-489b-b021-5139e74f09de,4a8f2fe4-953e-4283-ab07-d6173e740f6e,f4e053c5-5c26-4a9c-b3e8-6893f7e9d665,cash_out_type_3,2e-06,4.6e-05,2018-11-16 07:35:50,2018-11-16 07:35:34,0


In [21]:
lt2.head()

Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,balance,requested_at,processed_at,is_fraud
0,37e105f1-003f-465b-8179-e7705b12a24d,c1820b88-ac54-444f-b9a0-70de30124d95,a7b4f041-ef6a-4e17-baf9-cd471ef0f484,ef1756b6-354c-4ea4-b175-4eba835b60a5,38e6f7c9-e935-4c7c-8915-ef923a5ca914,cash_out_type_1,9.9e-05,0.000167,2018-11-21 15:41:23,2018-11-21 15:41:23,0
1,72d66c19-130b-4d51-8ed7-e3b6c0b65034,9400a9f4-6ab8-40cd-b806-1569b693a88e,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.3e-05,0.000533,,2018-10-22 23:43:38,0
2,31abd386-09f7-4f18-9b01-d2682e3c2c65,80adf85b-56cc-4730-903d-e63c75ae839a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.6e-05,0.000347,2018-11-19 16:11:57,2018-11-19 16:12:00,0
3,27543733-e8a8-4593-88fb-921e205a6e0c,6edfea5c-4125-49fa-bbe7-b39433f6e49a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,e573e2d3-39af-4f71-97df-bf400e854a8a,cash_out_type_3,1e-06,0.000572,,2018-10-29 15:20:49,0
4,6aff8f2d-0730-4f94-846b-14c059550ab2,88ddff46-6714-4a47-9407-a7d54996dde1,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,88f94e8a-0145-489f-9f1d-d4755456e965,cash_out_type_3,4.5e-05,0.000346,,2018-10-31 12:06:08,0


In [22]:
(pd.to_datetime(lt2.processed_at) - pd.to_datetime(lt2.requested_at)).describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count                       396444
mean     0 days 02:41:21.237400490
std      0 days 19:48:52.547760912
min              -1 days +23:59:41
1%                 0 days 00:00:00
10%                0 days 00:00:00
25%                0 days 00:00:00
50%                0 days 00:00:01
75%                0 days 00:00:03
90%                0 days 00:00:07
95%                0 days 13:40:07
99%         2 days 11:48:56.560000
max               81 days 21:50:37
dtype: object

In [23]:
lt2.isna().mean()

operation_id            0.000180
request_id              0.000000
account_id              0.000000
device_id               0.000000
counterparty_document   0.000000
operation_type          0.000000
amount                  0.000000
balance                 0.057347
requested_at            0.578670
processed_at            0.000000
is_fraud                0.000000
dtype: float64

Dates seems to be behaving quite normally, with most transactions having the same request and processed times (or virtually the same). The 3 times the request happened after the processing is some kind of bad data so irrelevant we could either ignore it or remove it.

Basically this is the heart of the case, having the label we are going to use to model our binary classifier. It has a lot of ID's, two timestamps that are virtually the same whenever they exists (we can create features regarding their similarity still), an amount and a balance that ranges from 0 to 1 and from 0 to ~0.07 respectively.

### Organizations Metadata

In [24]:
organizations_metadata = pd.read_csv('files/processed_organizations_metadata.csv')
organizations_metadata.head()

Unnamed: 0,id,organization_id,type,value,inserted_at,updated_at
0,8a442911-65db-42a3-8063-2e8e2ddab984,417b2d7c-8a26-483b-920a-cc283095d044,user_is_partner,"{""mei"": false, ""partner"": true, ""user_id"": ""f3...",2018-11-25 02:31:05.472505,2018-11-25 02:31:05.472505
1,fa84c1d4-9362-472f-8ef9-aa14768deb22,417b2d7c-8a26-483b-920a-cc283095d044,organization_type,ME,2018-11-25 02:31:05.518456,2018-11-25 02:31:05.518456
2,089a96b0-8e31-4303-9b8f-e8eabd126b51,f4745c03-2491-49f1-953b-461fc7f72cf5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""dcd...",2018-11-25 02:36:04.694632,2018-11-25 02:36:04.694632
3,ebe81aa1-7076-47a9-a929-0555167b6d35,f4745c03-2491-49f1-953b-461fc7f72cf5,organization_type,ME,2018-11-25 02:36:04.605196,2018-11-25 02:36:04.605196
4,b301ec68-9042-4a74-88f4-82b3256d0137,b208f531-9beb-44ae-991c-effca11da1e5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""5c9...",2018-11-25 02:41:44.953192,2018-11-25 02:41:44.953192


In [25]:
organizations_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292433 entries, 0 to 292432
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   id               292433 non-null  object
 1   organization_id  292433 non-null  object
 2   type             292433 non-null  object
 3   value            271191 non-null  object
 4   inserted_at      292433 non-null  object
 5   updated_at       292433 non-null  object
dtypes: object(6)
memory usage: 13.4+ MB


In [26]:
organizations_metadata.type.value_counts(dropna=False)

organization_type    244235
user_is_partner       48198
Name: type, dtype: int64

In [27]:
organizations_metadata[organizations_metadata.type == 'organization_type'].head()

Unnamed: 0,id,organization_id,type,value,inserted_at,updated_at
1,fa84c1d4-9362-472f-8ef9-aa14768deb22,417b2d7c-8a26-483b-920a-cc283095d044,organization_type,ME,2018-11-25 02:31:05.518456,2018-11-25 02:31:05.518456
3,ebe81aa1-7076-47a9-a929-0555167b6d35,f4745c03-2491-49f1-953b-461fc7f72cf5,organization_type,ME,2018-11-25 02:36:04.605196,2018-11-25 02:36:04.605196
5,b0c366c5-2901-4f68-8bb4-92a612fe9350,b208f531-9beb-44ae-991c-effca11da1e5,organization_type,ME,2018-11-25 02:41:44.862583,2018-11-25 02:41:44.862583
7,2c7da962-0aee-4603-b1ab-d8b0c81f2283,02b9eb4f-e25e-4ac7-a4c7-a3e0cef53968,organization_type,ME,2018-11-25 02:48:19.247844,2018-11-25 02:48:19.247844
9,f6922f73-302a-40dc-9456-35a635287759,c05a9e40-a8e7-4ebf-a72c-06c00f1a8f89,organization_type,ME,2018-11-25 02:55:30.487795,2018-11-25 02:55:30.487795


In [28]:
organizations_metadata[organizations_metadata.type == 'organization_type'].value.value_counts(dropna=False)

ME              202518
NaN              21242
EPP              12655
Other             7817
Not informed         3
Name: value, dtype: int64

In [29]:
organizations_metadata[~(organizations_metadata.type == 'organization_type')].value.value_counts(dropna=False).head()

{"mei": false, "partner": true, "user_id": "4364fecc-17ad-4b14-a30e-11b843029638"}                      12
{"mei": false, "partner": false, "user_id": "d3566696-9b5a-4e75-b615-5c086317e18b"}                      8
{"mei": false, "partner": true, "user_id": "749678c1-484f-44fe-a4c6-c95784ab900c"}                       8
{"mei": false, "eireli": false, "partner": false, "user_id": "e7132bd3-64c8-40a1-a6a7-110bd96a45a9"}     6
{"mei": false, "partner": false, "user_id": "3f1fcbca-11c4-4b79-b3b0-4e9fe2d2b52c"}                      5
Name: value, dtype: int64

Ok, they did not make it easy for me here. "value" seems to mean different things whenever the type changes, so one of the first things I'll do when I'm working on this table is separating it in two separate entities.

In [30]:
omot = organizations_metadata[(organizations_metadata.type == 'organization_type')].copy(deep=True)
omup = organizations_metadata[~(organizations_metadata.type == 'organization_type')].copy(deep=True)

In [31]:
omup.shape

(48198, 6)

I've peeked that the keys are not always the same so I'll try to infer which are the possible keys

In [32]:
omup['value'][0]

'{"mei": false, "partner": true, "user_id": "f30ed2d8-4eda-4f80-ac87-1e0d85100093"}'

In [33]:
eval(omup['value'][0].replace("false", "False").replace("true", "True"))

{'mei': False,
 'partner': True,
 'user_id': 'f30ed2d8-4eda-4f80-ac87-1e0d85100093'}

In [34]:
list(eval(omup['value'][0].replace("false", "False").replace("true", "True")).keys())

['mei', 'partner', 'user_id']

In [35]:
get_keys = lambda x: list(eval(x.replace("false", "False").replace("true", "True")).keys())

In [36]:
## Small table, we can .apply without much trouble

omup['keys'] = omup['value'].fillna("{}").apply(get_keys)
omup['keys'].value_counts(dropna=False)

[mei, partner, user_id]            45722
[mei, eireli, partner, user_id]     2476
Name: keys, dtype: int64

Ok, we have some eireli key that means nothing for me but I'll use it.

In [37]:
print("dates are equal: ", (omup.inserted_at == omup.updated_at).sum())
print("Inserted before update: ", (omup.inserted_at < omup.updated_at).sum())
print("Inserted after update: ", (omup.inserted_at > omup.updated_at).sum())
print("Some date is missing: ", (omup[['inserted_at', 'updated_at']].isna().max(axis=1).sum()))

dates are equal:  37287
Inserted before update:  10911
Inserted after update:  0
Some date is missing:  0


In [38]:
(pd.to_datetime(omup.updated_at) - pd.to_datetime(omup.inserted_at)).describe()

count                        48198
mean     0 days 15:05:27.275045290
std      2 days 14:27:36.409635562
min                0 days 00:00:00
25%                0 days 00:00:00
50%                0 days 00:00:00
75%                0 days 00:00:00
max        46 days 04:33:19.800596
dtype: object

Dates again are behaving normally.

In [39]:
omot.head()

Unnamed: 0,id,organization_id,type,value,inserted_at,updated_at
1,fa84c1d4-9362-472f-8ef9-aa14768deb22,417b2d7c-8a26-483b-920a-cc283095d044,organization_type,ME,2018-11-25 02:31:05.518456,2018-11-25 02:31:05.518456
3,ebe81aa1-7076-47a9-a929-0555167b6d35,f4745c03-2491-49f1-953b-461fc7f72cf5,organization_type,ME,2018-11-25 02:36:04.605196,2018-11-25 02:36:04.605196
5,b0c366c5-2901-4f68-8bb4-92a612fe9350,b208f531-9beb-44ae-991c-effca11da1e5,organization_type,ME,2018-11-25 02:41:44.862583,2018-11-25 02:41:44.862583
7,2c7da962-0aee-4603-b1ab-d8b0c81f2283,02b9eb4f-e25e-4ac7-a4c7-a3e0cef53968,organization_type,ME,2018-11-25 02:48:19.247844,2018-11-25 02:48:19.247844
9,f6922f73-302a-40dc-9456-35a635287759,c05a9e40-a8e7-4ebf-a72c-06c00f1a8f89,organization_type,ME,2018-11-25 02:55:30.487795,2018-11-25 02:55:30.487795


In [40]:
print("dates are equal: ", (omot.inserted_at == omot.updated_at).sum())
print("Inserted before update: ", (omot.inserted_at < omot.updated_at).sum())
print("Inserted after update: ", (omot.inserted_at > omot.updated_at).sum())
print("Some date is missing: ", (omot[['inserted_at', 'updated_at']].isna().max(axis=1).sum()))

dates are equal:  214255
Inserted before update:  29980
Inserted after update:  0
Some date is missing:  0


In [41]:
omot.value.value_counts(dropna=False)

ME              202518
NaN              21242
EPP              12655
Other             7817
Not informed         3
Name: value, dtype: int64

Ok, so here we have a table that says when the organization was included, and if by some reason the organization is also an user, we have some metadata on who this user is. We can try to create some metrics on organization age, if the user is organization and things like that starting from this table.

### Organizations

In [42]:
organizations = pd.read_csv('files/processed_organizations.csv')
organizations.head()

Unnamed: 0,id,document,inserted_at
0,4f680da2-7b7c-4d48-9151-6f49437b4c27,9487f858-03b5-4eb4-afda-3e472643ef32,2018-11-01 17:40:15.194651
1,2a98430a-4908-4f3d-9c0b-bba3ca1bdd2a,a7899558-55ca-4c09-a535-a91abfb19c3c,2017-08-09 13:36:41.156395
2,3dc6651c-7968-456b-ae00-791da8ae0571,b5f5e3fc-a1cc-4d8b-a667-1e05b0d92376,2018-08-23 15:14:47.894131
3,4bd05bbe-ebd6-483e-a249-7c9204f88eeb,58b11bce-88c4-4279-afc3-338a4dac5348,2018-07-24 12:38:20.251984
4,1a69bc7e-a978-48c0-bad5-484c260c4254,1d8bb9cf-8f6b-46d4-b73b-96f4f176b9c1,2018-05-01 21:10:01.128368


In [43]:
organizations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54953 entries, 0 to 54952
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           54953 non-null  object
 1   document     54953 non-null  object
 2   inserted_at  54953 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


Not kind of what I'm doing at the moment, but I wonder if this inserted_at is reduntant with the above one.

In [44]:
organizations_metadata.head()

Unnamed: 0,id,organization_id,type,value,inserted_at,updated_at
0,8a442911-65db-42a3-8063-2e8e2ddab984,417b2d7c-8a26-483b-920a-cc283095d044,user_is_partner,"{""mei"": false, ""partner"": true, ""user_id"": ""f3...",2018-11-25 02:31:05.472505,2018-11-25 02:31:05.472505
1,fa84c1d4-9362-472f-8ef9-aa14768deb22,417b2d7c-8a26-483b-920a-cc283095d044,organization_type,ME,2018-11-25 02:31:05.518456,2018-11-25 02:31:05.518456
2,089a96b0-8e31-4303-9b8f-e8eabd126b51,f4745c03-2491-49f1-953b-461fc7f72cf5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""dcd...",2018-11-25 02:36:04.694632,2018-11-25 02:36:04.694632
3,ebe81aa1-7076-47a9-a929-0555167b6d35,f4745c03-2491-49f1-953b-461fc7f72cf5,organization_type,ME,2018-11-25 02:36:04.605196,2018-11-25 02:36:04.605196
4,b301ec68-9042-4a74-88f4-82b3256d0137,b208f531-9beb-44ae-991c-effca11da1e5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""5c9...",2018-11-25 02:41:44.953192,2018-11-25 02:41:44.953192


In [45]:
organizations_metadata.organization_id.nunique()

244233

In [46]:
organizations.shape

(54953, 3)

In [47]:
joint_orgs = organizations.merge(organizations_metadata, left_on='id', right_on='organization_id')

In [48]:
joint_orgs.head()

Unnamed: 0,id_x,document,inserted_at_x,id_y,organization_id,type,value,inserted_at_y,updated_at
0,4f680da2-7b7c-4d48-9151-6f49437b4c27,9487f858-03b5-4eb4-afda-3e472643ef32,2018-11-01 17:40:15.194651,6d210294-d403-4b85-b870-a28c4fbe7ed4,4f680da2-7b7c-4d48-9151-6f49437b4c27,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""fa3...",2018-11-01 17:40:18.606637,2018-11-01 19:35:50.737153
1,4f680da2-7b7c-4d48-9151-6f49437b4c27,9487f858-03b5-4eb4-afda-3e472643ef32,2018-11-01 17:40:15.194651,29809be0-2bb6-4847-9c0e-976b5590c745,4f680da2-7b7c-4d48-9151-6f49437b4c27,organization_type,,2018-11-01 17:40:18.629991,2018-11-01 19:35:50.764499
2,2a98430a-4908-4f3d-9c0b-bba3ca1bdd2a,a7899558-55ca-4c09-a535-a91abfb19c3c,2017-08-09 13:36:41.156395,bb164883-7978-424a-8681-87fca5a074c2,2a98430a-4908-4f3d-9c0b-bba3ca1bdd2a,organization_type,ME,2017-12-04 18:57:58.290662,2018-03-21 17:40:22.157952
3,3dc6651c-7968-456b-ae00-791da8ae0571,b5f5e3fc-a1cc-4d8b-a667-1e05b0d92376,2018-08-23 15:14:47.894131,40aad4d4-4a26-4fbc-8ee0-723d6a25f6e5,3dc6651c-7968-456b-ae00-791da8ae0571,organization_type,EPP,2018-08-23 15:14:48.343411,2018-08-23 15:14:48.343411
4,4bd05bbe-ebd6-483e-a249-7c9204f88eeb,58b11bce-88c4-4279-afc3-338a4dac5348,2018-07-24 12:38:20.251984,da27c776-b96a-481e-b66d-a019a7740382,4bd05bbe-ebd6-483e-a249-7c9204f88eeb,organization_type,ME,2018-07-24 12:38:20.616781,2018-07-24 12:38:20.616781


We can see from this examples that these inserted_ats means different things, and even though I'm not so sure what, we'll just work with they as separated dates when feature engineering.

### Unlabeled Transactions

In [49]:
unlabeled_transactions = pd.read_csv('files/processed_unlabeled_transactions.csv')
unlabeled_transactions.head()

  unlabeled_transactions = pd.read_csv('files/processed_unlabeled_transactions.csv')


Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,requested_at,processed_at
0,f9eb1bfc-dd3e-4494-ad3a-5728059a334b,8ba7fb9c-a8b9-4595-8235-561012daa551,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-07-15 00:00:25.000000,2018-07-15 00:00:25.000000
1,98b084a3-353a-4ea4-8cb6-3a2b48f24d9b,f7f3b6cc-5690-4716-bc36-8a9e61ab60b0,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,0.0,2018-07-15 00:20:32.000000,2018-07-15 00:20:32.000000
2,b04d0b3b-620f-4031-8598-14862d8385a4,c7120482-a055-4dc0-915c-ee467afa0d6b,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-09-01 23:51:38.000000,2018-09-01 23:51:38.000000
3,aa253da4-d86f-45fe-a30c-1e25aa752344,c97a2208-a1cc-472b-a8ff-4dd2dd54fd97,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-09-13 00:50:27.000000,2018-09-13 00:50:27.000000
4,ffed5732-c135-495f-9b43-52f4dbdd0fab,75588166-0282-401a-ace4-d42d2b33f75d,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,0.0,2018-09-22 23:50:38.000000,2018-09-22 23:50:38.000000


In [50]:
unlabeled_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087054 entries, 0 to 5087053
Data columns (total 9 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   operation_id           object 
 1   request_id             object 
 2   account_id             object 
 3   device_id              object 
 4   counterparty_document  object 
 5   operation_type         object 
 6   amount                 float64
 7   requested_at           object 
 8   processed_at           object 
dtypes: float64(1), object(8)
memory usage: 349.3+ MB


In [51]:
unlabeled_transactions.isna().mean()

operation_id            0.000000
request_id              0.675480
account_id              0.000000
device_id               0.999036
counterparty_document   0.000000
operation_type          0.000000
amount                  0.000070
requested_at            0.000000
processed_at            0.000000
dtype: float64

This seems to be the table I need to score after everything is finished. Also, this is were I'll measure if whatever policy I construct on top of the model is denying more than 300 transactions daily.

In [52]:
unlabeled_transactions['operation_type'].value_counts(dropna=False).sort_index()

cash_in_type_1       28196
cash_in_type_2       90381
cash_in_type_3     2154953
cash_in_type_4     1162650
cash_out_type_1      18234
cash_out_type_2     790153
cash_out_type_3     463457
cash_out_type_4     353663
cash_out_type_5      25367
Name: operation_type, dtype: int64

The types on this table are very different from what we saw in the labeled transactions, so it would be better not to use them. Any model that is trained on the labeled dataset and applied on this table will not perform well if he sees many never-seem categories, and we can't infer if the never-seem are similar to anything in the other table.

In [53]:
unlabeled_transactions['amount'].describe()

count   5086699.000000
mean          0.000006
std           0.000124
min           0.000000
25%           0.000000
50%           0.000001
75%           0.000003
max           0.067581
Name: amount, dtype: float64

Whatever happened on the balance on the other table seems to be happening here, as the values of amount here seems similar to the values of balance there.

In [54]:
ut = unlabeled_transactions.copy(deep=True)

In [55]:
print("dates are equal: ", (ut.requested_at == ut.processed_at).sum())
print("Requested before processed: ", (ut.requested_at < ut.processed_at).sum())
print("Requested after processed: ", (ut.requested_at > ut.processed_at).sum())
print("Some date is missing: ", (ut[['requested_at', 'processed_at']].isna().max(axis=1).sum()))

dates are equal:  3828533
Requested before processed:  1258521
Requested after processed:  0
Some date is missing:  0


No missing data on these tables, so I shouldn't use "missing requested_at" as a variable on the labeled_transactions dataset.

This gave me some insights on what I can/can't  expect to see on a production database, and what I should avoid to do when I start the feature engineering process.

### Users Metadata

In [56]:
users_metadata = pd.read_csv('files/processed_users_metadata.csv')
users_metadata.head()

Unnamed: 0,id,user_id,type,value,inserted_at,updated_at
0,5e820e59-2793-4aa8-be6e-95d547872de7,7ac5f4a4-e46f-4d1a-b530-50d1930d9b95,age_range,40-59,2018-05-24 12:32:21.406260,2018-05-24 12:32:21.406260
1,862263f6-e6a0-4d7f-b14d-7112bdea27a5,7ac5f4a4-e46f-4d1a-b530-50d1930d9b95,number_of_selfies_sent,5,2018-05-24 12:32:21.618808,2018-05-24 12:32:21.618808
2,12333e49-de34-4cc0-a1f2-1bf05d7da186,0e836b54-9f64-4814-b328-7e4ac41ac5ba,age_range,>= 60,2018-08-25 01:41:57.876757,2018-08-25 01:41:57.876757
3,b4751efa-3407-4074-8bf0-10a6e4235729,0e836b54-9f64-4814-b328-7e4ac41ac5ba,number_of_selfies_sent,5,2018-08-25 01:41:58.054672,2018-08-25 01:41:58.054672
4,4fac990f-b252-4621-b47c-2481f5030cec,074a93a5-4dfb-4dd1-b099-cc83fcd1e617,age_range,25-39,2018-05-24 20:46:14.904279,2018-05-24 20:46:14.904279


In [57]:
users_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525479 entries, 0 to 525478
Data columns (total 6 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           525479 non-null  object
 1   user_id      525479 non-null  object
 2   type         525479 non-null  object
 3   value        525479 non-null  object
 4   inserted_at  525479 non-null  object
 5   updated_at   525479 non-null  object
dtypes: object(6)
memory usage: 24.1+ MB


In [62]:
users_metadata.id.nunique() == users_metadata.shape[0]

True

In [69]:
users_metadata.fillna("NA").groupby(['type', 'value']).id.count()

type                    value
age_range               18-24     39993
                        25-39    137124
                        40-59     75974
                        < 18       1870
                        >= 60      7813
number_of_selfies_sent  0           518
                        1           660
                        10         4108
                        11         1193
                        12          238
                        13          110
                        14           70
                        15          599
                        16          245
                        17          135
                        18           56
                        19           26
                        2           589
                        20          133
                        21           64
                        22           42
                        23           39
                        24           25
                        25           35
          

Here we have two variables that can help a lot in the development of the model, the age group and the amount of selfies sent by the customer.

In [70]:
um = users_metadata.copy(deep=True)

In [71]:
print("dates are equal: ", (um.inserted_at == um.updated_at).sum())
print("Inserted before updated: ", (um.inserted_at < um.updated_at).sum())
print("Inserted after updated: ", (um.inserted_at > um.updated_at).sum())
print("Some date is missing: ", (um[['inserted_at', 'updated_at']].isna().max(axis=1).sum()))

dates are equal:  419587
Inserted before updated:  105892
Inserted after updated:  0
Some date is missing:  0


Dates are well Behaved

In [72]:
um.isna().mean()

id            0.000000
user_id       0.000000
type          0.000000
value         0.000000
inserted_at   0.000000
updated_at    0.000000
dtype: float64

And no nulls. With this I can create great insights about the customer.

### Users Organizations

In [73]:
users_organizations = pd.read_csv('files/processed_users_organizations.csv')
users_organizations.head()

Unnamed: 0,user_id,organization_id
0,475c1048-860c-49e7-b94a-eb478eb78cf6,e4ef0830-64e3-4cae-bc14-a718849b60b7
1,769d83d0-1438-49d2-bf03-8b0e3584bdfe,2a15fe66-2d99-4b70-a11e-cdcb74163b65
2,8a1384f7-902e-433f-83a4-3705ba19fc4d,f4981be9-1c19-40c2-af09-4c429194ed0d
3,052ecf70-7e5d-4b49-912c-415ec718c1c9,c0508778-02cf-4ab4-a1e5-0d8a0c02a6b4
4,b058e64d-6449-4ddd-993a-4cb88d2fe97a,59f73def-619c-4d7a-bf66-b635be35b6e9


In [74]:
users_organizations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54958 entries, 0 to 54957
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          54958 non-null  object
 1   organization_id  54958 non-null  object
dtypes: object(2)
memory usage: 858.8+ KB


In [75]:
uo = users_organizations.copy(deep=True)

In [78]:
uo.groupby('user_id').organization_id.count().mean()

1.0142097881449768

In [79]:
uo.groupby('user_id').organization_id.count().max()

37

In [80]:
uo.groupby('organization_id').user_id.count().mean()

1.0000909868433026

In [82]:
uo.groupby('organization_id').user_id.count().max()

5

Just a dictionary linking user_id and organization_id, nothing much to see here. It links many-many but most of the time is 1-1, we'll have to take care when joining but we don't need to dive too deep on aggregation metrics.

### Users

In [83]:
users = pd.read_csv('files/processed_users.csv')
users.head()

Unnamed: 0,id,document,inserted_at
0,659945ba-eb5b-4112-b338-8284aec5d99b,96e7c1d9-8482-4bd6-abe7-90ad0b9854eb,2018-04-24 19:40:00.914691
1,7d8795b3-e8ea-4e01-8f5e-0151a41e7021,9f11c509-9971-43d2-8112-752f6537a44d,2018-11-05 16:34:40.898502
2,75f5fe7d-05a8-4c1c-bfad-0ea8ad901200,953f9416-0b9a-48e9-bd4a-39f8fcd18fe5,2018-04-26 18:22:38.902776
3,c8fc220a-bda9-4174-a633-0515fdff92c2,4888e307-afb7-4c75-8f8c-0e2d7fc6880c,2018-08-29 19:20:57.890656
4,0bc08c41-5d93-4740-9d1f-e188fbdf7931,4200c844-f81b-4b3a-9c19-10d9d35c26ef,2018-05-28 02:50:12.368302


In [84]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62731 entries, 0 to 62730
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           62731 non-null  object
 1   document     62731 non-null  object
 2   inserted_at  62731 non-null  object
dtypes: object(3)
memory usage: 1.4+ MB


Same as above, nothing much to see here.

# Sanity Checks

I'll make some bread-and-butter sanity checks.

### Accounts

In [85]:
accounts.head()

Unnamed: 0,id,owner_id,owner_document,owner_type,inserted_at
0,ac8a1faa-3748-4ef7-b140-8fe818cdc974,d2e3e3b7-e98d-4ef3-8a51-83db807b9a94,5e998c51-53cc-48f0-aef4-45ec12a466be,user,2017-03-16 16:33:43.603803
1,b2ccbb65-19b0-43fb-891f-faf785d00ade,e01a38b9-5e4c-43fb-9aef-d866054b00bd,4ca9177f-35cc-4067-9d0b-af621aaa2e65,user,2017-03-25 14:01:25.919929
2,c890a24f-f539-4dfe-a3d3-fe2c11c0aed6,c66561a4-66cc-4e2c-8379-3fd060b0e1ba,7df16b0b-0cda-40b8-91b5-5ed4d371a834,user,2017-03-25 14:04:11.917901
3,678269b6-7b71-48a2-82fb-19018432abf9,49a50a56-3f7a-435d-80a1-6e459c3948e8,d74146d2-af1c-4396-86f7-c98ccc177411,user,2017-03-27 13:59:43.513024
4,c6f2982a-4c2a-4dc2-b237-2c683235c3d8,596160cd-c41d-4689-bd4d-a055db788931,23d532c1-855b-42c2-a011-2e64b7078d4a,user,2017-03-27 14:00:58.316963


In [86]:
accounts.isna().mean()

id               0.000000
owner_id         0.000000
owner_document   0.000000
owner_type       0.000000
inserted_at      0.000000
dtype: float64

In [87]:
accounts.owner_type.value_counts(dropna=False)

organization    197777
user             95225
Name: owner_type, dtype: int64

In [88]:
accounts.shape

(293002, 5)

In [89]:
for i in accounts.columns:
    print(accounts[i].nunique(dropna=False), i)

293002 id
292889 owner_id
291710 owner_document
2 owner_type
293000 inserted_at


In [90]:
accounts[['inserted_at']].max()

inserted_at    2018-12-17 21:13:12.196103
dtype: object

In [91]:
accounts[['inserted_at']].min()

inserted_at    2016-12-09 20:05:17.795873
dtype: object

### Labeled Transactions

In [92]:
labeled_transactions.head()

Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,balance,requested_at,processed_at,is_fraud
0,37e105f1-003f-465b-8179-e7705b12a24d,c1820b88-ac54-444f-b9a0-70de30124d95,a7b4f041-ef6a-4e17-baf9-cd471ef0f484,ef1756b6-354c-4ea4-b175-4eba835b60a5,38e6f7c9-e935-4c7c-8915-ef923a5ca914,cash_out_type_1,9.9e-05,0.000167,2018-11-21 15:41:23,2018-11-21 15:41:23,
1,72d66c19-130b-4d51-8ed7-e3b6c0b65034,9400a9f4-6ab8-40cd-b806-1569b693a88e,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.3e-05,0.000533,,2018-10-22 23:43:38,
2,31abd386-09f7-4f18-9b01-d2682e3c2c65,80adf85b-56cc-4730-903d-e63c75ae839a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,9bd89feb-e952-47c1-bbf5-d4c17b36d852,cash_out_type_3,1.6e-05,0.000347,2018-11-19 16:11:57,2018-11-19 16:12:00,
3,27543733-e8a8-4593-88fb-921e205a6e0c,6edfea5c-4125-49fa-bbe7-b39433f6e49a,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,e573e2d3-39af-4f71-97df-bf400e854a8a,cash_out_type_3,1e-06,0.000572,,2018-10-29 15:20:49,
4,6aff8f2d-0730-4f94-846b-14c059550ab2,88ddff46-6714-4a47-9407-a7d54996dde1,ed310df0-6834-4899-811e-5faf13b2a99c,ef1756b6-354c-4ea4-b175-4eba835b60a5,88f94e8a-0145-489f-9f1d-d4755456e965,cash_out_type_3,4.5e-05,0.000346,,2018-10-31 12:06:08,


In [93]:
labeled_transactions.isna().mean()

operation_id            0.000180
request_id              0.000000
account_id              0.000000
device_id               0.000000
counterparty_document   0.000000
operation_type          0.000000
amount                  0.000000
balance                 0.057347
requested_at            0.578670
processed_at            0.000000
is_fraud                0.998369
dtype: float64

In [94]:
labeled_transactions.shape

(940935, 11)

In [95]:
for i in labeled_transactions.columns:
    print(labeled_transactions[i].nunique(dropna=False), i)

940767 operation_id
940935 request_id
63723 account_id
74928 device_id
276020 counterparty_document
4 operation_type
158041 amount
467166 balance
265342 requested_at
561650 processed_at
2 is_fraud


In [96]:
labeled_transactions.operation_type.value_counts(dropna=False)

cash_out_type_2    665133
cash_out_type_3    260343
cash_out_type_1     15449
cash_out_type_6        10
Name: operation_type, dtype: int64

In [97]:
labeled_transactions.is_fraud.value_counts(dropna=False)

NaN     939400
True      1535
Name: is_fraud, dtype: int64

In [98]:
labeled_transactions[['amount', 'balance', 'requested_at', 'processed_at']].max()

  labeled_transactions[['amount', 'balance', 'requested_at', 'processed_at']].max()


amount                     1.000000
balance                    0.076443
processed_at    2018-11-24 23:58:54
dtype: object

In [99]:
labeled_transactions[['amount', 'balance', 'requested_at', 'processed_at']].min()

  labeled_transactions[['amount', 'balance', 'requested_at', 'processed_at']].min()


amount                     0.000000
balance                    0.000000
processed_at    2018-10-18 00:00:25
dtype: object

In [100]:
labeled_transactions['requested_at'].dropna().max()

'2018-11-24 23:58:51'

In [101]:
labeled_transactions['requested_at'].dropna().min()

'2018-08-29 19:35:20'

### Unlabeled Transactions

In [102]:
unlabeled_transactions.head()

Unnamed: 0,operation_id,request_id,account_id,device_id,counterparty_document,operation_type,amount,requested_at,processed_at
0,f9eb1bfc-dd3e-4494-ad3a-5728059a334b,8ba7fb9c-a8b9-4595-8235-561012daa551,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-07-15 00:00:25.000000,2018-07-15 00:00:25.000000
1,98b084a3-353a-4ea4-8cb6-3a2b48f24d9b,f7f3b6cc-5690-4716-bc36-8a9e61ab60b0,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,0.0,2018-07-15 00:20:32.000000,2018-07-15 00:20:32.000000
2,b04d0b3b-620f-4031-8598-14862d8385a4,c7120482-a055-4dc0-915c-ee467afa0d6b,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-09-01 23:51:38.000000,2018-09-01 23:51:38.000000
3,aa253da4-d86f-45fe-a30c-1e25aa752344,c97a2208-a1cc-472b-a8ff-4dd2dd54fd97,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,1e-06,2018-09-13 00:50:27.000000,2018-09-13 00:50:27.000000
4,ffed5732-c135-495f-9b43-52f4dbdd0fab,75588166-0282-401a-ace4-d42d2b33f75d,d24e533e-d54f-4ac5-874f-097f968c8b23,,b20ca420-36b0-429f-85a5-7ffac69e82a1,cash_out_type_4,0.0,2018-09-22 23:50:38.000000,2018-09-22 23:50:38.000000


In [103]:
unlabeled_transactions.shape

(5087054, 9)

In [104]:
for i in unlabeled_transactions.columns:
    print(unlabeled_transactions[i].nunique(dropna=False), i)

5087054 operation_id
1650852 request_id
51742 account_id
2993 device_id
429322 counterparty_document
9 operation_type
266136 amount
5037051 requested_at
5036979 processed_at


In [105]:
unlabeled_transactions.operation_type.value_counts(dropna=False)

cash_in_type_3     2154953
cash_in_type_4     1162650
cash_out_type_2     790153
cash_out_type_3     463457
cash_out_type_4     353663
cash_in_type_2       90381
cash_in_type_1       28196
cash_out_type_5      25367
cash_out_type_1      18234
Name: operation_type, dtype: int64

In [106]:
unlabeled_transactions.isna().mean()

operation_id            0.000000
request_id              0.675480
account_id              0.000000
device_id               0.999036
counterparty_document   0.000000
operation_type          0.000000
amount                  0.000070
requested_at            0.000000
processed_at            0.000000
dtype: float64

In [107]:
unlabeled_transactions[['amount', 'requested_at', 'processed_at']].max()

amount                            0.067581
requested_at    2018-10-17 23:59:58.000000
processed_at    2018-10-17 23:59:58.000000
dtype: object

In [108]:
unlabeled_transactions[['amount', 'requested_at', 'processed_at']].min()

amount                            0.000000
requested_at    2017-11-09 15:12:55.155123
processed_at    2018-07-15 00:00:25.000000
dtype: object

### Organizations Metadata

In [109]:
organizations_metadata.head()

Unnamed: 0,id,organization_id,type,value,inserted_at,updated_at
0,8a442911-65db-42a3-8063-2e8e2ddab984,417b2d7c-8a26-483b-920a-cc283095d044,user_is_partner,"{""mei"": false, ""partner"": true, ""user_id"": ""f3...",2018-11-25 02:31:05.472505,2018-11-25 02:31:05.472505
1,fa84c1d4-9362-472f-8ef9-aa14768deb22,417b2d7c-8a26-483b-920a-cc283095d044,organization_type,ME,2018-11-25 02:31:05.518456,2018-11-25 02:31:05.518456
2,089a96b0-8e31-4303-9b8f-e8eabd126b51,f4745c03-2491-49f1-953b-461fc7f72cf5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""dcd...",2018-11-25 02:36:04.694632,2018-11-25 02:36:04.694632
3,ebe81aa1-7076-47a9-a929-0555167b6d35,f4745c03-2491-49f1-953b-461fc7f72cf5,organization_type,ME,2018-11-25 02:36:04.605196,2018-11-25 02:36:04.605196
4,b301ec68-9042-4a74-88f4-82b3256d0137,b208f531-9beb-44ae-991c-effca11da1e5,user_is_partner,"{""mei"": true, ""partner"": true, ""user_id"": ""5c9...",2018-11-25 02:41:44.953192,2018-11-25 02:41:44.953192


In [110]:
organizations_metadata.isna().mean()

id                0.000000
organization_id   0.000000
type              0.000000
value             0.072639
inserted_at       0.000000
updated_at        0.000000
dtype: float64

In [111]:
organizations_metadata.type.value_counts()

organization_type    244235
user_is_partner       48198
Name: type, dtype: int64

In [112]:
organizations_metadata[organizations_metadata.type == 'organization_type'].value.value_counts()

ME              202518
EPP              12655
Other             7817
Not informed         3
Name: value, dtype: int64

In [113]:
organizations_metadata.value[0]

'{"mei": false, "partner": true, "user_id": "f30ed2d8-4eda-4f80-ac87-1e0d85100093"}'

In [114]:
for i in organizations_metadata.columns:
    print(organizations_metadata[i].nunique(dropna=False), i)

292433 id
244233 organization_id
2 type
47896 value
292427 inserted_at
292422 updated_at


In [115]:
organizations_metadata[['inserted_at', 'updated_at']].max()

inserted_at    2018-12-17 02:50:19.871628
updated_at     2018-12-17 02:50:19.871628
dtype: object

In [116]:
organizations_metadata[['inserted_at', 'updated_at']].min()

inserted_at    2017-11-10 21:34:34.666756
updated_at     2017-11-10 22:52:12.267941
dtype: object

### Users Metadata

In [117]:
users_metadata.head()

Unnamed: 0,id,user_id,type,value,inserted_at,updated_at
0,5e820e59-2793-4aa8-be6e-95d547872de7,7ac5f4a4-e46f-4d1a-b530-50d1930d9b95,age_range,40-59,2018-05-24 12:32:21.406260,2018-05-24 12:32:21.406260
1,862263f6-e6a0-4d7f-b14d-7112bdea27a5,7ac5f4a4-e46f-4d1a-b530-50d1930d9b95,number_of_selfies_sent,5,2018-05-24 12:32:21.618808,2018-05-24 12:32:21.618808
2,12333e49-de34-4cc0-a1f2-1bf05d7da186,0e836b54-9f64-4814-b328-7e4ac41ac5ba,age_range,>= 60,2018-08-25 01:41:57.876757,2018-08-25 01:41:57.876757
3,b4751efa-3407-4074-8bf0-10a6e4235729,0e836b54-9f64-4814-b328-7e4ac41ac5ba,number_of_selfies_sent,5,2018-08-25 01:41:58.054672,2018-08-25 01:41:58.054672
4,4fac990f-b252-4621-b47c-2481f5030cec,074a93a5-4dfb-4dd1-b099-cc83fcd1e617,age_range,25-39,2018-05-24 20:46:14.904279,2018-05-24 20:46:14.904279


In [118]:
users_metadata.isna().mean()

id            0.000000
user_id       0.000000
type          0.000000
value         0.000000
inserted_at   0.000000
updated_at    0.000000
dtype: float64

In [119]:
for i in users_metadata.columns:
    print(users_metadata[i].nunique(dropna=False), i)

525479 id
262821 user_id
2 type
56 value
525477 inserted_at
525477 updated_at


In [120]:
users_metadata.type.value_counts()

age_range                 262774
number_of_selfies_sent    262705
Name: type, dtype: int64

### Users Organization

In [121]:
users_organizations.head()

Unnamed: 0,user_id,organization_id
0,475c1048-860c-49e7-b94a-eb478eb78cf6,e4ef0830-64e3-4cae-bc14-a718849b60b7
1,769d83d0-1438-49d2-bf03-8b0e3584bdfe,2a15fe66-2d99-4b70-a11e-cdcb74163b65
2,8a1384f7-902e-433f-83a4-3705ba19fc4d,f4981be9-1c19-40c2-af09-4c429194ed0d
3,052ecf70-7e5d-4b49-912c-415ec718c1c9,c0508778-02cf-4ab4-a1e5-0d8a0c02a6b4
4,b058e64d-6449-4ddd-993a-4cb88d2fe97a,59f73def-619c-4d7a-bf66-b635be35b6e9


In [122]:
users_organizations.shape

(54958, 2)

In [123]:
for i in users_organizations.columns:
    print(users_organizations[i].nunique(dropna=False), i)

54188 user_id
54953 organization_id


### Users

In [124]:
users.head()

Unnamed: 0,id,document,inserted_at
0,659945ba-eb5b-4112-b338-8284aec5d99b,96e7c1d9-8482-4bd6-abe7-90ad0b9854eb,2018-04-24 19:40:00.914691
1,7d8795b3-e8ea-4e01-8f5e-0151a41e7021,9f11c509-9971-43d2-8112-752f6537a44d,2018-11-05 16:34:40.898502
2,75f5fe7d-05a8-4c1c-bfad-0ea8ad901200,953f9416-0b9a-48e9-bd4a-39f8fcd18fe5,2018-04-26 18:22:38.902776
3,c8fc220a-bda9-4174-a633-0515fdff92c2,4888e307-afb7-4c75-8f8c-0e2d7fc6880c,2018-08-29 19:20:57.890656
4,0bc08c41-5d93-4740-9d1f-e188fbdf7931,4200c844-f81b-4b3a-9c19-10d9d35c26ef,2018-05-28 02:50:12.368302


In [125]:
users.shape

(62731, 3)

In [126]:
for i in users.columns:
    print(users[i].nunique(dropna=False), i)

62731 id
61964 document
62731 inserted_at
