In [39]:
#!/usr/bin/env python
 
import pandas as pd
import numpy as np
import featuretools as ft

In [40]:
# Load the three source tables, parsing each table's date columns as datetimes.
clients = pd.read_csv("data/clients.csv", parse_dates=["joined"])
loans = pd.read_csv("data/loans.csv", parse_dates=["loan_start", "loan_end"])
payments = pd.read_csv("data/payments.csv", parse_dates=["payment_date"])


In [41]:
# Start from an empty EntitySet; dataframes and relationships are registered
# on it in the following cells.
es = ft.EntitySet(
    id="loanRepayment",
)

In [42]:
# Register the clients table with the EntitySet.
# client_id is used as the index and joined as the time index.
es = es.add_dataframe(
    dataframe=clients,
    dataframe_name="clients",
    index="client_id",
    time_index="joined",
)
print(es)

Entityset: loanRepayment
  DataFrames:
    clients [Rows: 25, Columns: 4]
  Relationships:
    No relationships


In [43]:
# Register the loans table with the EntitySet.
# loan_id is used as the index and loan_start as the time index; the repaid
# column is explicitly typed as Categorical.
es = es.add_dataframe(
    dataframe=loans,
    dataframe_name="loans",
    index="loan_id",
    time_index="loan_start",
    logical_types={"repaid": "Categorical"},
)
print(es)

Entityset: loanRepayment
  DataFrames:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
  Relationships:
    No relationships


In [44]:
# Register the payments table with the EntitySet.
# The table has no unique index column of its own, so make_index=True is
# passed together with the name of the index column to create (payment_id).
# The missed column is explicitly typed as Categorical.
es = es.add_dataframe(
    dataframe=payments,
    dataframe_name="payments",
    index="payment_id",
    make_index=True,
    time_index="payment_date",
    logical_types={"missed": "Categorical"},
)
print(es)

Entityset: loanRepayment
  DataFrames:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    No relationships


In [45]:
# Inspect the payments dataframe as stored in the EntitySet.
# Substitute "clients" or "loans" for "payments" to inspect the other dataframes.
print(es["payments"])

      payment_id  loan_id  payment_amount payment_date missed
2113        2113    11988            2053   2000-03-05      0
726          726    11140             402   2000-03-19      0
2114        2114    11988            2627   2000-03-30      0
3223        3223    11430            1284   2000-04-05      0
2115        2115    11988            1911   2000-04-11      1
...          ...      ...             ...          ...    ...
1415        1415    11072             957   2015-07-01      0
1308        1308    10684             115   2015-07-06      0
1416        1416    11072             988   2015-07-14      1
1417        1417    11072             940   2015-07-29      0
1418        1418    11072             932   2015-08-21      1

[3456 rows x 5 columns]


In [46]:
# Link every loan to the client that took it out:
# loans.client_id (child) -> clients.client_id (parent).
es.add_relationship(
    child_dataframe_name="loans",
    child_column_name="client_id",
    parent_dataframe_name="clients",
    parent_column_name="client_id",
)
print(es)

Entityset: loanRepayment
  DataFrames:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id


In [47]:
# Link every payment to the loan it was made against:
# payments.loan_id (child) -> loans.loan_id (parent).
es.add_relationship(
    child_dataframe_name="payments",
    child_column_name="loan_id",
    parent_dataframe_name="loans",
    parent_column_name="loan_id",
)
print(es)

Entityset: loanRepayment
  DataFrames:
    clients [Rows: 25, Columns: 4]
    loans [Rows: 443, Columns: 8]
    payments [Rows: 3456, Columns: 5]
  Relationships:
    loans.client_id -> clients.client_id
    payments.loan_id -> loans.loan_id


In [48]:
# Run deep feature synthesis with clients as the target dataframe, using only
# the aggregation and transform primitives listed here.
# ft.dfs returns the feature matrix and the list of generated feature definitions.
features, feature_names = ft.dfs(
    entityset=es,
    target_dataframe_name="clients",
    agg_primitives=["mean", "max", "last"],
    trans_primitives=["year", "month", "subtract_numeric", "divide_numeric"],
)
# One column per generated feature.
print("Number of features", features.shape[1])

Number of features 288


In [49]:
# Preview the first rows of a single transform feature as a one-column frame.
features["MONTH(joined)"].head().to_frame()

Unnamed: 0_level_0,MONTH(joined)
client_id,Unnamed: 1_level_1
42320,4
39384,6
26945,11
41472,11
46180,11


In [50]:
# Preview the first rows of a single aggregation feature as a one-column frame.
features["MEAN(payments.payment_amount)"].head().to_frame()

Unnamed: 0_level_0,MEAN(payments.payment_amount)
client_id,Unnamed: 1_level_1
42320,1021.483333
39384,1193.630137
26945,1109.473214
41472,1129.07619
46180,1186.550336


In [51]:
# Show the first five rows of the full feature matrix.
features.head(5)

Unnamed: 0_level_0,income,credit_score,LAST(loans.loan_amount),LAST(loans.loan_id),LAST(loans.loan_type),LAST(loans.rate),LAST(loans.repaid),MAX(loans.loan_amount),MAX(loans.rate),MEAN(loans.loan_amount),...,income - LAST(payments.payment_id),income - MAX(loans.loan_amount),income - MAX(loans.rate),income - MAX(payments.payment_amount),income - MEAN(loans.loan_amount),income - MEAN(loans.rate),income - MEAN(payments.payment_amount),YEAR(LAST(loans.loan_end)),YEAR(LAST(loans.loan_start)),YEAR(LAST(payments.payment_date))
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42320,229481,563,8090,10156,home,3.18,0,13887.0,6.74,7062.066667,...,227939.0,215594.0,229474.26,226712.0,222418.933333,229478.542667,228459.516667,2015,2012,2013
39384,191204,617,14654,11735,other,2.26,0,14654.0,9.23,7865.473684,...,188642.0,176550.0,191194.77,188382.0,183338.526316,191200.461579,190010.369863,2016,2014,2015
26945,214516,806,9249,11482,cash,2.86,1,14593.0,5.65,7125.933333,...,211176.0,199923.0,214510.35,211748.0,207390.066667,214513.144667,213406.526786,2016,2013,2014
41472,152214,638,10122,11936,cash,1.03,0,13657.0,9.82,7510.8125,...,149085.0,138557.0,152204.18,149778.0,144703.1875,152210.01875,151084.92381,2016,2014,2015
46180,43851,562,3834,10887,other,1.38,0,14081.0,9.26,7700.85,...,43414.0,29770.0,43841.74,41191.0,36150.15,43847.4975,42664.449664,2016,2014,2015
