In [1]:
import graphlab
# Select 'ipynb' as the GraphLab Canvas target; the .show(...) calls further
# down in this notebook go through Canvas.
# NOTE(review): 'ipynb' presumably means "render inline in the notebook" -
# confirm against the GraphLab Canvas docs.
graphlab.canvas.set_target('ipynb')

A newer version of GraphLab Create (v1.8.5) is available! Your current version is v1.8.3.

You can use pip to upgrade the graphlab-create package. For more information see https://dato.com/products/create/upgrade.


In [2]:
# Load the loans dataset into an SFrame. The path is relative, so the
# 'lending-club-data.gl/' directory must sit in the notebook's working directory.
loans = graphlab.SFrame('lending-club-data.gl/')

[INFO] GraphLab Create v1.8.3 started. Logging: C:\Users\ritraina\AppData\Local\Temp\graphlab_server_1460442025.log.0


In [3]:
# Derive the label column from the raw 'bad_loans' flag:
#   bad_loans == 0   ->  safe_loans = +1  (safe)
#   anything else    ->  safe_loans = -1  (risky)
loans['safe_loans'] = loans['bad_loans'].apply(
    lambda bad_flag: -1 if bad_flag != 0 else +1
)

# The raw flag is now redundant, so drop it. Because 'bad_loans' is removed
# here, this cell cannot be re-run without first reloading `loans`.
loans = loans.remove_column('bad_loans')

In [4]:
# Label column: +1 marks a safe loan, -1 a risky one.
target = 'safe_loans'

# Input columns handed to the classifiers (order kept as originally listed).
features = [
    'grade',                  # grade of the loan
    'sub_grade',              # sub-grade of the loan
    'short_emp',              # one year or less of employment
    'emp_length_num',         # number of years of employment
    'home_ownership',         # home ownership status: own, mortgage or rent
    'dti',                    # debt-to-income ratio
    'purpose',                # the purpose of the loan
    'term',                   # the term of the loan
    'last_delinq_none',       # has the borrower had a delinquency
    'last_major_derog_none',  # has the borrower had a 90-day-or-worse rating
    'revol_util',             # percent of available credit being used
    'total_rec_late_fee',     # total late fees received to date
]

# Keep only the model inputs plus the label, then display the trimmed SFrame.
loans = loans[features + [target]]
loans

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B2,0,11,RENT,27.65,credit_card,36 months,1
C,C4,1,1,RENT,1.0,car,60 months,1
C,C5,0,11,RENT,8.72,small_business,36 months,1
C,C1,0,11,RENT,20.0,other,36 months,0
A,A4,0,4,RENT,11.2,wedding,36 months,1
E,E1,0,10,RENT,5.35,car,36 months,1
F,F2,0,5,OWN,5.55,small_business,60 months,1
B,B5,1,1,RENT,18.08,other,60 months,1
C,C3,0,6,OWN,16.12,debt_consolidation,60 months,1
B,B5,0,11,OWN,10.78,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,83.7,0.0,1
1,9.4,0.0,-1
1,98.5,0.0,1
1,21.0,16.97,1
1,28.3,0.0,1
1,87.5,0.0,1
1,32.6,0.0,-1
1,36.5,0.0,-1
1,20.6,0.0,1
1,67.1,0.0,1


In [5]:
# Partition the loans by label (-1 = risky, +1 = safe) and report class sizes.
risky_loans_raw = loans[loans[target] == -1]
safe_loans_raw = loans[loans[target] == +1]

print("Number of safe loans  : %s" % len(safe_loans_raw))
print("Number of risky loans : %s" % len(risky_loans_raw))

Number of safe loans  : 99457
Number of risky loans : 23150


In [6]:
# Class balance of the full dataset: fraction of safe / risky loans and total size.
total_loan_count = len(loans)
for caption, figure in [
    ("Percentage of safe loans                 :", len(safe_loans_raw) / float(total_loan_count)),
    ("Percentage of risky loans                :", len(risky_loans_raw) / float(total_loan_count)),
    ("Total number of loans in our new dataset :", total_loan_count),
]:
    # "%s %s" reproduces the single space a comma-separated print inserts.
    print("%s %s" % (caption, figure))

Percentage of safe loans                 : 0.811185331996
Percentage of risky loans                : 0.188814668004
Total number of loans in our new dataset : 122607


In [7]:
# Balance the classes by undersampling the majority (safe) class.
# Every risky loan is kept as-is.
risky_loans = risky_loans_raw

# Sampling fraction for the safe loans = (#risky / #safe), so that the sampled
# safe set ends up roughly the same size as the risky set.
percentage = float(len(risky_loans_raw)) / len(safe_loans_raw)
safe_loans = safe_loans_raw.sample(percentage, seed=1)  # fixed seed => repeatable sample

# Balanced dataset: all risky loans followed by the sampled safe loans.
loans_data = risky_loans.append(safe_loans)

In [8]:
# Class balance after undersampling: both fractions should be close to 0.5.
balanced_loan_count = len(loans_data)
for caption, figure in [
    ("Percentage of safe loans                 :", len(safe_loans) / float(balanced_loan_count)),
    ("Percentage of risky loans                :", len(risky_loans) / float(balanced_loan_count)),
    ("Total number of loans in our new dataset :", balanced_loan_count),
]:
    # "%s %s" reproduces the single space a comma-separated print inserts.
    print("%s %s" % (caption, figure))

Percentage of safe loans                 : 0.502236174422
Percentage of risky loans                : 0.497763825578
Total number of loans in our new dataset : 46508


In [9]:
# Randomly split the balanced data into a training part and a validation part.
# .8 is the split fraction and seed=1 pins the random split so it is repeatable.
train_data, validation_data = loans_data.random_split(.8, seed=1)

In [70]:
# Baseline decision tree: no max_depth is given, so the library default applies.
# validation_set=None is passed explicitly, so no validation set is supplied to
# create(); validation_data is only used for the evaluations further down.
decision_tree_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    validation_set=None,
)

In [69]:
# Open the "Tree" view of the baseline model to inspect its structure.
decision_tree_model.show(view="Tree")

In [12]:
# Shallow tree (max_depth = 2): small enough to follow by hand in the tree view.
# validation_set=None is passed explicitly, so no validation set is supplied to create().
small_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    max_depth=2,
    validation_set=None,
)

In [13]:
# Open the "Tree" view of the depth-2 model.
small_model.show(view="Tree")

In [14]:
# Split the validation set by label (-1 = risky, +1 = safe).
validation_risky_loans = validation_data[validation_data[target] == -1]
validation_safe_loans = validation_data[validation_data[target] == 1]

# Take the first two rows of each class as a tiny hand-checkable sample.
sample_validation_data_safe = validation_safe_loans[0:2]
sample_validation_data_risky = validation_risky_loans[0:2]

# Combined sample: the two safe rows first, then the two risky rows.
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)

# True labels of the sample, kept separately for comparison with predictions.
sample_validation_label = sample_validation_data['safe_loans']

sample_validation_data

grade,sub_grade,short_emp,emp_length_num,home_ownership,dti,purpose,term,last_delinq_none
B,B3,0,11,OWN,11.18,credit_card,36 months,1
D,D1,0,10,RENT,16.85,debt_consolidation,36 months,1
D,D2,0,3,RENT,13.97,other,60 months,0
A,A5,0,11,MORTGAGE,16.33,debt_consolidation,36 months,1

last_major_derog_none,revol_util,total_rec_late_fee,safe_loans
1,82.4,0.0,1
1,96.4,0.0,1
1,59.5,0.0,-1
1,62.1,0.0,-1


In [21]:
# Class predictions of the baseline tree for each sample row, one row at a time.
# Each predict() call receives a single row and its result is printed as returned.
predictions = []
for row_index in range(len(sample_validation_data)):
    predictions.append(decision_tree_model.predict(sample_validation_data[row_index]))
    print(predictions[-1])

[1L]
[-1L]
[-1L]
[1L]


In [22]:
# Same sample rows, but ask the baseline tree for probabilities
# (output_type='probability') instead of class labels.
predictions_proba = []
for row_index in range(len(sample_validation_data)):
    predictions_proba.append(
        decision_tree_model.predict(sample_validation_data[row_index], output_type='probability')
    )
    print(predictions_proba[-1])

[0.5473502227247066]
[0.48912221868005296]
[0.4559234035824712]
[0.5864479932181161]


In [23]:
# Probabilities for the same sample rows, this time from the depth-2 model.
# This rebinds predictions_proba, replacing any earlier contents.
predictions_proba = []
for row_index in range(len(sample_validation_data)):
    predictions_proba.append(
        small_model.predict(sample_validation_data[row_index], output_type='probability')
    )
    print(predictions_proba[-1])

[0.5242817536858114]
[0.472267584643798]
[0.472267584643798]
[0.5798847176937073]


In [24]:
# Display the sample row at index 1 (the second row) with all of its column values.
sample_validation_data[1]

{'dti': 16.85,
 'emp_length_num': 10L,
 'grade': 'D',
 'home_ownership': 'RENT',
 'last_delinq_none': 1L,
 'last_major_derog_none': 1L,
 'purpose': 'debt_consolidation',
 'revol_util': 96.4,
 'safe_loans': 1L,
 'short_emp': 0L,
 'sub_grade': 'D1',
 'term': ' 36 months',
 'total_rec_late_fee': 0.0}

In [25]:
# Open the "Tree" view of the depth-2 model (same call as made earlier in the notebook).
small_model.show(view="Tree")

In [26]:
# Class predictions of the depth-2 model for each sample row, one row at a time.
# This rebinds `predictions`, replacing any earlier contents.
predictions = []
for row_index in range(len(sample_validation_data)):
    predictions.append(small_model.predict(sample_validation_data[row_index]))
    print(predictions[-1])

[1L]
[-1L]
[-1L]
[1L]


In [27]:
# Training-set accuracy: depth-2 model first, then the baseline tree.
for fitted_model in (small_model, decision_tree_model):
    print(fitted_model.evaluate(train_data)['accuracy'])

0.613502041694
0.640581345369


In [28]:
# Validation-set accuracy: depth-2 model first, then the baseline tree.
for fitted_model in (small_model, decision_tree_model):
    print(fitted_model.evaluate(validation_data)['accuracy'])

0.619345109866
0.636686772943


In [29]:
# Deeper tree (max_depth = 10), used to compare training accuracy with validation accuracy.
# validation_set=None is passed explicitly, so no validation set is supplied to create().
big_model = graphlab.decision_tree_classifier.create(
    train_data,
    target=target,
    features=features,
    max_depth=10,
    validation_set=None,
)

In [32]:
# Accuracy of the depth-10 model: training set first, then validation set.
for dataset in (train_data, validation_data):
    print(big_model.evaluate(dataset)['accuracy'])

0.665538362347
0.627208099957


In [64]:
# Class predictions of the baseline tree for the whole validation set in one call.
# NOTE: this rebinds `predictions`, which earlier cells used for a per-row list;
# the error-counting cells below expect this whole-set result.
predictions = decision_tree_model.predict(validation_data)
predictions

dtype: int
Rows: 9284
[-1L, 1L, -1L, -1L, 1L, -1L, 1L, 1L, -1L, -1L, -1L, 1L, 1L, -1L, 1L, 1L, 1L, -1L, -1L, -1L, 1L, 1L, -1L, -1L, -1L, 1L, -1L, -1L, 1L, -1L, -1L, -1L, -1L, 1L, 1L, -1L, 1L, -1L, 1L, -1L, 1L, 1L, 1L, 1L, 1L, -1L, 1L, -1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, 1L, 1L, -1L, -1L, 1L, -1L, -1L, -1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, -1L, 1L, -1L, -1L, -1L, 1L, -1L, -1L, -1L, -1L, 1L, 1L, -1L, 1L, 1L, -1L, -1L, 1L, -1L, 1L, -1L, -1L, -1L, 1L, -1L, -1L, -1L, -1L, ... ]

In [65]:
# Display the true labels of the validation set for comparison with the predictions.
validation_data['safe_loans']

dtype: int
Rows: 9284
[-1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, ... ]

In [67]:
# False positives: loans the model predicted safe (+1) that are actually risky (-1).
#
# `predictions` must be the result of decision_tree_model.predict(validation_data),
# i.e. one prediction per validation row, in the same order.
#
# The rows are selected with boolean masks (the same filtering style used to split
# the loans by label earlier in this notebook) rather than a Python loop that
# indexes both SArrays element by element and re-extracts the 'safe_loans' column
# on every iteration.
predicted_safe = validation_data[predictions == 1]
false_pos = len(predicted_safe[predicted_safe['safe_loans'] == -1])
print(false_pos)

1657


In [68]:
# False negatives: loans the model predicted risky (-1) that are actually safe (+1).
#
# `predictions` must be the result of decision_tree_model.predict(validation_data),
# i.e. one prediction per validation row, in the same order.
#
# The rows are selected with boolean masks (the same filtering style used to split
# the loans by label earlier in this notebook) rather than a Python loop that
# indexes both SArrays element by element and re-extracts the 'safe_loans' column
# on every iteration.
predicted_risky = validation_data[predictions == -1]
false_neg = len(predicted_risky[predicted_risky['safe_loans'] == 1])
print(false_neg)

1716
