# Models and Evaluation

In [43]:
import pandas as pd
import numpy as np

# Load the two prepared CSV files; both paths are relative to the notebook's
# working directory.
train_data = pd.read_csv('../assets/final/data.csv')
test_data = pd.read_csv('../assets/final/kaggleData.csv')

# Sanity check: (rows, columns) of each frame.
print(train_data.shape)
print(test_data.shape)

(328, 10)
(354, 10)


In [44]:
def get_features(df):
    """Return a copy of ``df`` with the ``'paid'`` column removed.

    ``df`` itself is left untouched. A ``KeyError`` is raised when ``df`` has
    no ``'paid'`` column.
    """
    return df.drop(columns='paid')
def get_target(df):
    """Return ``df`` reduced to its ``'paid'`` column, still as a DataFrame.

    Every column whose label is not ``'paid'`` is dropped from a copy, so the
    result keeps its two-dimensional shape and ``df`` is not modified.
    """
    non_target_columns = df.columns.difference(['paid'])
    return df.drop(columns=non_target_columns)

# Registry of fitted models keyed by a short model name. Entries are created by
# add_model() and later extended with predictions by predict()/predict_proba().
results = {}

### Split data

In [45]:
def split_data():
    """Split the loaded frames into features, targets and loan identifiers.

    Reads the module-level ``train_data`` and ``test_data`` frames. The
    ``'loan_id'`` column is set aside as the identifier and removed from a
    copy of each frame before the feature/target split, so the module-level
    frames are left unchanged and the function can be called repeatedly.

    Returns:
        tuple: ``(train_features, test_features, train_target, test_target,
        ids_train, ids_test)``. The features and targets are DataFrames
        produced by ``get_features`` / ``get_target``; the ids are the
        ``'loan_id'`` Series of the respective frame.

    Raises:
        KeyError: if either frame lacks the ``'loan_id'`` column.
    """
    ids_train = train_data['loan_id']
    ids_test = test_data['loan_id']

    # Drop the identifier on copies instead of in place: mutating the shared
    # frames made any second call fail with KeyError on 'loan_id'.
    train_frame = train_data.drop(columns=['loan_id'])
    test_frame = test_data.drop(columns=['loan_id'])

    train_features = get_features(train_frame)
    train_target = get_target(train_frame)
    test_features = get_features(test_frame)
    test_target = get_target(test_frame)

    return train_features, test_features, train_target, test_target, ids_train, ids_test



In [46]:
# The underscore-prefixed names hold the shared splits that add_model() reads.
_X_train, _X_test, _y_train, _y_test, ids_train, ids_test = split_data()

# Print the (rows, columns) shape of each feature frame, then of each target frame.
print('\nTrain data shape: ', _X_train.shape)
print('\nTest data shape: ', _X_test.shape)
print('\nTrain shape:',_y_train.shape)
print('\nTest shape:',_y_test.shape)



Train data shape:  (328, 8)

Test data shape:  (354, 8)

Train shape: (328, 1)

Test shape: (354, 1)


### Resampling

In [47]:
def oversample(X_train, y_train):
    """Resample a training set with SMOTE and return the resampled pair.

    The sampler is built with ``sampling_strategy=1.0`` and a fixed
    ``random_state`` of 1.

    Returns:
        tuple: ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    # Imported locally: the notebook later rebinds the global name ``SMOTE``
    # to a boolean flag, which must not hide the sampler class used here.
    from imblearn.over_sampling import SMOTE

    sampler = SMOTE(sampling_strategy=1.0, random_state=1)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, y_resampled

In [48]:
# Resampled copy of the training split; add_model() fits on it when use_smote is true.
_smote_X_train, _smote_y_train = oversample(_X_train, _y_train)

# Shapes after resampling (features, then target).
print('\nTrain data shape: ', _smote_X_train.shape)
print('\nTrain shape:',_smote_y_train.shape)



Train data shape:  (564, 8)

Train shape: (564, 1)


### Algorithms

In [49]:
def add_model(name, model, use_smote=False):
    """Fit ``model`` and register it in ``results`` under ``name``.

    Args:
        name: Key under which the entry is stored in ``results``.
        model: Estimator exposing ``fit(X, y)``; it is fitted in place.
        use_smote: When true, fit on the resampled training set
            (``_smote_X_train`` / ``_smote_y_train``) instead of the original
            one (``_X_train`` / ``_y_train``).

    The stored entry holds the fitted model, copies of the original
    (non-resampled) train and test features, the flattened train/test targets
    and ``fit_time``, the wall-clock duration of ``fit`` in milliseconds.
    Note that ``X_train`` / ``y_train`` in the entry are the original training
    data even when the model was fitted on the resampled set.
    """
    import time

    X_train = _X_train.copy()
    X_test = _X_test.copy()

    # Choose the fitting data up front so only the fit itself is timed, and
    # the resampled frame is copied only when it is actually used.
    if use_smote:
        fit_X = _smote_X_train.copy()
        fit_y = np.ravel(_smote_y_train.values)
    else:
        fit_X = X_train
        fit_y = np.ravel(_y_train.values)

    # perf_counter() measures the whole elapsed interval; the previous
    # ``timedelta.microseconds`` only held the sub-second component, so fits
    # lasting a second or more were under-reported.
    start = time.perf_counter()
    model.fit(fit_X, fit_y)
    elapsed_ms = (time.perf_counter() - start) * 1000

    results[name] = {'model': model,
                    'X_train': X_train,
                    'X_test': X_test,
                    'y_train': np.ravel(_y_train.values),
                    'y_test': np.ravel(_y_test.values),
                    'fit_time': elapsed_ms  # milliseconds
                    }

In [50]:
# Global switch passed as use_smote to every add_model() call below.
# NOTE(review): this name is the same as imblearn's SMOTE class; oversample()
# imports that class locally, so it is not affected by this rebinding.
SMOTE = True

In [51]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. random_state is fixed so that
# re-running the notebook yields the same fitted tree and predictions.
add_model('dtc',
    DecisionTreeClassifier(random_state=1),
    use_smote=SMOTE
)

In [52]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with the library's default settings.
add_model('knn', KNeighborsClassifier(), use_smote=SMOTE)

In [53]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with default hyper-parameters. random_state is fixed
# so that the fitted network is reproducible between runs.
add_model('mlp',
    MLPClassifier(random_state=1),
    use_smote=SMOTE
)

In [54]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes classifier with the library's default settings.
add_model('gnb', GaussianNB(), use_smote=SMOTE)

In [55]:
from sklearn.svm import SVC

# probability=True is required because predict_proba() is called on every
# registered model later on. random_state is fixed so that the probability
# estimates are reproducible between runs.
add_model('svc',
    SVC(probability=True, random_state=1),
    use_smote=SMOTE
)

In [56]:
from sklearn.ensemble import RandomForestClassifier

# This is the model whose probabilities are written to the submission file,
# so random_state is fixed to make that file reproducible between runs.
add_model('rf',
    RandomForestClassifier(criterion='entropy', max_depth=3, max_features='sqrt', n_estimators=150, random_state=1),
    use_smote=SMOTE
)

## Prediction

In [57]:
def predict(name, isTrain=False):
    """Store class predictions of the model registered under ``name``.

    Predicts on the entry's ``'X_train'`` when ``isTrain`` is true, otherwise
    on ``'X_test'``, and saves the output in the same ``results`` entry under
    ``'trainpred'`` or ``'testpred'`` respectively. Returns nothing.
    """
    split = 'test'
    if isTrain:
        split = 'train'
    entry = results[name]
    entry[split+'pred'] = entry['model'].predict(entry['X_'+split])

# Test-set predictions for every registered model first, then training-set ones.
for use_train in (False, True):
    for name in results:
        predict(name, isTrain=use_train)

In [58]:
def predict_proba(name, isTrain=False):
    """Store class probabilities of the model registered under ``name``.

    Calls the model's ``predict_proba`` on the entry's ``'X_train'`` when
    ``isTrain`` is true, otherwise on ``'X_test'``, and saves the output in the
    same ``results`` entry under ``'trainpred_prob'`` or ``'testpred_prob'``
    respectively. Returns nothing.
    """
    split = 'test'
    if isTrain:
        split = 'train'
    entry = results[name]
    entry[split+'pred_prob'] = entry['model'].predict_proba(entry['X_'+split])

# Test-set probabilities for every registered model first, then training-set ones.
for use_train in (False, True):
    for name in results:
        predict_proba(name, isTrain=use_train)


In [59]:
import os

RES_PATH = '../assets/results'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it also creates
# missing parent directories and cannot fail because the directory appeared
# between the check and the creation.
os.makedirs(RES_PATH, exist_ok=True)

# One row per test loan: its id and column 0 of the random forest's
# predict_proba output.
# NOTE(review): column 0 should correspond to the first label in the model's
# classes_ - confirm this is the class the competition expects to be scored.
res = {'Id': ids_test.values, 'Predicted': results['rf']['testpred_prob'][:,0]}

df = pd.DataFrame(data=res)
df.to_csv(RES_PATH + '/submission.csv', index=False)

df

Unnamed: 0,Id,Predicted
0,5895,0.220046
1,7122,0.827765
2,6173,0.556971
3,6142,0.161682
4,5358,0.533870
...,...,...
349,4989,0.548086
350,5221,0.425780
351,6402,0.470108
352,5346,0.505537


In [60]:
# Raw predict_proba output of the random forest on the test features
# (one row per test sample, one column per class).
print(results['rf']['testpred_prob'])

[[0.22004552 0.77995448]
 [0.82776547 0.17223453]
 [0.55697089 0.44302911]
 [0.16168198 0.83831802]
 [0.53387033 0.46612967]
 [0.21495708 0.78504292]
 [0.29526331 0.70473669]
 [0.16990116 0.83009884]
 [0.45180694 0.54819306]
 [0.48162062 0.51837938]
 [0.40290575 0.59709425]
 [0.45492531 0.54507469]
 [0.45607979 0.54392021]
 [0.55917659 0.44082341]
 [0.41439107 0.58560893]
 [0.24652766 0.75347234]
 [0.44191068 0.55808932]
 [0.21970795 0.78029205]
 [0.17135043 0.82864957]
 [0.40055046 0.59944954]
 [0.4160924  0.5839076 ]
 [0.23647823 0.76352177]
 [0.76685784 0.23314216]
 [0.25920435 0.74079565]
 [0.22778185 0.77221815]
 [0.40654458 0.59345542]
 [0.43077564 0.56922436]
 [0.48596019 0.51403981]
 [0.43663443 0.56336557]
 [0.43663443 0.56336557]
 [0.44827668 0.55172332]
 [0.40656244 0.59343756]
 [0.41170502 0.58829498]
 [0.41837595 0.58162405]
 [0.13259021 0.86740979]
 [0.16664759 0.83335241]
 [0.41079381 0.58920619]
 [0.43123059 0.56876941]
 [0.48446916 0.51553084]
 [0.44776951 0.55223049]


In [61]:
def submission(decimals=None):
    """Re-write ``RES_PATH + '/submission.csv'``, optionally rounding predictions.

    The file is read into a DataFrame and written back without the index.

    Args:
        decimals: Number of decimal places to round the ``'Predicted'`` column
            to before writing. ``None`` (the default) writes the values back
            without rounding, which is what this function did before the
            parameter existed; pass ``1`` for the one-decimal rounding the
            original description mentioned.
    """
    path = RES_PATH + '/submission.csv'
    df = pd.read_csv(path)
    if decimals is not None:
        df['Predicted'] = df['Predicted'].round(decimals)
    df.to_csv(path, index=False)

# Run the post-processing step on the submission file written earlier.
submission()

# TODO: Show accuracy of model with train data