## Development of Models

In [57]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline

In [58]:
# helper function for submission
def create_submission(model, test_data):
    predictions = model.predict_proba(test_data)
    test_data["loan_status"] = predictions[:, 1]
    return test_data.reset_index()[["id", "predictions"]]

### Load Data

In [59]:
# load in training data
train = pd.read_csv("data/train.csv", index_col="id")
validation = train.sample(1000, random_state=42)
train = train.loc[~train.index.isin(validation.index)]
y_train, y_validation = train.pop("loan_status"), validation.pop("loan_status")

# ensuring split worked
print(f"training shape: {train.shape}")
print(f"validation shape: {validation.shape}")

training shape: (57645, 11)
validation shape: (1000, 11)


### First Model = Naive Decision Tree

This first model is a naive model, used for baseline. This will show how easy this data
can be modeled.

#### Build Pipeline

In [60]:
numeric_features = [
    "person_age", "person_income", "person_emp_length", "loan_amnt",
    "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length"
]
categorical_features = [
    "person_home_ownership", "loan_intent",
    "cb_person_default_on_file"
]
ordinal_features = ["loan_grade"]

In [61]:
numeric_transforms = make_pipeline(StandardScaler())
categorical_transforms = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
ordinal_transforms = make_pipeline(
    OrdinalEncoder()
)

# column transformer
preprocessing = ColumnTransformer(transformers=[
    ("numeric", numeric_transforms, numeric_features),
    ("categorical", categorical_transforms, categorical_features),
    ("ordinal", ordinal_transforms, ordinal_features)
])

In [62]:
train.head()

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3


In [63]:
tree = make_pipeline(
    preprocessing,
    DecisionTreeClassifier(random_state=42)
)
tree.fit(train, y_train)

In [64]:
tree.score(validation, y_validation)

0.927

In [65]:
output = create_submission(tree, pd.read_csv("data/test.csv", index_col="id"))

KeyError: "['predictions'] not in index"

In [56]:
output.to_csv("output/dt.csv", index=False)