In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [2]:
transformer = joblib.load("../Models/pipelines/pipeline1.joblib")

In [3]:
transformer.transform([['male', 46, 2000, 'own']])



array([[0.48214286, 0.09629141, 0.        , 1.        , 0.        ,
        1.        , 0.        ]])

In [4]:
df = pd.read_csv("../data/german_credit_data-3.csv")

In [5]:
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [6]:
features = ['Sex','Age','Credit amount','Housing']
y_train = (train['Risk']=='bad').astype('int')
X_train = train[features]
X_train_transformed = transformer.transform(X_train)

In [7]:
cross_v = cross_validate(LogisticRegression(), X_train_transformed , y_train, cv=5, return_train_score=True,
              scoring=['accuracy'])

print("{} +- {}". format(cross_v["test_accuracy"].mean(),
                         cross_v["test_accuracy"].std()))

0.7014285714285714 +- 0.014568627181693668


In [8]:
cross_v

{'fit_time': array([0.00728202, 0.0062902 , 0.00656509, 0.00587106, 0.00483799]),
 'score_time': array([0.00043321, 0.00037289, 0.00033689, 0.00032687, 0.00030112]),
 'test_accuracy': array([0.70714286, 0.69285714, 0.67857143, 0.72142857, 0.70714286]),
 'train_accuracy': array([0.70178571, 0.70535714, 0.70892857, 0.69821429, 0.70178571])}

In [9]:
clf = LogisticRegression()
clf.fit(X_train_transformed, y_train)

LogisticRegression()

In [10]:
clf.predict_proba(transformer.transform([['male', 46, 2000, 'own']]))



array([[0.83685017, 0.16314983]])

In [11]:
clf.predict(transformer.transform([['male', 46, 2000, 'own']]))



array([0])

In [12]:
cross_v_1 = cross_validate(KNeighborsClassifier(n_neighbors=20), X_train_transformed , y_train, cv=5, return_train_score=True,
              scoring=['accuracy'])
print("{} +- {}". format(cross_v_1["test_accuracy"].mean(),
                         cross_v_1["test_accuracy"].std()))

0.6928571428571428 +- 0.011952286093343947


In [13]:
cross_v_2 = cross_validate(DecisionTreeClassifier(max_depth=3), X_train_transformed , y_train, cv=5, return_train_score=True,
              scoring=['accuracy'])
print("Train {} +- {}". format(cross_v_1["train_accuracy"].mean(),
                         cross_v_1["train_accuracy"].std()))
print("Validation {} +- {}". format(cross_v_1["test_accuracy"].mean(),
                         cross_v_1["test_accuracy"].std()))

Train 0.7121428571428571 +- 0.007354021529276434
Validation 0.6928571428571428 +- 0.011952286093343947


In [14]:
cross_v_3 = cross_validate(RandomForestClassifier(n_estimators=100,max_depth=4), X_train_transformed , y_train, cv=5, return_train_score=True,
              scoring=['accuracy'])
print("Train {} +- {}". format(cross_v_1["train_accuracy"].mean(),
                         cross_v_1["train_accuracy"].std()))
print("Validation {} +- {}". format(cross_v_1["test_accuracy"].mean(),
                         cross_v_1["test_accuracy"].std()))

Train 0.7121428571428571 +- 0.007354021529276434
Validation 0.6928571428571428 +- 0.011952286093343947


In [15]:
cross_v_4 = cross_validate(SVC(kernel='linear', C=5), X_train_transformed , y_train, cv=5, return_train_score=True,
              scoring=['accuracy'])
print("Train {} +- {}". format(cross_v_1["train_accuracy"].mean(),
                         cross_v_1["train_accuracy"].std()))
print("Validation {} +- {}". format(cross_v_1["test_accuracy"].mean(),
                         cross_v_1["test_accuracy"].std()))

Train 0.7121428571428571 +- 0.007354021529276434
Validation 0.6928571428571428 +- 0.011952286093343947


In [16]:
model = RandomForestClassifier()

In [17]:
model.fit(X_train_transformed, y_train)

RandomForestClassifier()

In [18]:
joblib.dump(model, "../Models/estimators/model01.joblib")

['../Models/estimators/model01.joblib']

['../Models/estimators/model02_new_features.joblib']