In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import scipy.stats as stats

In [28]:
# Read the credit records from the relative path data/credit_data.csv.
dataset = pd.read_csv(filepath_or_buffer="data/credit_data.csv")

# Preview the first five rows to check the columns that were parsed.
dataset.head(n=5)

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [29]:
# Discard every row that contains at least one missing value.
# The result is rebound to `dataset` instead of using `inplace=True`, so the
# frame object that earlier cells displayed is never mutated behind their back
# and the step stays chainable.
dataset = dataset.dropna()

# (rows, columns) remaining after the cleanup.
dataset.shape

(1997, 5)

In [30]:
# Feature matrix: the three numeric predictor columns, as a NumPy array.
X = dataset.loc[:, ["income", "age", "loan"]].to_numpy()
# Target vector: the default flag column, as a NumPy array.
y = dataset.loc[:, "c#default"].to_numpy()

In [31]:
# Repeated hold-out evaluation: 30 stratified 80/20 splits, with all three
# classifiers trained and scored on the same split in each repetition.
# One test-set accuracy per repetition is collected for each classifier.
results_naive_bayes = []
results_logistic_regression = []
results_random_forest = []

for i in range(30):
    # Seeding the split with the iteration index gives a different partition
    # per repetition while keeping every repetition reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=i
    )

    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    results_naive_bayes.append(accuracy_score(y_test, naive_bayes.predict(X_test)))

    logistic_regression = LogisticRegression()
    logistic_regression.fit(X_train, y_train)
    results_logistic_regression.append(
        accuracy_score(y_test, logistic_regression.predict(X_test))
    )

    # A random forest is stochastic; without a seed its accuracy changes on
    # every run. Tie the seed to the split seed so that a fresh
    # "Restart Kernel -> Run All" reproduces the reported numbers.
    random_forest = RandomForestClassifier(random_state=i)
    random_forest.fit(X_train, y_train)
    results_random_forest.append(accuracy_score(y_test, random_forest.predict(X_test)))

#### Working with the results

In [32]:
# Convert the three accuracy collections to NumPy arrays in one step.
results_naive_bayes, results_logistic_regression, results_random_forest = map(
    np.array,
    (results_naive_bayes, results_logistic_regression, results_random_forest),
)

In [33]:
# Mean test accuracy of each classifier over the repeated splits.
print(
    f"Naive Bayes: {np.mean(results_naive_bayes)}",
    f"Logistic Regression: {np.mean(results_logistic_regression)}",
    f"Random Forest: {np.mean(results_random_forest)}",
    sep="\n",
)

Naive Bayes: 0.92425
Logistic Regression: 0.9463333333333334
Random Forest: 0.9844166666666667


In [34]:
# Most frequent accuracy value (with its count) for each classifier.
print(
    "\n".join(
        [
            f"Naive Bayes: {stats.mode(results_naive_bayes)}",
            f"Logistic Regression: {stats.mode(results_logistic_regression)}",
            f"Random Forest: {stats.mode(results_random_forest)}",
        ]
    )
)

Naive Bayes: ModeResult(mode=0.9175, count=5)
Logistic Regression: ModeResult(mode=0.9425, count=4)
Random Forest: ModeResult(mode=0.9875, count=9)


In [35]:
# Median test accuracy of each classifier.
print(
    f"Naive Bayes: {np.median(results_naive_bayes)}",
    f"Logistic Regression: {np.median(results_logistic_regression)}",
    f"Random Forest: {np.median(results_random_forest)}",
    sep="\n",
)

Naive Bayes: 0.925
Logistic Regression: 0.945
Random Forest: 0.985


In [36]:
# Variance of the accuracies per classifier (np.var with its default settings).
# Each variance is computed once and reused for the min/max summary lines.
variances = [
    np.var(results_naive_bayes),
    np.var(results_logistic_regression),
    np.var(results_random_forest),
]

print(f"Naive Bayes: {variances[0]}")
print(f"Logistic Regression: {variances[1]}")
print(f"Random Forest: {variances[2]}")
print("")
# Smallest and largest variance among the three classifiers.
print(f"Min: {np.min(variances)}")
print(f"Max: {np.max(variances)}")

Naive Bayes: 8.756250000000001e-05
Logistic Regression: 9.738888888888882e-05
Random Forest: 2.611805555555568e-05

Min: 2.611805555555568e-05
Max: 9.738888888888882e-05


In [37]:
# Standard deviation of the accuracies for each classifier.
print(
    "\n".join(
        [
            f"Naive Bayes: {np.std(results_naive_bayes)}",
            f"Logistic Regression: {np.std(results_logistic_regression)}",
            f"Random Forest: {np.std(results_random_forest)}",
        ]
    )
)

Naive Bayes: 0.00935748363610645
Logistic Regression: 0.009868580895391638
Random Forest: 0.005110582702154


In [38]:
# Coefficient of variation of the accuracies, scaled by 100 to read as a percentage.
print(
    f"Naive Bayes:  {stats.variation(results_naive_bayes) * 100}",
    f"Logistic Regression:  {stats.variation(results_logistic_regression) * 100}",
    f"Random Forest:  {stats.variation(results_random_forest) * 100}",
    sep="\n",
)

Naive Bayes:  1.0124407504578252
Logistic Regression:  1.042822919555298
Random Forest:  0.5191483317180055


### Exercise

In [39]:
# 10-fold cross-validation of each classifier on the full data set.
# Fresh estimator instances are passed in so this cell does not depend on
# whichever objects the hold-out loop happened to leave in the namespace, and
# the random forest is seeded so the fold scores are reproducible across runs.
naive_bayes_cv = cross_val_score(GaussianNB(), X, y, cv=10)
logistic_regression_cv = cross_val_score(LogisticRegression(), X, y, cv=10)
random_forest_cv = cross_val_score(
    RandomForestClassifier(random_state=0), X, y, cv=10
)

In [40]:
# Mean accuracy over the cross-validation folds for each classifier.
print(
    "\n".join(
        [
            f"Naive Bayes CV: {np.mean(naive_bayes_cv)}",
            f"Logistic Regression CV: {np.mean(logistic_regression_cv)}",
            f"Random Forest CV: {np.mean(random_forest_cv)}",
        ]
    )
)

Naive Bayes CV: 0.9238743718592964
Logistic Regression CV: 0.9459095477386933
Random Forest CV: 0.9874748743718593


### Comparison

In [41]:
# Side-by-side view: mean accuracy from the repeated hold-out splits first,
# then (after a blank line) mean accuracy from cross-validation.
print(
    f"Naive Bayes: {np.mean(results_naive_bayes)}",
    f"Logistic Regression: {np.mean(results_logistic_regression)}",
    f"Random Forest: {np.mean(results_random_forest)}",
    "",
    f"Naive Bayes CV: {np.mean(naive_bayes_cv)}",
    f"Logistic Regression CV: {np.mean(logistic_regression_cv)}",
    f"Random Forest CV: {np.mean(random_forest_cv)}",
    sep="\n",
)

Naive Bayes: 0.92425
Logistic Regression: 0.9463333333333334
Random Forest: 0.9844166666666667

Naive Bayes CV: 0.9238743718592964
Logistic Regression CV: 0.9459095477386933
Random Forest CV: 0.9874748743718593


#### Manual mode

In [43]:
# Repeated 10-fold cross-validation: 30 repetitions, each with a differently
# shuffled fold assignment (seeded by the iteration index). For every
# repetition the mean fold accuracy of each classifier is recorded.
result_naive_bayes_cv = []
result_logistic_regression_cv = []
result_random_forest_cv = []

for i in range(30):
    # NOTE(review): KFold does not stratify by class, unlike the stratified
    # hold-out splits used earlier; consider StratifiedKFold if class balance
    # per fold matters here.
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    # Per-repetition scores use their own names so they do not overwrite the
    # `*_cv` arrays of the single cross-validation run, which the comparison
    # cells still read. Fresh estimators avoid depending on objects left over
    # from the hold-out loop.
    fold_scores_naive_bayes = cross_val_score(GaussianNB(), X, y, cv=kfold)
    result_naive_bayes_cv.append(fold_scores_naive_bayes.mean())

    fold_scores_logistic_regression = cross_val_score(
        LogisticRegression(), X, y, cv=kfold
    )
    result_logistic_regression_cv.append(fold_scores_logistic_regression.mean())

    # Seed the forest with the repetition index so results are reproducible.
    fold_scores_random_forest = cross_val_score(
        RandomForestClassifier(random_state=i), X, y, cv=kfold
    )
    result_random_forest_cv.append(fold_scores_random_forest.mean())

# Convert to arrays for the descriptive statistics that follow.
result_naive_bayes_cv = np.array(result_naive_bayes_cv)
result_logistic_regression_cv = np.array(result_logistic_regression_cv)
result_random_forest_cv = np.array(result_random_forest_cv)

In [44]:
# Coefficient of variation (times 100) of the per-repetition mean CV accuracies.
print(
    f"Naive Bayes CV: {stats.variation(result_naive_bayes_cv) * 100}",
    f"Logistic Regression CV: {stats.variation(result_logistic_regression_cv) * 100}",
    f"Random Forest CV: {stats.variation(result_random_forest_cv) * 100}",
    sep="\n",
)

Naive Bayes CV: 0.08641071566366061
Logistic Regression CV: 0.10802610833013937
Random Forest CV: 0.17375612517782416
