In [94]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler
import statistics

In [3]:
dataset = pd.read_csv("data/credit_data.csv")

dataset.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [4]:
dataset.dropna(inplace=True)

dataset.shape

(1997, 5)

In [5]:
X = dataset[["income", "age", "loan"]].values
y = dataset["c#default"].values

In [6]:
results_naive_bayes = []
results_logistic_regression = []
results_random_forest = []

for i in range(30):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=i
    )

    naive_bayes = GaussianNB()
    naive_bayes.fit(X_train, y_train)
    results_naive_bayes.append(accuracy_score(y_test, naive_bayes.predict(X_test)))

    logistic_regression = LogisticRegression()
    logistic_regression.fit(X_train, y_train)
    results_logistic_regression.append(
        accuracy_score(y_test, logistic_regression.predict(X_test))
    )

    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    results_random_forest.append(accuracy_score(y_test, random_forest.predict(X_test)))

#### Working with the results

In [7]:
results_naive_bayes = np.array(results_naive_bayes)
results_logistic_regression = np.array(results_logistic_regression)
results_random_forest = np.array(results_random_forest)

In [8]:
print(f"Naive Bayes: {np.mean(results_naive_bayes)}")
print(f"Logistic Regression: {np.mean(results_logistic_regression)}")
print(f"Random Forest: {np.mean(results_random_forest)}")

Naive Bayes: 0.92425
Logistic Regression: 0.9463333333333334
Random Forest: 0.98425


In [9]:
print(f"Naive Bayes: {stats.mode(results_naive_bayes)}")
print(f"Logistic Regression: {stats.mode(results_logistic_regression)}")
print(f"Random Forest: {stats.mode(results_random_forest)}")

Naive Bayes: ModeResult(mode=0.9175, count=5)
Logistic Regression: ModeResult(mode=0.9425, count=4)
Random Forest: ModeResult(mode=0.9875, count=6)


In [10]:
print(f"Naive Bayes: {np.median(results_naive_bayes)}")
print(f"Logistic Regression: {np.median(results_logistic_regression)}")
print(f"Random Forest: {np.median(results_random_forest)}")

Naive Bayes: 0.925
Logistic Regression: 0.945
Random Forest: 0.985


In [11]:
print(f"Naive Bayes: {np.var(results_naive_bayes)}")
print(f"Logistic Regression: {np.var(results_logistic_regression)}")
print(f"Random Forest: {np.var(results_random_forest)}")
print("")
print(
    f"Min: {np.min([np.var(results_naive_bayes), np.var(results_logistic_regression), np.var(results_random_forest)])}"
)
print(
    f"Max: {np.max([np.var(results_naive_bayes), np.var(results_logistic_regression), np.var(results_random_forest)])}"
)

Naive Bayes: 8.756250000000001e-05
Logistic Regression: 9.738888888888882e-05
Random Forest: 3.1312500000000056e-05

Min: 3.1312500000000056e-05
Max: 9.738888888888882e-05


In [12]:
print(f"Naive Bayes: {np.std(results_naive_bayes)}")
print(f"Logistic Regression: {np.std(results_logistic_regression)}")
print(f"Random Forest: {np.std(results_random_forest)}")

Naive Bayes: 0.00935748363610645
Logistic Regression: 0.009868580895391638
Random Forest: 0.005595757321399853


In [13]:
print(f"Naive Bayes:  {stats.variation(results_naive_bayes) * 100}")
print(f"Logistic Regression:  {stats.variation(results_logistic_regression) * 100}")
print(f"Random Forest:  {stats.variation(results_random_forest) * 100}")

Naive Bayes:  1.0124407504578252
Logistic Regression:  1.042822919555298
Random Forest:  0.568530080914387


### Exercise

In [14]:
naive_bayes_cv = cross_val_score(naive_bayes, X, y, cv=10)
logistic_regression_cv = cross_val_score(logistic_regression, X, y, cv=10)
random_forest_cv = cross_val_score(random_forest, X, y, cv=10)

In [15]:
print(f"Naive Bayes CV: {np.mean(naive_bayes_cv)}")
print(f"Logistic Regression CV: {np.mean(logistic_regression_cv)}")
print(f"Random Forest CV: {np.mean(random_forest_cv)}")

Naive Bayes CV: 0.9238743718592964
Logistic Regression CV: 0.9459095477386933
Random Forest CV: 0.9869798994974875


### Comparasion

In [16]:
print(f"Naive Bayes: {np.mean(results_naive_bayes)}")
print(f"Logistic Regression: {np.mean(results_logistic_regression)}")
print(f"Random Forest: {np.mean(results_random_forest)}")
print("")
print(f"Naive Bayes CV: {np.mean(naive_bayes_cv)}")
print(f"Logistic Regression CV: {np.mean(logistic_regression_cv)}")
print(f"Random Forest CV: {np.mean(random_forest_cv)}")

Naive Bayes: 0.92425
Logistic Regression: 0.9463333333333334
Random Forest: 0.98425

Naive Bayes CV: 0.9238743718592964
Logistic Regression CV: 0.9459095477386933
Random Forest CV: 0.9869798994974875


#### Manual mode

In [17]:
result_naive_bayes_cv = []
result_logistic_regression_cv = []
result_random_forest_cv = []

for i in range(30):
    kfold = KFold(n_splits=10, shuffle=True, random_state=i)

    naive_bayes_cv = cross_val_score(naive_bayes, X, y, cv=kfold)
    result_naive_bayes_cv.append(naive_bayes_cv.mean())

    logistic_regression_cv = cross_val_score(logistic_regression, X, y, cv=kfold)
    result_logistic_regression_cv.append(logistic_regression_cv.mean())

    random_forest_cv = cross_val_score(random_forest, X, y, cv=kfold)
    result_random_forest_cv.append(random_forest_cv.mean())

result_naive_bayes_cv = np.array(result_naive_bayes_cv)
result_logistic_regression_cv = np.array(result_logistic_regression_cv)
result_random_forest_cv = np.array(result_random_forest_cv)

In [18]:
print(f"Naive Bayes CV: {stats.variation(result_naive_bayes_cv) * 100}")
print(f"Logistic Regression CV: {stats.variation(result_logistic_regression_cv) * 100}")
print(f"Random Forest CV: {stats.variation(result_random_forest_cv) * 100}")

Naive Bayes CV: 0.08641071566366061
Logistic Regression CV: 0.10802610833013937
Random Forest CV: 0.1581744284839362


#### Selecting attributes with variance

In [None]:
base_selection = {
    "a": np.random.rand(20),
    "b": np.array([0.5] * 20),
    "class": np.random.randint(0, 2, 20),
}

base_selection

{'a': array([0.30090108, 0.73090579, 0.4363391 , 0.22115279, 0.48434177,
        0.50878436, 0.96096013, 0.95304298, 0.3323235 , 0.04670804,
        0.5251064 , 0.27723924, 0.18943158, 0.88772055, 0.79319   ,
        0.71880896, 0.54846051, 0.86908236, 0.30994523, 0.74797554]),
 'b': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'class': array([1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1])}

In [20]:
base_selection = pd.DataFrame(base_selection)

base_selection.head()

Unnamed: 0,a,b,class
0,0.300901,0.5,1
1,0.730906,0.5,1
2,0.436339,0.5,0
3,0.221153,0.5,1
4,0.484342,0.5,1


In [21]:
base_selection.describe()

Unnamed: 0,a,b,class
count,20.0,20.0,20.0
mean,0.542121,0.5,0.6
std,0.277012,0.0,0.502625
min,0.046708,0.5,0.0
25%,0.307684,0.5,0.0
50%,0.516945,0.5,1.0
75%,0.759279,0.5,1.0
max,0.96096,0.5,1.0


In [31]:
np.var(base_selection["a"]), np.var(base_selection["b"])

(0.07289897984416614, 0.0)

In [32]:
X = base_selection[["a", "b"]].values
y = base_selection["class"].values

In [34]:
selection = VarianceThreshold(threshold=0.05)
new_X = selection.fit(X)

new_X = selection.transform(X)

new_X

array([[0.30090108],
       [0.73090579],
       [0.4363391 ],
       [0.22115279],
       [0.48434177],
       [0.50878436],
       [0.96096013],
       [0.95304298],
       [0.3323235 ],
       [0.04670804],
       [0.5251064 ],
       [0.27723924],
       [0.18943158],
       [0.88772055],
       [0.79319   ],
       [0.71880896],
       [0.54846051],
       [0.86908236],
       [0.30994523],
       [0.74797554]])

In [35]:
selection.variances_

array([0.07289898, 0.        ])

In [36]:
np.where(selection.variances_ > 0.05)

(array([0]),)

#### Exercise

In [64]:
credit_data = pd.read_csv("data/credit_data.csv")

credit_data.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [65]:
credit_data.dropna(inplace=True)

In [66]:
credit_data.describe()

Unnamed: 0,i#clientid,income,age,loan,c#default
count,1997.0,1997.0,1997.0,1997.0,1997.0
mean,1001.956935,45333.864334,40.807559,4445.487716,0.141713
std,576.702206,14325.131177,13.624469,3046.792457,0.348842
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,503.0,32804.904487,28.990415,1936.813257,0.0
50%,1002.0,45788.7471,41.317159,3977.287432,0.0
75%,1501.0,57787.565659,52.58704,6440.861434,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [67]:
np.var(credit_data["income"]), np.var(credit_data["age"]), np.var(credit_data["loan"])

(205106624.4066085, 185.53321543372624, 9278295.832931679)

In [68]:
credit_data.var()

i#clientid    3.325854e+05
income        2.052094e+08
age           1.856262e+02
loan          9.282944e+06
c#default     1.216911e-01
dtype: float64

In [69]:
X = credit_data[["income", "age", "loan"]].values
y = credit_data["c#default"].values

#### Normalization

In [None]:
scaler = MinMaxScaler()

X = scaler.fit_transform(X)

In [71]:
np.var(X[:, 0]), np.var(X[:, 1]), np.var(X[:, 2])

(0.08210439343522123, 0.013694697910033629, 0.04897070767524547)

In [74]:
selection = VarianceThreshold(threshold=0.014)

new_X = selection.fit(X)

new_X = selection.transform(X)

new_X

array([[0.9231759 , 0.58883739],
       [0.28812165, 0.47682695],
       [0.74633429, 0.58262011],
       ...,
       [0.48612202, 0.40112895],
       [0.47500998, 0.1177903 ],
       [0.98881367, 0.53597028]])

In [None]:
naive_bayes_no_selection = GaussianNB()
naive_bayes_no_selection.fit(X, y)
naive_bayes_no_selection_predictions = naive_bayes_no_selection.predict(X)
accuracy_score(y, naive_bayes_no_selection_predictions)

0.9253880821231848

In [76]:
naive_bayes_selection = GaussianNB()
naive_bayes_selection.fit(new_X, y)
naive_bayes_selection_predictions = naive_bayes_selection.predict(new_X)
accuracy_score(y, naive_bayes_selection_predictions)

0.8472709063595393

### Missing Values

In [86]:
credit_data = pd.read_csv("data/credit_data.csv")

credit_data.isnull().sum()

i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64

In [87]:
credit_null = credit_data[credit_data.isnull().any(axis=1)]

credit_null

Unnamed: 0,i#clientid,income,age,loan,c#default
28,29,59417.805406,,2082.625938,0
30,31,48528.852796,,6155.78467,0
31,32,23526.302555,,2862.010139,0


In [None]:
credit_data["age"].mean(), credit_data["age"].median()

(40.80755937840458, 41.3171591130085)

In [89]:
credit_data.fillna({"age": credit_data["age"].mean()}, inplace=True)

credit_data.isnull().sum()

i#clientid    0
income        0
age           0
loan          0
c#default     0
dtype: int64

##### Autos data

In [90]:
autos_data = pd.read_csv("data/autos.csv", encoding="ISO-8859-1")

autos_data.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [91]:
autos_data.isnull().sum()

dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType               33386
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64

In [93]:
autos_data['fuelType'].unique()

array(['benzin', 'diesel', nan, 'lpg', 'andere', 'hybrid', 'cng',
       'elektro'], dtype=object)

In [92]:
autos_data['fuelType'].value_counts()

fuelType
benzin     223857
diesel     107746
lpg          5378
cng           571
hybrid        278
andere        208
elektro       104
Name: count, dtype: int64

In [97]:
statistics.mode(autos_data['fuelType'])

'benzin'

In [98]:
autos_data.fillna({"fuelType": "benzin"}, inplace=True)

autos_data['fuelType'].unique()

array(['benzin', 'diesel', 'lpg', 'andere', 'hybrid', 'cng', 'elektro'],
      dtype=object)