In [22]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

In [23]:
# Load data from file
test_df = pd.read_csv('testSet.csv', na_values=['?'])
train_df = pd.read_csv('trainSet.csv', na_values=['?'])


### Preprocessing The Data

#### Turning all variables to float64
Mapping all ordinal values to integers, one-hot encoding for nominal values

- credit_history: ordinal
    - 5: all paid
    - 4: existing paid
    - 3: no credits/all paid
    - 2: delayed previously
    - 1: critical/other existing credit
    
<br>

- employment: ordinal
    - 4: >=7
    - 3: 4<=X<7
    - 2: 1<=X<4
    - 1: <1
    - 0: unemployed
    
<br>

- property_magnitude: nominal
    - one-hot encoding

appending new encoded variables to the end of the dataframe in order to keep integrity of data

In [24]:
# Dummy encoding property_magnitude
# For train set
ohe = OneHotEncoder(sparse=False)
encoded_array = ohe.fit_transform(train_df[['property_magnitude']])

labels = np.array(ohe.categories_).ravel()
encoded_df = pd.DataFrame(encoded_array, columns=labels)
train_df = pd.concat([train_df, encoded_df], axis=1)
train_df

# For test set
ohe = OneHotEncoder(sparse=False)
encoded_array = ohe.fit_transform(test_df[['property_magnitude']])

labels = np.array(ohe.categories_).ravel()
encoded_df = pd.DataFrame(encoded_array, columns=labels)
test_df = pd.concat([test_df, encoded_df], axis=1)
test_df.head()

Unnamed: 0,credit_history,credit_amount,employment,property_magnitude,age,class,'life insurance','no known property','real estate',car,NaN
0,'existing paid',10366.0,>=7,'life insurance',42.0,good,1.0,0.0,0.0,0.0,0.0
1,'critical/other existing credit',1872.0,unemployed,'no known property',36.0,good,0.0,1.0,0.0,0.0,0.0
2,'existing paid',6758.0,1<=X<4,car,31.0,bad,0.0,0.0,0.0,1.0,0.0
3,'existing paid',3857.0,1<=X<4,'life insurance',40.0,good,1.0,0.0,0.0,0.0,0.0
4,'existing paid',3190.0,1<=X<4,'real estate',24.0,bad,0.0,0.0,1.0,0.0,0.0


In [25]:

# Map employment
# For train
train_df['employment_mapped'] = train_df['employment'].map({
    '>=7' : 4,
    '4<=X<7' : 3,
    '1<=X<4': 2,
    '<1' : 1,
    'unemployed' : 0
})
#for test
test_df['employment_mapped'] = train_df['employment'].map({
    '>=7' : 4,
    '4<=X<7' : 3,
    '1<=X<4': 2,
    '<1' : 1,
    'unemployed' : 0
})
test_df.head()

Unnamed: 0,credit_history,credit_amount,employment,property_magnitude,age,class,'life insurance','no known property','real estate',car,NaN,employment_mapped
0,'existing paid',10366.0,>=7,'life insurance',42.0,good,1.0,0.0,0.0,0.0,0.0,2
1,'critical/other existing credit',1872.0,unemployed,'no known property',36.0,good,0.0,1.0,0.0,0.0,0.0,4
2,'existing paid',6758.0,1<=X<4,car,31.0,bad,0.0,0.0,0.0,1.0,0.0,4
3,'existing paid',3857.0,1<=X<4,'life insurance',40.0,good,1.0,0.0,0.0,0.0,0.0,2
4,'existing paid',3190.0,1<=X<4,'real estate',24.0,bad,0.0,0.0,1.0,0.0,0.0,4


In [26]:
# Map Credit History
# For train
train_df['credit_history_mapped'] = train_df['credit_history'].map({
    "'all paid'": 5,
    "'existing paid'": 4,
    "'no credits/all paid'": 3,
    "'delayed previously'": 2,
    "'critical/other existing credit'": 1
})
#for test
test_df['credit_history_mapped'] = test_df['credit_history'].map({
    "'all paid'": 5,
    "'existing paid'": 4,
    "'no credits/all paid'": 3,
    "'delayed previously'": 2,
    "'critical/other existing credit'": 1
})

#### Remove Missing Values
Because the instances of samples which has a missing attribute are few,
these samples can be deleted

In [27]:
# Check amount of null variables
train_df.isnull().sum()
test_df.isnull().sum()

credit_history           6
credit_amount            2
employment               3
property_magnitude       5
age                      6
class                    0
'life insurance'         0
'no known property'      0
'real estate'            0
car                      0
NaN                      0
employment_mapped        0
credit_history_mapped    6
dtype: int64

In [28]:
# Remove samples which has a missing value
train_df = train_df.dropna()
test_df = test_df.dropna()
test_df.isnull().sum()

credit_history           0
credit_amount            0
employment               0
property_magnitude       0
age                      0
class                    0
'life insurance'         0
'no known property'      0
'real estate'            0
car                      0
NaN                      0
employment_mapped        0
credit_history_mapped    0
dtype: int64

Split the dataset dependent and independet variables

Y sets are for class variable, X for decision variables

In [29]:
#Split the dataset dependent and independet variables
x_train = train_df[["'life insurance'",
                    "'no known property'", 
                    "'real estate'", 'car', 
                    'employment_mapped', 
                    'credit_history_mapped',
                    'credit_amount',
                    'age']]

x_test = test_df[["'life insurance'",
                    "'no known property'", 
                    "'real estate'", 'car', 
                    'employment_mapped', 
                    'credit_history_mapped',
                    'credit_amount',
                    'age']]

y_train = train_df.loc[:, 'class']
y_test = test_df.loc[:, 'class']

x_train.head()

Unnamed: 0,'life insurance','no known property','real estate',car,employment_mapped,credit_history_mapped,credit_amount,age
0,1.0,0.0,0.0,0.0,2,4.0,1924.0,38.0
1,0.0,1.0,0.0,0.0,4,4.0,7297.0,36.0
2,0.0,0.0,1.0,0.0,4,4.0,1278.0,36.0
3,0.0,0.0,1.0,0.0,2,4.0,2039.0,20.0
4,1.0,0.0,0.0,0.0,4,1.0,4272.0,24.0


##### Normailize Values
using min-max normalization from sci-kit learn

In [30]:
# For train
scaler = MinMaxScaler()
scaler.fit(x_train)
scaled = scaler.fit_transform(x_train)
x_train_norm = pd.DataFrame(scaled, columns=x_train.columns)
# for test
scaler = MinMaxScaler()
scaler.fit(x_test)
scaled = scaler.fit_transform(x_test)
x_test_norm = pd.DataFrame(scaled, columns=x_test.columns)

In [31]:
# Turn string values in class variable to integer
y_train = y_train.map({'good' : 1, 'bad': 0})
y_test = y_test.map({'good' : 1, 'bad': 0})
y_train.head()

0    1
1    0
2    1
3    0
4    1
Name: class, dtype: int64

#### Train Model
Gaussian Naive Bayes from sci-kit learn library

In [32]:
model = GaussianNB()
model.fit(x_train_norm, y_train)
# get prediction values in y_pred
y_pred = model.predict(x_test_norm)
y_true = np.array(y_test) # actual values

#### Run Tests

In [33]:
# HELPERS
def accuracy(y_true, y_pred):
	accuracy = np.sum(y_true == y_pred) / len(y_true)
	return accuracy


### Test Results

In [34]:
accuracy(y_true, y_pred)

0.6447368421052632

In [35]:
# TRUE POSITIVE
# Where both are 1
TP = np.sum(np.logical_and(y_pred == 1, y_true == 1)) # where both must equal 1
TP

132

In [36]:
# TRUE NEGATIVE
# Where both are 0
TN = np.sum(np.logical_and(y_pred == 0, y_true == 0))
TN

15

In [37]:
# FALSE POSITIVE
# where prediction is 1 and ture is 0
FP = np.sum(np.logical_and(y_pred == 1, y_true == 0))
FP

55

In [38]:
# FALSE NEGATIVE
# Where prediction is 0 and true is 1
FN = np.sum(np.logical_and(y_pred == 0, y_true == 1))
FN

26

In [39]:
# Alternative way to calculate accuracy
(TN + TP) / len(y_pred)

0.6447368421052632

In [40]:
# TRUE POSITIVE RATE
TP / (TP + FN)


0.8354430379746836

In [41]:
# TRUE NEGATIVE RATE
TN / (TN + FP)

0.21428571428571427

Used np.logical_and as a way of using 2 conditions in np.sum