In [100]:
# Import library
import pandas as pd
import numpy as np
from sklearn.base import TransformerMixin
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report

In [101]:
# Read data
dataset = pd.read_csv('Data.csv')
dataset.head(10)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [54]:
# Split labels
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 14].values

In [55]:
print(X)

[[50 ' Self-emp-not-inc' 83311 ... 0 13 ' United-States']
 [38 ' Private' 215646 ... 0 40 ' United-States']
 [53 ' Private' 234721 ... 0 40 ' United-States']
 ...
 [58 ' Private' 151910 ... 0 40 ' United-States']
 [22 ' Private' 201490 ... 0 20 ' United-States']
 [52 ' Self-emp-inc' 287927 ... 0 40 ' United-States']]


In [56]:
print(X[26,3])

 Some-college


In [57]:
print(y)

[' <=50K' ' <=50K' ' <=50K' ... ' <=50K' ' <=50K' ' >50K']


In [58]:
# To Categoric salary  >'50K' =====> 1  and  <'50K' =====> 0
for n, i in enumerate(y):
    if i == ' >=50K' or i == ' >50K':
        y[n] = 1
    elif i == ' <=50K' or i == ' <50K':
        y[n] = 0
    else:
        y[n]=2

In [59]:
print(y)

[0 0 0 ... 0 0 1]


In [60]:
# Replace misvalues with numpy nan
for j in range(0, 32559):
    for n, i in enumerate(X[j]):
        if i == ' ?':
            X[j,n] = np.NAN

In [61]:
print(X[26])

[54 nan 180211 ' Some-college' 10 ' Married-civ-spouse' nan ' Husband'
 ' Asian-Pac-Islander' ' Male' 0 0 60 ' South']


In [62]:
# Transforming data
class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
            index=X.columns)

        return self
      
    def transform(self, X, y=None):
        return X.fillna(self.fill)
        

In [63]:
# Fit and Transform data
Xt = pd.DataFrame(X)
xt = DataFrameImputer().fit_transform(Xt)
X = xt

In [64]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32555,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States
32556,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States
32557,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States
32558,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States


In [65]:
categorical_feature_mask = X.dtypes==object

# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()

In [66]:
categorical_cols

[1, 3, 5, 6, 7, 8, 9, 13]

In [67]:
encoder = OrdinalEncoder()
X = pd.DataFrame.to_numpy(X)
X[26]

array([54, ' Private', 180211, ' Some-college', 10, ' Married-civ-spouse',
       ' Prof-specialty', ' Husband', ' Asian-Pac-Islander', ' Male', 0,
       0, 60, ' South'], dtype=object)

In [68]:
# Maping data
X[:,categorical_cols] = encoder.fit_transform(X[:,categorical_cols])

array([54, 3.0, 180211, 15.0, 10, 2.0, 9.0, 0.0, 1.0, 1.0, 0, 0, 60, 34.0],
      dtype=object)

In [69]:
X

array([[50, 5.0, 83311, ..., 0, 13, 38.0],
       [38, 3.0, 215646, ..., 0, 40, 38.0],
       [53, 3.0, 234721, ..., 0, 40, 38.0],
       ...,
       [58, 3.0, 151910, ..., 0, 40, 38.0],
       [22, 3.0, 201490, ..., 0, 20, 38.0],
       [52, 4.0, 287927, ..., 0, 40, 38.0]], dtype=object)

In [70]:
y

array([0, 0, 0, ..., 0, 0, 1], dtype=object)

In [95]:
# Split data to train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.29, random_state=0, shuffle=True)

In [96]:
X_train = X_train.astype('int')
y_train = y_train.astype('int')
X_test = X_test.astype('int')
y_test = y_test.astype('int')

In [102]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

In [103]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))

print("Naive Bayes =====> Accuracy on the test: {0:.2f}%".format(accuracy_score(y_test, y_pred)*100))

Number of mislabeled points out of a total 9443 points : 1907
Naive Bayes =====> Accuracy on the test: 79.81%


In [104]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[6837  367]
 [1540  699]]
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      7204
           1       0.66      0.31      0.42      2239

    accuracy                           0.80      9443
   macro avg       0.74      0.63      0.65      9443
weighted avg       0.78      0.80      0.77      9443

