### GAUSSIAN NAIVE BAYES CLASSIFIER IMPLEMENTATION IN PYTHON

See the complete detail [here](http://dataaspirant.com/2017/02/20/gaussian-naive-bayes-classifier-implementation-python/).

In [140]:
# Required Python Machine learning Packages
import pandas as pd
import numpy as np

# For preprocessing the data
from sklearn.preprocessing import Imputer
from sklearn import preprocessing

# To split the dataset into train and test datasets
from sklearn.cross_validation import train_test_split

# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB

# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score

In [141]:
adult_df = pd.read_csv('data/adult.data', header = None, delimiter=' *, *', engine='python')

In [142]:
adult_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [143]:
adult_df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                    'marital_status', 'occupation', 'relationship',
                    'race', 'sex', 'capital_gain', 'capital_loss',
                    'hours_per_week', 'native_country', 'income']

In [144]:
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [145]:
adult_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [146]:
for value in ['workclass', 'education',
          'marital_status', 'occupation',
          'relationship','race', 'sex',
          'native_country', 'income']:
    temp = sum(adult_df[value] == '?')
    print(f'{value}: {temp}')

workclass: 1836
education: 0
marital_status: 0
occupation: 1843
relationship: 0
race: 0
sex: 0
native_country: 583
income: 0


In [147]:
adult_df_rev = adult_df
adult_df_rev.describe(include = 'all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [148]:
for value in ['workclass', 'education',
          'marital_status', 'occupation',
          'relationship','race', 'sex',
          'native_country', 'income']:
    adult_df_rev[value] = adult_df_rev[value].str.replace('?',
                                                          adult_df_rev.describe(include='all')[value][2])

In [149]:
le = preprocessing.LabelEncoder()

In [150]:
workclass_cat = le.fit_transform(adult_df.workclass)
education_cat = le.fit_transform(adult_df.education)
marital_cat   = le.fit_transform(adult_df.marital_status)
occupation_cat = le.fit_transform(adult_df.occupation)
relationship_cat = le.fit_transform(adult_df.relationship)
race_cat = le.fit_transform(adult_df.race)
sex_cat = le.fit_transform(adult_df.sex)
native_country_cat = le.fit_transform(adult_df.native_country)
print(workclass_cat)

[6 5 3 ... 3 3 4]


In [151]:
#initialize the encoded categorical columns
adult_df_rev['workclass_cat'] = workclass_cat
adult_df_rev['education_cat'] = education_cat
adult_df_rev['marital_cat'] = marital_cat
adult_df_rev['occupation_cat'] = occupation_cat
adult_df_rev['relationship_cat'] = relationship_cat
adult_df_rev['race_cat'] = race_cat
adult_df_rev['sex_cat'] = sex_cat
adult_df_rev['native_country_cat'] = native_country_cat

In [152]:
#drop the old categorical columns from dataframe
dummy_fields = ['workclass', 'education', 'marital_status', 
                  'occupation', 'relationship', 'race',
                  'sex', 'native_country']
adult_df_rev = adult_df_rev.drop(dummy_fields, axis = 1)

In [153]:
adult_df_rev = adult_df_rev.reindex_axis(['age', 'workclass_cat', 'fnlwgt', 'education_cat',
                                    'education_num', 'marital_cat', 'occupation_cat',
                                    'relationship_cat', 'race_cat', 'sex_cat', 'capital_gain',
                                    'capital_loss', 'hours_per_week', 'native_country_cat', 
                                    'income'], axis= 1)


  """


In [161]:
num_features = ['age', 'workclass_cat', 'fnlwgt', 'education_cat', 'education_num',
                'marital_cat', 'occupation_cat', 'relationship_cat', 'race_cat',
                'sex_cat', 'capital_gain', 'capital_loss', 'hours_per_week',
                'native_country_cat']

scaled_features = {}
for each in num_features:
    mean, std = adult_df_rev[each].mean(), adult_df_rev[each].std()
    scaled_features[each] = [mean, std]
    adult_df_rev.loc[:, each] = (adult_df_rev[each] - mean)/std

In [162]:
adult_df_rev.tail()

Unnamed: 0,age,workclass_cat,fnlwgt,education_cat,education_num,marital_cat,occupation_cat,relationship_cat,race_cat,sex_cat,capital_gain,capital_loss,hours_per_week,native_country_cat,income
32556,-0.849067,-0.085295,0.639731,-0.852192,0.746028,-0.406206,1.475378,2.211664,0.393661,-1.422309,-0.145918,-0.216656,-0.197406,0.261366,<=50K
32557,0.103982,-0.085295,-0.335428,0.181329,-0.420053,-0.406206,-0.034927,-0.900167,0.393661,0.703061,-0.145918,-0.216656,-0.035429,0.261366,>50K
32558,1.423588,-0.085295,-0.358772,0.181329,-0.420053,2.249446,-1.545232,1.589298,0.393661,-1.422309,-0.145918,-0.216656,-0.035429,0.261366,<=50K
32559,-1.215625,-0.085295,0.110958,0.181329,-0.420053,0.92162,-1.545232,0.966932,0.393661,0.703061,-0.145918,-0.216656,-1.655199,0.261366,<=50K
32560,0.983719,0.817889,0.929878,0.181329,-0.420053,-0.406206,-0.79008,2.211664,0.393661,-1.422309,1.888395,-0.216656,-0.035429,0.261366,>50K


In [163]:
features = adult_df_rev.values[:,:14]
target = adult_df_rev.values[:,14]
# features_train, features_test, target_train, target_test = train_test_split(features,
#                                                                             target, test_size = 0.33, random_state = 10)

In [165]:
clf = GaussianNB()
clf.fit(features, target)
target_pred = clf.predict([[58,3,151910,11,9,6,0,4,4,0,0,0,40,38]])
print(target_pred)

['<=50K']


In [158]:
accuracy_score(target_test, target_pred, normalize = True)

0.7589800856132515