In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib.colors import ListedColormap

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows",None)

import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv('sample/adult.csv')
dataset.head()

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
dataset.columns

Index(['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married',
       ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0',
       ' 40', ' United-States', ' <=50K'],
      dtype='object')

In [4]:
# Upon checking features have no label. Let us fix this first

dataset.rename(columns = {'39':'age', ' State-gov':'employed_at', ' 77516': 'final_weight', ' Bachelors': 'ed_degree', ' 13': 'ed_level',' Never-married':'marital_status',' Adm-clerical': 'job_title',' Not-in-family': 'marital_relation',' White': 'race',' Male':'Gender',' 2174':'capital_gain', ' 0':'capital_loss',' 40':'work_hours',' United-States':'Country', ' <=50K' : 'Anual_income'}, inplace = True)

In [5]:
dataset.columns

Index(['age', 'employed_at', 'final_weight', 'ed_degree', 'ed_level',
       'marital_status', 'job_title', 'marital_relation', 'race', 'Gender',
       'capital_gain', 'capital_loss', 'work_hours', 'Country',
       'Anual_income'],
      dtype='object')

In [6]:
dataset.head(10)

Unnamed: 0,age,employed_at,final_weight,ed_degree,ed_level,marital_status,job_title,marital_relation,race,Gender,capital_gain,capital_loss,work_hours,Country,Anual_income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
5,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
6,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
7,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
8,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
9,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K


In [7]:
# Summarise the dataset

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32560 entries, 0 to 32559
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age               32560 non-null  int64 
 1   employed_at       32560 non-null  object
 2   final_weight      32560 non-null  int64 
 3   ed_degree         32560 non-null  object
 4   ed_level          32560 non-null  int64 
 5   marital_status    32560 non-null  object
 6   job_title         32560 non-null  object
 7   marital_relation  32560 non-null  object
 8   race              32560 non-null  object
 9   Gender            32560 non-null  object
 10  capital_gain      32560 non-null  int64 
 11  capital_loss      32560 non-null  int64 
 12  work_hours        32560 non-null  int64 
 13  Country           32560 non-null  object
 14  Anual_income      32560 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
# Let's sort categorical and numerical variables

categorical = [cat_var for cat_var in dataset.columns if dataset[cat_var].dtype == 'O']

print(categorical)

['employed_at', 'ed_degree', 'marital_status', 'job_title', 'marital_relation', 'race', 'Gender', 'Country', 'Anual_income']


In [9]:
numerical = [num_var for num_var in dataset.columns if dataset[num_var].dtype != 'O']

print(numerical)

['age', 'final_weight', 'ed_level', 'capital_gain', 'capital_loss', 'work_hours']


In [10]:
dataset[categorical].head(25)

Unnamed: 0,employed_at,ed_degree,marital_status,job_title,marital_relation,race,Gender,Country,Anual_income
0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
1,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
2,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
3,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
4,Private,Masters,Married-civ-spouse,Exec-managerial,Wife,White,Female,United-States,<=50K
5,Private,9th,Married-spouse-absent,Other-service,Not-in-family,Black,Female,Jamaica,<=50K
6,Self-emp-not-inc,HS-grad,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
7,Private,Masters,Never-married,Prof-specialty,Not-in-family,White,Female,United-States,>50K
8,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,>50K
9,Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,Black,Male,United-States,>50K


In [11]:
dataset[categorical].isnull().sum()

employed_at         0
ed_degree           0
marital_status      0
job_title           0
marital_relation    0
race                0
Gender              0
Country             0
Anual_income        0
dtype: int64

In [12]:
# check records in each freuqncy distribution

dataset['employed_at'].value_counts()


 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: employed_at, dtype: int64

In [13]:
dataset['ed_degree'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5354
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: ed_degree, dtype: int64

In [14]:
dataset['marital_status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10682
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital_status, dtype: int64

In [15]:
dataset['job_title'].value_counts()


 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 ?                    1843
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: job_title, dtype: int64

In [16]:
dataset['marital_relation'].value_counts()


 Husband           13193
 Not-in-family      8304
 Own-child          5068
 Unmarried          3446
 Wife               1568
 Other-relative      981
Name: marital_relation, dtype: int64

In [17]:
dataset['race'].value_counts()


 White                 27815
 Black                  3124
 Asian-Pac-Islander     1039
 Amer-Indian-Eskimo      311
 Other                   271
Name: race, dtype: int64

In [18]:
dataset['Gender'].value_counts()


 Male      21789
 Female    10771
Name: Gender, dtype: int64

In [19]:
dataset['Country'].value_counts()


 United-States                 29169
 Mexico                          643
 ?                               583
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 

In [20]:
dataset['Anual_income'].value_counts()

 <=50K    24719
 >50K      7841
Name: Anual_income, dtype: int64

We have a strange records - ? in features, 'employed_at', 'job_title', 'country'. since ? is not a null variable, let us first replace the ? with null value.

In [21]:
dataset['employed_at'].unique()

array([' Self-emp-not-inc', ' Private', ' State-gov', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [22]:
dataset['employed_at'].replace(' ?',np.NaN, inplace = True)

In [23]:
dataset['employed_at'].value_counts()

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 State-gov            1297
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: employed_at, dtype: int64

In [24]:
dataset['job_title'].unique()

array([' Exec-managerial', ' Handlers-cleaners', ' Prof-specialty',
       ' Other-service', ' Adm-clerical', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [25]:
dataset['job_title'].replace(' ?',np.NaN, inplace = True)

In [26]:
dataset['job_title'].value_counts()

 Prof-specialty       4140
 Craft-repair         4099
 Exec-managerial      4066
 Adm-clerical         3769
 Sales                3650
 Other-service        3295
 Machine-op-inspct    2002
 Transport-moving     1597
 Handlers-cleaners    1370
 Farming-fishing       994
 Tech-support          928
 Protective-serv       649
 Priv-house-serv       149
 Armed-Forces            9
Name: job_title, dtype: int64

In [27]:
dataset['Country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [28]:
dataset['Country'].replace(' ?', np.NaN, inplace = True)

In [29]:
dataset['Country'].value_counts()

 United-States                 29169
 Mexico                          643
 Philippines                     198
 Germany                         137
 Canada                          121
 Puerto-Rico                     114
 El-Salvador                     106
 India                           100
 Cuba                             95
 England                          90
 Jamaica                          81
 South                            80
 China                            75
 Italy                            73
 Dominican-Republic               70
 Vietnam                          67
 Guatemala                        64
 Japan                            62
 Poland                           60
 Columbia                         59
 Taiwan                           51
 Haiti                            44
 Iran                             43
 Portugal                         37
 Nicaragua                        34
 Peru                             31
 Greece                           29
 

In [30]:
dataset[categorical].isnull().sum()

employed_at         1836
ed_degree              0
marital_status         0
job_title           1843
marital_relation       0
race                   0
Gender                 0
Country              583
Anual_income           0
dtype: int64

In [31]:
# Now that we have worked on categoricals, let us proceed to handle numerical features

In [32]:
dataset[numerical].head(25)

Unnamed: 0,age,final_weight,ed_level,capital_gain,capital_loss,work_hours
0,50,83311,13,0,0,13
1,38,215646,9,0,0,40
2,53,234721,7,0,0,40
3,28,338409,13,0,0,40
4,37,284582,14,0,0,40
5,49,160187,5,0,0,16
6,52,209642,9,0,0,45
7,31,45781,14,14084,0,50
8,42,159449,13,5178,0,40
9,37,280464,10,0,0,80


In [33]:
print(dataset['age'].value_counts())
print(dataset['age'].unique())

36    898
31    888
34    886
23    877
35    876
33    875
28    867
30    861
37    858
25    841
27    835
32    828
38    827
39    815
29    813
41    808
24    798
40    794
26    785
42    780
43    770
22    765
20    753
46    737
45    734
44    724
21    720
19    712
47    708
50    602
51    595
49    577
18    550
48    543
52    478
53    464
55    419
54    415
17    395
58    366
56    366
57    358
59    355
60    312
61    300
62    258
63    230
64    208
65    178
67    151
66    150
68    120
69    108
70     89
71     72
72     67
73     64
74     51
76     46
75     45
90     43
77     29
78     23
80     22
79     22
81     20
82     12
84     10
83      6
85      3
88      3
87      1
86      1
Name: age, dtype: int64
[50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 39 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]


In [34]:
print(dataset['final_weight'].value_counts())
print(dataset['final_weight'].unique())

123011     13
203488     13
164190     13
121124     12
148995     12
113364     12
126675     12
123983     11
120131     11
190290     11
241998     11
126569     11
188246     11
120277     11
155659     11
111483     11
102308     11
125933     10
125461     10
177675     10
99185      10
125892     10
112497     10
155489     10
174789     10
193882     10
216129     10
117963     10
186934     10
119793     10
194630     10
124963      9
175262      9
194901      9
202872      9
221172      9
214542      9
118551      9
129573      9
112847      9
116632      9
218490      9
82393       9
200471      9
111567      9
202027      8
147258      8
150533      8
340917      8
113324      8
163003      8
176185      8
199058      8
210781      8
163665      8
176683      8
161141      8
144778      8
144949      8
119156      8
185385      8
210736      8
157747      8
99146       8
132879      8
111128      8
213140      8
151089      8
184655      8
172538      8
104501      8
108435

In [35]:
print(dataset['ed_level'].value_counts())
print(dataset['ed_level'].unique())

9     10501
10     7291
13     5354
14     1723
11     1382
7      1175
12     1067
6       933
4       646
15      576
5       514
8       433
16      413
3       333
2       168
1        51
Name: ed_level, dtype: int64
[13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]


In [36]:
print(dataset['capital_gain'].value_counts())
print(dataset['capital_gain'].unique())

0        29849
15024      347
7688       284
7298       246
99999      159
5178        97
3103        97
4386        70
5013        69
8614        55
3325        53
2174        47
10520       43
4064        42
4650        41
14084       41
20051       37
3137        37
27828       34
594         34
3908        32
2829        31
13550       27
6849        27
14344       26
1055        25
2885        24
3411        24
4787        23
2176        23
3464        23
9386        22
2597        20
4101        20
2407        19
4865        17
2202        16
1506        15
3942        14
3674        14
4508        12
4416        12
3781        12
2580        12
10605       12
2907        11
25236       11
5455        11
6497        11
2354        11
2635        11
2463        11
2964         9
2105         9
6418         9
7430         9
2414         8
914          8
2977         8
1151         8
3471         8
4934         7
1471         7
1831         7
1797         7
3818         7
1409      

In [37]:
print(dataset['capital_loss'].value_counts())
print(dataset['capital_loss'].unique())

0       31041
1902      202
1977      168
1887      159
1848       51
1485       51
2415       49
1602       47
1740       42
1590       40
1876       39
1672       34
1564       25
2258       25
1669       24
1741       24
2001       24
1980       23
1719       22
2002       21
2051       21
1408       21
1579       20
2377       20
1721       18
1504       18
1974       18
2339       17
2179       15
1628       15
1762       14
2444       12
2559       12
625        12
2824       10
2042        9
1617        9
2205        9
1651        9
2392        9
1594        8
1340        7
1380        7
1092        7
2174        7
1573        6
880         6
2246        6
2057        6
2206        6
2603        5
1668        4
1825        4
1258        4
2547        4
1726        4
213         4
2457        3
2129        3
653         3
2231        3
419         3
323         3
4356        3
2267        3
3683        2
1755        2
2352        2
1648        2
1138        2
810         2
1735  

In [38]:
print(dataset['work_hours'].value_counts())
print(dataset['work_hours'].unique())

40    15216
50     2819
45     1824
60     1475
35     1297
20     1224
30     1149
55      694
25      674
48      517
38      476
15      404
70      291
10      278
32      266
24      252
65      244
36      220
42      219
44      212
16      205
12      173
43      151
37      149
8       145
52      138
80      133
56       97
28       86
99       85
46       82
18       75
72       71
75       66
6        64
5        60
4        54
47       49
84       45
22       44
54       41
3        39
33       39
39       38
41       36
14       34
2        32
27       30
26       30
17       29
49       29
90       29
58       28
34       28
7        26
53       25
21       24
13       23
23       21
1        20
62       18
9        18
66       17
57       17
19       14
64       14
51       13
85       13
68       12
98       11
11       11
63       10
78        8
29        7
77        6
59        5
31        5
96        5
67        4
91        3
76        3
81        3
73        2
89  

In [39]:
# Assign a feature vector and target variable

x = dataset.drop(columns = ['Anual_income'], axis = 1)

In [40]:
y = dataset['Anual_income']

In [41]:
x.dtypes

age                  int64
employed_at         object
final_weight         int64
ed_degree           object
ed_level             int64
marital_status      object
job_title           object
marital_relation    object
race                object
Gender              object
capital_gain         int64
capital_loss         int64
work_hours           int64
Country             object
dtype: object

In [42]:
x.isnull().sum()

age                    0
employed_at         1836
final_weight           0
ed_degree              0
ed_level               0
marital_status         0
job_title           1843
marital_relation       0
race                   0
Gender                 0
capital_gain           0
capital_loss           0
work_hours             0
Country              583
dtype: int64

In [43]:
categoricals = [col for col in x.columns if x[col].dtypes == 'O']
categoricals

['employed_at',
 'ed_degree',
 'marital_status',
 'job_title',
 'marital_relation',
 'race',
 'Gender',
 'Country']

In [44]:
numericals = [col for col in x.columns if x[col].dtypes != 'O']
numericals

['age',
 'final_weight',
 'ed_level',
 'capital_gain',
 'capital_loss',
 'work_hours']

In [45]:
# handle missing values in categoricals 
for col in categoricals:
    if x[col].isnull().mean() > 0:
        print(col, x[col].isnull().mean())

employed_at 0.05638820638820639
job_title 0.0566031941031941
Country 0.017905405405405406


In [46]:
# imputation of categoricals

for data in [x]:
    data['employed_at'].fillna(x['employed_at'].mode()[0], inplace = True)
    data['job_title'].fillna(x['job_title'].mode()[0], inplace = True)
    data['Country'].fillna(x['Country'].mode()[0], inplace = True)

In [47]:
x.isnull().sum()

age                 0
employed_at         0
final_weight        0
ed_degree           0
ed_level            0
marital_status      0
job_title           0
marital_relation    0
race                0
Gender              0
capital_gain        0
capital_loss        0
work_hours          0
Country             0
dtype: int64

In [48]:
x.shape

(32560, 14)

In [49]:
x.dtypes == 'O'

age                 False
employed_at          True
final_weight        False
ed_degree            True
ed_level            False
marital_status       True
job_title            True
marital_relation     True
race                 True
Gender               True
capital_gain        False
capital_loss        False
work_hours          False
Country              True
dtype: bool

from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.compose import ColumnTransformer as CT

ColTrns = CT([('encoder',OHE(),[1,3,5,6,7,8,9,13])], remainder = 'passthrough')
x = np.array(ColTrns.fit_transform(x))

print(x)

In [50]:
pip install --upgrade category_encoders

Requirement already up-to-date: category_encoders in c:\anaconda_navigator\lib\site-packages (2.2.2)
Note: you may need to restart the kernel to use updated packages.


In [51]:
import category_encoders as ce
encoder = ce.OneHotEncoder(cols = ['employed_at','ed_degree','marital_status','job_title','marital_relation', 'race','Gender', 'Country'])

In [52]:
x = encoder.fit_transform(x)

In [53]:
x.head(10)

Unnamed: 0,age,employed_at_1,employed_at_2,employed_at_3,employed_at_4,employed_at_5,employed_at_6,employed_at_7,employed_at_8,final_weight,ed_degree_1,ed_degree_2,ed_degree_3,ed_degree_4,ed_degree_5,ed_degree_6,ed_degree_7,ed_degree_8,ed_degree_9,ed_degree_10,ed_degree_11,ed_degree_12,ed_degree_13,ed_degree_14,ed_degree_15,ed_degree_16,ed_level,marital_status_1,marital_status_2,marital_status_3,marital_status_4,marital_status_5,marital_status_6,marital_status_7,job_title_1,job_title_2,job_title_3,job_title_4,job_title_5,job_title_6,job_title_7,job_title_8,job_title_9,job_title_10,job_title_11,job_title_12,job_title_13,job_title_14,marital_relation_1,marital_relation_2,marital_relation_3,marital_relation_4,marital_relation_5,marital_relation_6,race_1,race_2,race_3,race_4,race_5,Gender_1,Gender_2,capital_gain,capital_loss,work_hours,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7,Country_8,Country_9,Country_10,Country_11,Country_12,Country_13,Country_14,Country_15,Country_16,Country_17,Country_18,Country_19,Country_20,Country_21,Country_22,Country_23,Country_24,Country_25,Country_26,Country_27,Country_28,Country_29,Country_30,Country_31,Country_32,Country_33,Country_34,Country_35,Country_36,Country_37,Country_38,Country_39,Country_40,Country_41
0,50,1,0,0,0,0,0,0,0,83311,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,13,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,38,0,1,0,0,0,0,0,0,215646,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,53,0,1,0,0,0,0,0,0,234721,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,7,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,28,0,1,0,0,0,0,0,0,338409,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,40,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,37,0,1,0,0,0,0,0,0,284582,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,14,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,49,0,1,0,0,0,0,0,0,160187,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,16,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,52,1,0,0,0,0,0,0,0,209642,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,45,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,31,0,1,0,0,0,0,0,0,45781,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,14,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,14084,0,50,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,42,0,1,0,0,0,0,0,0,159449,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,5178,0,40,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,37,0,1,0,0,0,0,0,0,280464,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,10,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,80,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [54]:
x.shape

(32560, 105)

In [55]:
# Feature Scaling

In [56]:
cols = x.columns

In [57]:
from sklearn.preprocessing import RobustScaler as RS
scaler = RS()
x = scaler.fit_transform(x)

In [58]:
x

array([[ 0.65,  1.  , -1.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.05,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.8 ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       ...,
       [ 1.05,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [-0.75,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
       [ 0.75,  0.  , -1.  , ...,  0.  ,  0.  ,  0.  ]])

In [59]:
type(x)

numpy.ndarray

In [60]:
x=pd.DataFrame(x, columns = [cols])

In [61]:
x.head(10)

Unnamed: 0,age,employed_at_1,employed_at_2,employed_at_3,employed_at_4,employed_at_5,employed_at_6,employed_at_7,employed_at_8,final_weight,ed_degree_1,ed_degree_2,ed_degree_3,ed_degree_4,ed_degree_5,ed_degree_6,ed_degree_7,ed_degree_8,ed_degree_9,ed_degree_10,ed_degree_11,ed_degree_12,ed_degree_13,ed_degree_14,ed_degree_15,ed_degree_16,ed_level,marital_status_1,marital_status_2,marital_status_3,marital_status_4,marital_status_5,marital_status_6,marital_status_7,job_title_1,job_title_2,job_title_3,job_title_4,job_title_5,job_title_6,job_title_7,job_title_8,job_title_9,job_title_10,job_title_11,job_title_12,job_title_13,job_title_14,marital_relation_1,marital_relation_2,marital_relation_3,marital_relation_4,marital_relation_5,marital_relation_6,race_1,race_2,race_3,race_4,race_5,Gender_1,Gender_2,capital_gain,capital_loss,work_hours,Country_1,Country_2,Country_3,Country_4,Country_5,Country_6,Country_7,Country_8,Country_9,Country_10,Country_11,Country_12,Country_13,Country_14,Country_15,Country_16,Country_17,Country_18,Country_19,Country_20,Country_21,Country_22,Country_23,Country_24,Country_25,Country_26,Country_27,Country_28,Country_29,Country_30,Country_31,Country_32,Country_33,Country_34,Country_35,Country_36,Country_37,Country_38,Country_39,Country_40,Country_41
0,0.65,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.797262,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.312717,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.472711,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.342409,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.890927,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.152454,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.666667,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,-4.8,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.75,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.262357,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.112051,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.333333,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,14084.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.158644,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5178.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.856387,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
type(x)

pandas.core.frame.DataFrame

In [63]:
from sklearn.model_selection import train_test_split as ttst
x1 = x.copy()
y1 = y.copy()

In [64]:
x1_train, x1_test, y1_train,y1_test = ttst(x,y, test_size = 0.35, random_state = 0)


In [65]:
x1_train.shape

(21164, 105)

In [66]:
x1_test.shape

(11396, 105)

In [67]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(x1_train, y1_train)

GaussianNB()

In [68]:
ygnb_predict = gnb.predict(x1_test)
ygnb_predict

array([' >50K', ' <=50K', ' <=50K', ..., ' >50K', ' >50K', ' >50K'],
      dtype='<U6')

In [69]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y1_test, ygnb_predict)
cm

array([[7058, 1646],
       [ 519, 2173]], dtype=int64)

In [70]:
acs = accuracy_score(y1_test, ygnb_predict)
acs

0.81002106002106

In [71]:
ygnb_trainpredict = gnb.predict(x1_train)

In [72]:
acs2 = accuracy_score(y1_train, ygnb_trainpredict)
acs2

0.8036760536760537

In [73]:
cm2 = confusion_matrix(y1_train, ygnb_trainpredict)
cm2

array([[12913,  3102],
       [ 1053,  4096]], dtype=int64)