In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv("/content/adult.csv", header = None, delimiter=' *, *', engine='python')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
data.columns

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype='int64')

In [4]:
# Adding headers to the dataframe
data.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship',
                    'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

In [5]:
len(data)

32561

In [6]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

The above output shows that there is no “null” value in our dataset.

Let’s try to test whether any categorical attribute contains a “?” in it or not. At times there exists “?” or ” ” in place of missing values. Using the below code snippet we are going to test whether adult_df data frame consists of categorical variables with values as “?”.

In [7]:
for value in ['workclass','education','marital_status','occupation','relationship','race','sex','native_country','income']:
    print(value,":", sum(data[value] == '?'))

workclass : 1836
education : 0
marital_status : 0
occupation : 1843
relationship : 0
race : 0
sex : 0
native_country : 583
income : 0


The output of the above code snippet shows that there are 1836 missing values in workclass attribute. 1843 missing values in occupation attribute and 583 values in native_country attribute.

Data preprocessing
For preprocessing, we are going to make a duplicate copy of our original dataframe.We are duplicating adult_df to adult_df_rev dataframe. Observe that we have used deep copy while copying.

In [8]:
df=data.copy(deep=True)
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [9]:
df.describe(include= 'all')

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


By using deep=True, you ensure that the new DataFrame (df in this case) is completely independent of the original DataFrame (data).

In [10]:
for value in ['workclass','education','marital_status','occupation','relationship','race','sex','native_country','income']:
    replaceValue = df.describe(include='all')[value][2]
    df[value][df[value]=='?'] = replaceValue

Replacement Value Extraction:

replaceValue = df.describe(include='all')[value][2]: This line extracts the replacement value from the descriptive statistics of the column using the describe() method. describe(include='all') computes summary statistics for all columns, including non-numeric ones. [value][2] accesses the third row of the statistics DataFrame, which typically represents the most frequent value in the column (top in the case of categorical columns).
Replacing Missing Values:

df[value][df[value]=='?'] = replaceValue: This line replaces the missing values represented by '?' in the specified column (value) of the DataFrame df with the extracted replacement value (replaceValue). It uses boolean indexing to identify rows where the value is '?', and then assigns the replacement value to those rows..

In [11]:
# Encoding
from sklearn import preprocessing
# To split the dataset into train and test datasets
from sklearn.model_selection import train_test_split
# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score

In [12]:
#Hot Encoding
le=preprocessing.LabelEncoder()
workclass_category=le.fit_transform(data.workclass)
education_category = le.fit_transform(data.education)
marital_category   = le.fit_transform(data.marital_status)
occupation_category = le.fit_transform(data.occupation)
relationship_category = le.fit_transform(data.relationship)
race_category = le.fit_transform(data.race)
sex_category = le.fit_transform(data.sex)
native_country_category = le.fit_transform(data.native_country)

In [13]:
#initialize the encoded categorical columns
df['workclass_category'] = workclass_category
df['education_category'] = education_category
df['marital_category'] = marital_category
df['occupation_category'] = occupation_category
df['relationship_category'] = relationship_category
df['race_category'] = race_category
df['sex_category'] = sex_category
df['native_country_category'] = native_country_category

In [14]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,...,native_country,income,workclass_category,education_category,marital_category,occupation_category,relationship_category,race_category,sex_category,native_country_category
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,United-States,<=50K,7,9,4,1,1,4,1,39
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,United-States,<=50K,6,9,2,4,0,4,1,39
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,United-States,<=50K,4,11,0,6,1,4,1,39
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,United-States,<=50K,4,1,2,6,0,2,1,39
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,Cuba,<=50K,4,9,2,10,5,2,0,5


In [15]:
#drop the old categorical columns from dataframe
dummy_fields = ['workclass','education','marital_status','occupation','relationship','race', 'sex', 'native_country']
df = df.drop(dummy_fields, axis = 1)

In [16]:
df = df.reindex(['age', 'workclass_category', 'fnlwgt', 'education_category',
                                    'education_num', 'marital_category', 'occupation_category',
                                    'relationship_category', 'race_category', 'sex_category', 'capital_gain',
                                    'capital_loss', 'hours_per_week', 'native_country_category',
                                    'income'], axis= 1)
df.head(5)

Unnamed: 0,age,workclass_category,fnlwgt,education_category,education_num,marital_category,occupation_category,relationship_category,race_category,sex_category,capital_gain,capital_loss,hours_per_week,native_country_category,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,<=50K


Reindexing Columns:

df.reindex(['age', 'workclass_category', 'fnlwgt', 'education_category', ..., 'income'], axis=1): This line reindexes the columns of the DataFrame df according to the provided list of column names. The axis=1 parameter indicates that reindexing should be performed along columns.
New Column Order:

The list ['age', 'workclass_category', 'fnlwgt', 'education_category', ..., 'income'] specifies the new order of columns in the DataFrame. Each column name represents a feature or variable in the dataset.
Result:

After reindexing, the DataFrame df will have its columns arranged in the specified order.
Head Display:

df.head(5): This line displays the first 5 rows of the DataFrame df after reindexing, allowing you to observe the new column order.

In [17]:
# Data Slicing

In [18]:
x = df.values[:,:14]
y = df.values[:,14]

In [19]:
x

array([[39, 7, 77516, ..., 0, 40, 39],
       [50, 6, 83311, ..., 0, 13, 39],
       [38, 4, 215646, ..., 0, 40, 39],
       ...,
       [58, 4, 151910, ..., 0, 40, 39],
       [22, 4, 201490, ..., 0, 20, 39],
       [52, 5, 287927, ..., 0, 40, 39]], dtype=object)

In [20]:
y

array(['<=50K', '<=50K', '<=50K', ..., '<=50K', '<=50K', '>50K'],
      dtype=object)

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 5)

In [22]:
clf=GaussianNB()
clf.fit(x_train,y_train)

In [23]:
y_pred=clf.predict(x_test)
y_pred

array(['>50K', '<=50K', '<=50K', ..., '>50K', '<=50K', '<=50K'],
      dtype='<U5')

In [28]:
accuracy_score(y_test,y_pred)

0.7903205994349588