In [242]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler

## Import Modules

In [243]:
# !pip3 install pandas
# !pip install scikit-learn

## Data Cleaning

In [244]:
def clean_data(path):
    """
        Reads the data, getting rid of extra whitespace after comma
    """
    df = pd.read_csv(
                        path,
                        index_col = None,
                        sep = ', ',
                        names = [  'age', 'work_class', 'id', 'education', 
                                    'education_class', 'marital_status', 
                                    'occupation', 'relationship', 'race', 
                                    'sex', 'capital_loss', 'capital_gain', 
                                    'hours_per_week', 'native_country', 'salary_class'
                                ]
                            )

    return df

In [245]:
data_train = clean_data('adult.data')
data_test = clean_data('adult.test')

  df = pd.read_csv(
  df = pd.read_csv(


In [246]:
data_train.head()

Unnamed: 0,age,work_class,id,education,education_class,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [247]:
data_test.head()

Unnamed: 0,age,work_class,id,education,education_class,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [248]:
def drop_na(df):
    """
        Drops null values from the dataframe
        as they don't contribute to the model
    """

    df.loc[df['work_class'] == '?'] = np.nan
    df.loc[df['occupation'] == '?'] = np.nan
    df.loc[df['native_country'] == '?'] = np.nan

    df.dropna(axis = 0, how = 'all', inplace = True)

    return df

In [249]:
clean_data = data_train.copy()
clean_data_test = data_test.copy()

In [250]:
clean_data = drop_na(clean_data)
clean_data_test = drop_na(clean_data_test)

In [251]:
def drop_dup(df):
    """
        Drops duplicate values from the dataframe
        as they also don't contribute to the model
    """

    df = df.drop_duplicates(keep = 'first')

    return df

In [252]:
clean_data = drop_dup(clean_data)
clean_data_test = drop_dup(clean_data_test)

In [253]:
salary_data = clean_data[[
        'age', 'work_class', 'education', 'education_class', 
        'marital_status', 'occupation', 'relationship', 
        'race', 'sex', 'capital_loss', 'capital_gain', 
        'hours_per_week', 'native_country', 'salary_class'
    ]]

salary_data_test = clean_data_test[[
        'age', 'work_class', 'education', 'education_class', 
        'marital_status', 'occupation', 'relationship', 
        'race', 'sex', 'capital_loss', 'capital_gain', 
        'hours_per_week', 'native_country', 'salary_class'
    ]]

In [254]:
pre_processed = salary_data.copy()
pre_processed_test = salary_data_test.copy()

In [255]:
pre_processed.head()

Unnamed: 0,age,work_class,education,education_class,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,39.0,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [256]:
pre_processed_test.head()

Unnamed: 0,age,work_class,education,education_class,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,25.0,Private,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
1,38.0,Private,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
2,28.0,Local-gov,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
3,44.0,Private,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,34.0,Private,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K.


## Data Pre-Processing

In [257]:
def label_encode(df):
    """
        Applies Label Encoding to categorical columns for Naive Bayes preprocessing.
    """

    categorical_columns = ['work_class', 'education', 'marital_status', 
                           'occupation', 'relationship', 'race', 'native_country']

    label_encoders = {}

    for column in categorical_columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

    return df, label_encoders

In [258]:
def binary_encode(df):
    """
        Converts the binary columns in the DataFrame.
    """
    lb = LabelBinarizer()

    df['sex'] = lb.fit_transform(df['sex'])
    df['salary_class'] = lb.fit_transform(df['salary_class'])

    return df

In [259]:
def std_scaler(df):
    """
        Performs minmax scaling on capital loss and gains columns
    """

    scaler = StandardScaler()
    df[['capital_gain', 'capital_loss']] = scaler.fit_transform(df[['capital_gain', 'capital_loss']])

    return df

In [260]:
def naive_bayes_preprocessing(df):
    """
    Combining all preprocessing functions into one
    """
    
    df, label_encoders = label_encode(df)
    
    df = binary_encode(df)

    df = df.drop(columns = ['education_class'])

    # df = std_scaler(df)
    
    return df, label_encoders

In [261]:
pre_processed, lbe = naive_bayes_preprocessing(pre_processed)
pre_processed_test, lbe_test = naive_bayes_preprocessing(pre_processed_test)

In [262]:
pre_processed.head()

Unnamed: 0,age,work_class,education,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,39.0,5,9,4,0,1,4,1,2174.0,0.0,40.0,37,0
1,50.0,4,9,2,3,0,4,1,0.0,0.0,13.0,37,0
2,38.0,2,11,0,5,1,4,1,0.0,0.0,40.0,37,0
3,53.0,2,1,2,5,0,2,1,0.0,0.0,40.0,37,0
4,28.0,2,9,2,9,5,2,0,0.0,0.0,40.0,4,0


In [263]:
pre_processed_test.head()

Unnamed: 0,age,work_class,education,marital_status,occupation,relationship,race,sex,capital_loss,capital_gain,hours_per_week,native_country,salary_class
0,25.0,2,1,4,6,3,2,1,0.0,0.0,40.0,37,0
1,38.0,2,11,2,4,0,4,1,0.0,0.0,50.0,37,0
2,28.0,1,7,2,10,0,4,1,0.0,0.0,40.0,37,1
3,44.0,2,15,2,6,0,2,1,7688.0,0.0,40.0,37,1
5,34.0,2,0,4,7,1,4,1,0.0,0.0,30.0,37,0


In [264]:
X_train = pre_processed.drop(columns = ['salary_class'])
y_train = pre_processed['salary_class']

X_test = pre_processed_test.drop(columns = ['salary_class'])
y_test = pre_processed_test['salary_class']


nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[10779   576]
 [ 2564  1136]]

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.95      0.87     11355
           1       0.66      0.31      0.42      3700

    accuracy                           0.79     15055
   macro avg       0.74      0.63      0.65     15055
weighted avg       0.77      0.79      0.76     15055

