# Naive Bayes Algorithm

In [22]:
import pandas as pd
from NaiveBayes import NaiveBayes
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,recall_score,f1_score,precision_score
import numpy as np
from sklearn.linear_model import LogisticRegression
import random 

In [23]:
df = pd.read_csv("adult.data", header=None)
df.columns = ["age", "work-class", "fnlwgt", "education", "education-num", "maritial-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]

## Preprocessing the data

In [24]:
df = df.replace(" ?", np.nan)

In [25]:
df.isna().sum().sum()

4262

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1   work-class       30725 non-null  object
 2   fnlwgt           32561 non-null  int64 
 3   education        32561 non-null  object
 4   education-num    32561 non-null  int64 
 5   maritial-status  32561 non-null  object
 6   occupation       30718 non-null  object
 7   relationship     32561 non-null  object
 8   race             32561 non-null  object
 9   sex              32561 non-null  object
 10  capital-gain     32561 non-null  int64 
 11  capital-loss     32561 non-null  int64 
 12  hours-per-week   32561 non-null  int64 
 13  native-country   31978 non-null  object
 14  income           32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [27]:
def impute(df):
    for column in df.columns:
        if df[column].dtype == "object":
            df[column] = df[column].fillna(df[column].mode()[0])
        else:
            df[column] = df[column].fillna(df[column].mean())

In [28]:
impute(df)

In [29]:
df.isna().sum().sum()

0

## Training and testing the data

In [30]:
train_splits = []

In [31]:
for i in range(10):
    train, test = train_test_split(df, test_size=0.33, random_state=random.randint(0,100))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    train_splits.append((train,test))

In [32]:
results_list = []

In [33]:
for i,(train,test) in enumerate(train_splits):
    accuracies_dict = {}

    integer_columns = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
    ranges: dict[str, np.array] = dict()
    def convert_from_integer_to_range(df, columns, n_bins=100, _test=False):
        for column in columns:
            if not _test:
                ranges[column] = pd.cut(df[column], bins=n_bins, labels=False, retbins=True)[1]
            df[column] = pd.cut(df[column], bins=ranges[column], labels=False)
    convert_from_integer_to_range(train, integer_columns)
    naive = NaiveBayes(train, "income")
    convert_from_integer_to_range(test, integer_columns, _test=True)
    tp,tn,fp,fn = naive.accuracy(test)
    print("Accuracy for split {0} : {1}".format(i+1,(tp+tn)/(tp+tn+fp+fn)))
    accuracies_dict["index"] = f"Accuracy {i+1}"
    accuracies_dict["accuracy"]= (tp+tn)/(tp+tn+fp+fn)
    accuracies_dict['precision'] = (tp)/(tp+fp)
    accuracies_dict['recall'] = (tp)/(tp+fn)
    accuracies_dict['f1 score'] = (2*accuracies_dict['recall']*accuracies_dict['precision'])/(accuracies_dict['precision'] + accuracies_dict['recall'])
    results_list.append(accuracies_dict)


Accuracy for split 1 : 0.8342639121533594
Accuracy for split 2 : 0.8330541596873255
Accuracy for split 3 : 0.8347292015633724
Accuracy for split 4 : 0.8337986227433464
Accuracy for split 5 : 0.8334263912153359
Accuracy for split 6 : 0.8347292015633724
Accuracy for split 7 : 0.8355667225013959
Accuracy for split 8 : 0.8311930020472734
Accuracy for split 9 : 0.8293318444072213
Accuracy for split 10 : 0.8329611018053229


## KNN Model for the given data

In [35]:
categorical_columns = list(set(df.columns)-set(integer_columns))
categorical_columns.remove('income')
df = pd.get_dummies(df,columns=categorical_columns)
df

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,sex_ Female,sex_ Male,work-class_ Federal-gov,...,education_ 9th,education_ Assoc-acdm,education_ Assoc-voc,education_ Bachelors,education_ Doctorate,education_ HS-grad,education_ Masters,education_ Preschool,education_ Prof-school,education_ Some-college
0,39,77516,13,2174,0,40,<=50K,0,1,0,...,0,0,0,1,0,0,0,0,0,0
1,50,83311,13,0,0,13,<=50K,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,38,215646,9,0,0,40,<=50K,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,53,234721,7,0,0,40,<=50K,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,28,338409,13,0,0,40,<=50K,1,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,<=50K,1,0,0,...,0,1,0,0,0,0,0,0,0,0
32557,40,154374,9,0,0,40,>50K,0,1,0,...,0,0,0,0,0,1,0,0,0,0
32558,58,151910,9,0,0,40,<=50K,1,0,0,...,0,0,0,0,0,1,0,0,0,0
32559,22,201490,9,0,0,20,<=50K,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [36]:
train_splits = []

In [37]:
for i in range(10):
    train, test = train_test_split(df, test_size=0.33, random_state=random.randint(0,100))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    train_splits.append((train,test))

In [41]:
for i,(train,test) in enumerate(train_splits):
    X_train, y_train = train.drop(columns=["income"]).to_numpy(), train["income"].to_numpy()
    X_test, y_test = test.drop(columns=["income"]).to_numpy(), test["income"].to_numpy()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # print(X_test)
    knn = KNeighborsClassifier(n_neighbors=100)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
    results_list[i]['knn accuracy'] = accuracy_score(y_test, y_pred)
    results_list[i]['knn recall'] = recall_score(y_test,y_pred, average="binary", pos_label=' <=50K')
    results_list[i]['knn f1 score'] = f1_score(y_test,y_pred, average="binary", pos_label=' <=50K')
    results_list[i]['knn precision'] = precision_score(y_test,y_pred, average="binary", pos_label=' <=50K')

Model accuracy score: 0.8242
Model accuracy score: 0.8307
Model accuracy score: 0.8289
Model accuracy score: 0.8279
Model accuracy score: 0.8260
Model accuracy score: 0.8352
Model accuracy score: 0.8328
Model accuracy score: 0.8242
Model accuracy score: 0.8254
Model accuracy score: 0.8289


## Logistic Regression for the Given Model

In [42]:
model = LogisticRegression()

In [46]:
for i,(train,test) in enumerate(train_splits):
    X_train, y_train = train.drop(columns=["income"]).to_numpy(), train["income"].to_numpy()
    X_test, y_test = test.drop(columns=["income"]).to_numpy(), test["income"].to_numpy()
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(accuracy_score(y_test,y_pred))
    results_list[i]['logreg accuracy'] = accuracy_score(y_test, y_pred)
    results_list[i]['logreg recall'] = recall_score(y_test,y_pred,average='binary',pos_label=' <=50K')
    results_list[i]['logreg f1 score'] = f1_score(y_test,y_pred,pos_label=' <=50K')
    results_list[i]['logreg precision'] = precision_score(y_test,y_pred,pos_label=' <=50K')


0.8467336683417085
0.8507351572678206
0.8482225944537503
0.8494323469197841
0.8501768099758049
0.8529685464358832
0.8503629257398102
0.8467336683417085
0.8501768099758049
0.847571189279732


In [47]:
results = pd.DataFrame.from_records(results_list)
results

Unnamed: 0,index,accuracy,precision,recall,f1 score,knn accuracy,knn recall,knn f1 score,knn precision,logreg accuracy,logreg recall,logreg f1 score,logreg precision
0,Accuracy 1,0.834264,0.849219,0.925603,0.885767,0.824214,0.926364,0.888456,0.853528,0.846734,0.928703,0.90156,0.875958
1,Accuracy 2,0.833054,0.847732,0.924269,0.884348,0.830728,0.927313,0.892842,0.860843,0.850735,0.932085,0.904739,0.878952
2,Accuracy 3,0.834729,0.784245,0.623573,0.69474,0.828867,0.927703,0.891536,0.858084,0.848223,0.933104,0.903119,0.875
3,Accuracy 4,0.833799,0.84747,0.927998,0.885908,0.827936,0.916105,0.889999,0.86534,0.849432,0.92725,0.903461,0.880861
4,Accuracy 5,0.833426,0.800391,0.615038,0.695578,0.825982,0.932906,0.889987,0.850844,0.850177,0.935619,0.904064,0.874568
5,Accuracy 6,0.834729,0.800702,0.618746,0.698062,0.835194,0.924346,0.895188,0.867814,0.852969,0.927646,0.905728,0.884822
6,Accuracy 7,0.835567,0.791004,0.624235,0.697794,0.832775,0.927064,0.894288,0.86375,0.850363,0.931089,0.904717,0.879797
7,Accuracy 8,0.831193,0.843331,0.928955,0.884075,0.824214,0.926364,0.888456,0.853528,0.846734,0.928703,0.90156,0.875958
8,Accuracy 9,0.829332,0.841604,0.924677,0.881187,0.825423,0.922038,0.888968,0.858188,0.850177,0.934561,0.90436,0.87605
9,Accuracy 10,0.832961,0.797303,0.619832,0.697455,0.828867,0.929446,0.891919,0.857304,0.847571,0.931039,0.902732,0.876095


## NAIVE BAYES RESULTS

In [48]:
print(f"Mean for 10 splits is {results['accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['accuracy'].std()}")
print(f"Average precision for 10 splits is {results['precision'].mean()}")
print(f"Average recall for 10 splits is {results['recall'].mean()}")

Mean for 10 splits is 0.8333054159687325
Standard deviation for 10 splits is 0.0018492546977552783
Average precision for 10 splits is 0.8203001233198701
Average recall for 10 splits is 0.7732926315344654


## KNN RESULTS

In [49]:
print(f"Mean for 10 splits is {results['knn accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['knn accuracy'].std()}")
print(f"Average precision for 10 splits is {results['knn precision'].mean()}")
print(f"Average recall for 10 splits is {results['knn recall'].mean()}")

Mean for 10 splits is 0.8284198771635959
Standard deviation for 10 splits is 0.003668626604947496
Average precision for 10 splits is 0.8589223183272839
Average recall for 10 splits is 0.9259649632522343


## LOG REG RESULTS

In [50]:
print(f"Mean for 10 splits is {results['logreg accuracy'].mean()}")
print(f"Standard deviation for 10 splits is {results['logreg accuracy'].std()}")
print(f"Average precision for 10 splits is {results['logreg precision'].mean()}")
print(f"Average recall for 10 splits is {results['knn recall'].mean()}")

Mean for 10 splits is 0.8493113716731807
Standard deviation for 10 splits is 0.0019867125293160184
Average precision for 10 splits is 0.8778061197278836
Average recall for 10 splits is 0.9259649632522343
