In [32]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")
import random


In [33]:
Header = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sexl', 'capital-gain', 'capital-loss', 'hours-per-week','native-country','income']
data = pd.read_csv("adult.data", names = Header)
data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sexl,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [34]:
test = pd.read_csv("adult.test", names = Header)
test

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sexl,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K.
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [35]:
#Replace missing values with mode
def preprocess(data):
    print("Count of missing values before Preprocessing: " + str(data[data==' ?'].count().sum()))
    print("Columns containing missing values: " + str(data.columns[data.isin([' ?']).any()]))
    data['workclass'].replace(' ?', data['workclass'].mode()[0], inplace=True)
    data['occupation'].replace(' ?', data['occupation'].mode()[0], inplace=True)
    data['native-country'].replace(' ?', data['native-country'].mode()[0], inplace=True)
    print("Count of missing values after Preprocessing: "+str(data[data==' ?'].count().sum()))
    return data

In [36]:
#Remove rows with missing values
def preprocess2(data):
    data = data[data.workclass != ' ?']
    data = data[data.occupation != ' ?']
    data = data[data['native-country'] != ' ?']
    return data


In [37]:
data = preprocess(data)
test = preprocess(test)
print(data.shape)
print(test.shape)

Count of missing values before Preprocessing: 4262
Columns containing missing values: Index(['workclass', 'occupation', 'native-country'], dtype='object')
Count of missing values after Preprocessing: 0
Count of missing values before Preprocessing: 2203
Columns containing missing values: Index(['workclass', 'occupation', 'native-country'], dtype='object')
Count of missing values after Preprocessing: 0
(32561, 15)
(16281, 15)


In [38]:
def train_test(X, y, test_size=0.2, random_state=42):
    n_samples = X.shape[0]
    n_test = int(n_samples * test_size)
    if random_state is not None:
        random.seed(random_state)
    indices = list(range(n_samples))
    random.shuffle(indices)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]
    return X_train, X_test, y_train, y_test

In [39]:
X_train = data.drop('income', axis=1)
y_train = data['income']
X_test = test.drop('income', axis=1)
y_test = test['income']
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32561, 14)
(16281, 14)
(32561,)
(16281,)


In [40]:
for i in range(y_test.shape[0]):
    y_test.iloc[i] = y_test.iloc[i].replace('.', '')

# Naive Bayes Classifier

In [41]:
def prior_prob(y_train):
    n_samples = len(y_train)
    print("Number of samples: " + str(n_samples))
    prior = {}
    for c in np.unique(y_train):
        prior[c] = y_train[y_train == c].shape[0] / n_samples
    return prior

def conditional_prob(X_train, y_train, smoothing = False):
    n_samples, n_features = X_train.shape
    unique_classes = np.unique(y_train)
    conditional_probabilities = {}
    
    for col in X_train.columns:
        probabilities_c = {}
        for c in unique_classes:
            X_c = X_train[y_train == c]
            n_c = X_c.shape[0]
            if X_c[col].dtype == np.object:
                value_counts = X_c[col].value_counts()
                if smoothing:
                    probs = (value_counts + 1) / (n_c + len(value_counts))
                else:
                    probs = value_counts / n_c
                probabilities_c[c] = probs.to_dict()
            else:
                mean = X_c[col].mean()
                std = X_c[col].std()
                probabilities_c[c] = (mean, std)
        
        conditional_probabilities[col] = probabilities_c
    
    return conditional_probabilities

def predict(instance, conditional, y_train, prior):
    unique_classes = np.unique(y_train)
    prediction = ' '
    best_prob = -np.inf
    for c in unique_classes:
        prob = 0
        for col in instance.index:
            if type(instance[col]) == str:
                if(instance[col] in conditional[col][c]):
                    prob += np.log(conditional[col][c][instance[col]])
            else:
                mean, std = conditional[col][c]
                prob += np.log(norm.pdf(instance[col], mean, std))
        if prob + np.log(prior[c]) > best_prob:
            best_prob = prob + np.log(prior[c])
            prediction = c
    return prediction

def evaluate(X_test, conditional, prior):
    predictions = []
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i], conditional, y_train, prior)
        predictions.append(pred)
    return predictions

def metrics(predictions, y_test):
    num_examples = len(predictions)
    true_positives = sum(p == ' <=50K' and t == ' <=50K' for p, t in zip(predictions, y_test))
    false_positives = sum(p == ' <=50K' and t == ' >50K' for p, t in zip(predictions, y_test))
    true_negatives = sum(p == ' >50K' and t == ' >50K' for p, t in zip(predictions, y_test))
    false_negatives = sum(p == ' >50K' and t == ' <=50K' for p, t in zip(predictions, y_test))
    
    accuracy = (true_positives + true_negatives) / num_examples
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    print("Accuracy: " + str(100 * accuracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1 Score: " + str(f1_score))

In [42]:
prior = prior_prob(y_train)
print(prior)
conditional = conditional_prob(X_train, y_train, True)
print(conditional)

Number of samples: 32561
{' <=50K': 0.7591904425539756, ' >50K': 0.2408095574460244}
{'age': {' <=50K': (36.78373786407767, 14.02008849082488), ' >50K': (44.24984058155847, 10.519027719851826)}, 'workclass': {' <=50K': {' Private': 0.783686509220317, ' Self-emp-not-inc': 0.07351989647363313, ' Local-gov': 0.05972986088644452, ' State-gov': 0.03825622775800712, ' Federal-gov': 0.023859592364930442, ' Self-emp-inc': 0.02001779359430605, ' Without-pay': 0.0006065998058880621, ' Never-worked': 0.00032351989647363315}, ' >50K': {' Private': 0.6569389575634, ' Self-emp-not-inc': 0.09239199694150631, ' Self-emp-inc': 0.07939339875111508, ' Local-gov': 0.07875621256531158, ' Federal-gov': 0.04740665222377979, ' State-gov': 0.045112781954887216}}, 'fnlwgt': {' <=50K': (190340.8651699029, 106482.27119468106), ' >50K': (188005.0, 102541.77547230694)}, 'education': {' <=50K': {' HS-grad': 0.35684831824062097, ' Some-college': 0.23872089262613194, ' Bachelors': 0.12673835705045278, ' 11th': 0.04511

In [43]:
instance = X_test.iloc[1234, :]
print(predict(instance, conditional, y_train, prior))
print(y_test.iloc[1234])
print(predict(instance, conditional, y_train, prior) == y_test.iloc[1234])

 <=50K
 <=50K
True


In [45]:
print("Without smoothing")
prior = prior_prob(y_train)
conditional = conditional_prob(X_train, y_train, False)
predictions = evaluate(X_test, conditional, prior)
metrics(predictions, y_test)

Without smoothing
Number of samples: 32561
Accuracy: 82.93716602174314
Precision: 0.858170758845783
Recall: 0.9303578608765581
F1 Score: 0.8928075320265473


In [46]:
print("With smoothing")
prior = prior_prob(y_train)
conditional = conditional_prob(X_train, y_train, True)
predictions = evaluate(X_test, conditional, prior)
metrics(predictions, y_test)

With smoothing
Number of samples: 32561
Accuracy: 82.90031324857196
Precision: 0.8578951271972113
Recall: 0.9301970245275433
F1 Score: 0.8925843043444711


In [47]:
#implement label encoding for the categorical features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in data.columns:
    if data[col].dtype == np.object:
        data[col] = le.fit_transform(data[col])

X_train, X_test, y_train, y_test = train_test_split(data.drop(['income'], axis=1), data['income'], test_size=0.33, random_state=42)

In [48]:
def metrics_lr_knn(predictions, y_test):
    num_examples = len(predictions)
    true_positives = sum(p == 1 and t == 1 for p, t in zip(predictions, y_test))
    false_positives = sum(p == 1 and t == 0 for p, t in zip(predictions, y_test))
    true_negatives = sum(p == 0 and t == 0 for p, t in zip(predictions, y_test))
    false_negatives = sum(p == 0 and t == 1 for p, t in zip(predictions, y_test))
    
    accuracy = (true_positives + true_negatives) / num_examples
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    print("Accuracy: " + str(100 * accuracy))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("F1 Score: " + str(f1_score))

# Logistic Regression

In [49]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)
metrics_lr_knn(predictions, y_test)

Accuracy: 80.37409268565048
Precision: 0.743109151047409
Recall: 0.2643137254901961
F1 Score: 0.3899334683251374


# K Nearest Neighbors

In [52]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train_scaled, y_train)
predictions = knn.predict(X_test_scaled)
metrics_lr_knn(predictions, y_test)
# print('Accuracy of K-NN classifier on training set: {:.2f}'.format(knn.score(X_train_scaled, y_train)))
# print('Accuracy of K-NN classifier on test set: {:.2f}'.format(knn.score(X_test_scaled, y_test)))

Accuracy: 81.48148148148148
Precision: 0.6208981001727115
Recall: 0.563921568627451
F1 Score: 0.5910398684751337
