In [318]:
import pandas as pd
import numpy as np
import random

### Task 1 - Data Processing

In [319]:
#Feature Engineering Task 1
def Engineering_Task1(data):
    # Iterate through each column
    for col in data.columns:
        # Check if the column contains missing values
        if data[col].isnull().sum() > 0:
            # If the column is categorical, impute with the most frequent value
            if data[col].dtype == 'object':
                data[col].fillna(data[col].value_counts().index[0], inplace=True)

        # If the column is numerical, impute with the mean value
            else:
                data[col].fillna(data[col].mean(), inplace=True)

# Save the imputed dataset
    data.to_csv('imputed_data.csv', index=False)

In [320]:
data=pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
              names=["age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","Salary"])

In [337]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [322]:
from sklearn.preprocessing import LabelEncoder

# Convert the target variable to numerical values using label encoding
le = LabelEncoder()

Engineering_Task1(data)
X = data.drop(['Salary'], axis=1).values
y = data['Salary'].values
# print(X)
# print(y)
# y[y==' <=50K']=0
# y[y==' >50K']=1
# print([y==0].sum)
# assume X and y are the feature matrix and target vector, respectively

# shuffle the indices of the data
indices = list(range(len(X)))
random.shuffle(indices)

# calculate the split point
split_idx = int(0.67 * len(X))


# split the data into training and testing sets
train_X = X[indices[:split_idx]]
train_y = y[indices[:split_idx]]
test_X = X[indices[split_idx:]]
test_y = y[indices[split_idx:]]

train_y = le.fit_transform(train_y)
test_y = le.transform(test_y)
# print(train_X[:,1])



### Task 2 - Naive Bayes Classifier Implementation


### 1. Calculation of the prior probability of each class

In [323]:
def calculate_prior_probabilities(labels):
    num_samples = len(labels)
    unique_labels, label_counts = np.unique(labels, return_counts=True)
    prior_probs = {}
    
    for label, label_count in zip(unique_labels, label_counts):
        prior_probs[label] = label_count / num_samples
    
    return prior_probs


In [324]:
prior_prob=calculate_prior_probabilities(train_y)
print(prior_prob)


{0: 0.7613568645427459, 1: 0.23864313545725419}


### 2.  Calculation of the conditional probability of each feature

In [325]:
def calculate_conditional_probabilities(features, target):
    num_features = features.shape[1]
#     print(features.names)
#     print(num_features)
    unique_targets = np.unique(target)
#     print(unique_targets)
    conditional_probs = {}

    for label in unique_targets:
        label_features = features[target == label]
        conditional_probs[label] = {}

        for i in range(num_features):
            feature_values, feature_counts = np.unique(label_features[:, i], return_counts=True)
#             print(feature_values,feature_counts)
            total_count = np.sum(feature_counts)
#             print(total_count)
            prob_dict = {value: count/total_count for value, count in zip(feature_values, feature_counts)}
            conditional_probs[label][i] = prob_dict

    return conditional_probs


In [326]:
cond_prob=calculate_conditional_probabilities(train_X,train_y)

### 3. Predict the class of a given instance using the Naive Bayes


In [327]:
def predict(instance, prior_probs, conditional_probs):
    unique_labels = list(prior_probs.keys())
    num_features = len(instance)

    class_probs = {}
    for label in unique_labels:
        class_probs[label] = prior_probs[label]

        for i in range(num_features):
            feature_value = instance[i]
            if feature_value in conditional_probs[label][i]:
                class_probs[label] *= conditional_probs[label][i][feature_value]
            else:
                class_probs[label] *=0
#     print(class_probs)
    return max(class_probs, key=class_probs.get)


In [328]:
predictions=[]
for i in range(len(test_X)):
    predictions.append(predict(test_X[i],prior_prob,cond_prob))

### 4. Function to calculate the accuracy of your Naive Bayes

In [329]:
def calculate_accuracy(predictions, actual_labels):
    num_correct = sum(predictions == actual_labels)
    accuracy = num_correct / len(actual_labels)
    return accuracy

In [330]:
calculate_accuracy(predictions, test_y)

0.764749674297413

### Task 3 - Evaluation and Improvement

### 1. Accuracy, Precision, Recall, and F1-score of Naive Bayes Classifier

In [331]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_performance(predictions, actual_labels):
    accuracy = accuracy_score(actual_labels, predictions)
    precision = precision_score(actual_labels, predictions, average='weighted')
    recall = recall_score(actual_labels, predictions, average='weighted')
    f1 = f1_score(actual_labels, predictions, average='weighted')
    return accuracy, precision, recall, f1


In [332]:
evaluate_performance(predictions, test_y)

(0.764749674297413, 0.7355776253273522, 0.764749674297413, 0.7384193433160445)

###  2. Smoothing Technique

In [333]:
def predict_smoothing(instance, prior_probs, conditional_probs):
    unique_labels = list(prior_probs.keys())
    num_features = len(instance)

    class_probs = {}
    for label in unique_labels:
        class_probs[label] = prior_probs[label]

        for i in range(num_features):
            feature_value = instance[i]
            if feature_value in conditional_probs[label][i]:
                class_probs[label] *= conditional_probs[label][i][feature_value]
            else:
                class_probs[label] *=1e-3
#     print(class_probs)
    return max(class_probs, key=class_probs.get)


In [334]:
predictions_smoothing=[]
for i in range(len(test_X)):
    predictions_smoothing.append(predict_smoothing(test_X[i],prior_prob,cond_prob))

In [335]:
evaluate_performance(predictions_smoothing, test_y)

(0.7963893541782989,
 0.8331712795819932,
 0.7963893541782989,
 0.8065158939442165)

### 3. Naive Bayes VS  Logistic Regression VS K-Nearest Neighbors

In [338]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import OneHotEncoder

# create OneHotEncoder object
enc = OneHotEncoder(handle_unknown='ignore')

# fit encoder on categorical features
enc.fit(train_X)

# transform categorical features to one-hot encoded features
X_train_enc = enc.transform(train_X).toarray()
X_test_enc = enc.transform(test_X).toarray()


# Load the dataset
X_train, y_train, X_test, y_test = X_train_enc,train_y,X_test_enc,test_y

print(X_train)



# Train and test a logistic regression classifier
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr)
rec_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

# Train and test a k-nearest neighbors classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred_knn)
prec_knn = precision_score(y_test, y_pred_knn)
rec_knn = recall_score(y_test, y_pred_knn)
f1_knn = f1_score(y_test, y_pred_knn)

# Print the results
# print("Naive Bayes classifier:")
# print("Accuracy: {:.2f}".format(acc_nb))
# print("Precision: {:.2f}".format(prec_nb))
# print("Recall: {:.2f}".format(rec_nb))
# print("F1-score: {:.2f}".format(f1_nb))
# print()
print("Logistic regression classifier:")
print("Accuracy: {:.2f}".format(acc_lr))
print("Precision: {:.2f}".format(prec_lr))
print("Recall: {:.2f}".format(rec_lr))
print("F1-score: {:.2f}".format(f1_lr))
print()
print("k-Nearest Neighbors classifier:")
print("Accuracy: {:.2f}".format(acc_knn))
print("Precision: {:.2f}".format(prec_knn))
print("Recall: {:.2f}".format(rec_knn))
print("F1-score: {:.2f}".format(f1_knn))


[[0. 1. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic regression classifier:
Accuracy: 0.87
Precision: 0.79
Recall: 0.62
F1-score: 0.70

k-Nearest Neighbors classifier:
Accuracy: 0.82
Precision: 0.66
Recall: 0.54
F1-score: 0.60
