In [100]:
# packages

import pandas as pd
import numpy as np
import random as random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score

In [101]:
# Load both customer files and drop the columns the model does not use.

# Labelled customers: feature frame X and binary target y.
data = (
    pd.read_csv('existing-customers.csv', sep=';', na_values=[''])
    .drop(columns=['RowID', 'education-num'])
)
X = data.drop(columns=['class'])
# One-hot encode the label and keep only the '>50K' indicator column.
y = pd.get_dummies(data[['class']], drop_first=True)['class_>50K']

# Unlabelled prospects: the RowID column is set aside before it is dropped
# because the output step selects from it.
new_data = pd.read_csv('potential-customers.csv', sep=';', na_values=[''])
new_data_ID = new_data['RowID']
new_data = new_data.drop(columns=['RowID', 'education-num'])
X_new = new_data  # alias, not a copy: later edits to X_new also change new_data


# Encode categorical (object-dtype) columns as integer codes.
# Codes are assigned per column from X, in order of first appearance, and the
# very same mapping is then applied to X_new so both frames use identical codes.
cat_columns = X.select_dtypes(['object']).columns
dictionaries = {
    column: {value: code for code, value in enumerate(X[column].unique())}
    for column in cat_columns
}

# transform categorical data to numeric
for column in cat_columns:
    X[column] = X[column].map(dictionaries[column])
    encoded_new = X_new[column].map(dictionaries[column])
    # A value of X_new that never occurs in X has no code and maps to NaN.
    # Fail here with a clear message instead of later inside the classifier,
    # which does not accept NaN input.
    if encoded_new.isna().any():
        unseen = X_new.loc[encoded_new.isna(), column].unique()
        raise ValueError(
            f"column '{column}' of the potential-customer data contains values "
            f"that do not occur in the existing-customer data: {list(unseen)}"
        )
    X_new[column] = encoded_new

# NOTE(review): every object column was mapped to numbers above, so this call
# appears to leave X unchanged — confirm and consider dropping it.
X = pd.get_dummies(X, drop_first=True)


# Split the dataset into train (70%), validation (15%) and test (15%) sets.
# A fixed seed makes the splits and the bootstrap samples repeatable after a
# kernel restart; seeding NumPy's global generator also covers the later
# np.random.choice calls made elsewhere in the notebook.
SEED = 42
np.random.seed(SEED)

X_train, X_val_test, y_train, y_val_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_val_test, y_val_test, test_size=0.5, random_state=SEED
)

# Split the train data into multiple subsets using bootstrap sampling:
# each subset has the size of the training set and is drawn with replacement.
n_subsets = 20
customer_subsets = []
for _ in range(n_subsets):
    subset_indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_train_subset = X_train.iloc[subset_indices]
    y_train_subset = y_train.iloc[subset_indices]
    customer_subsets.append((X_train_subset, y_train_subset))


In [102]:
# test which hyperparameters are best on validation set                                

def build_bayes(alpha, validation):
    """Fit a CategoricalNB on one randomly chosen bootstrap sample.

    Reads the module-level ``customer_subsets`` and, when validating,
    ``X_val`` and ``y_val``.

    Parameters
    ----------
    alpha : float
        Smoothing parameter handed to ``CategoricalNB`` (``force_alpha=True``).
    validation : bool
        When truthy, the fitted model is scored on the validation set and the
        accuracy is returned; otherwise the fitted model itself is returned.

    Returns
    -------
    float or CategoricalNB
        Validation accuracy, or the fitted classifier.
    """
    # draw the index of the bootstrap sample this model is trained on
    chosen = np.random.choice(len(customer_subsets), size=1)[0]
    features, labels = customer_subsets[chosen]

    model = CategoricalNB(force_alpha=True, alpha=alpha)
    model.fit(features, labels)

    if not validation:
        return model
    return accuracy_score(y_val, model.predict(X_val))


def param_testing(alphas):
    """Return the alpha that reaches the highest validation accuracy.

    Each candidate is evaluated once via ``build_bayes(alpha, True)``; progress
    is printed whenever a new best value is found. On ties the earliest
    candidate wins.

    Parameters
    ----------
    alphas : iterable of float
        Candidate smoothing values to try.

    Returns
    -------
    The element of ``alphas`` with the best validation accuracy.

    Raises
    ------
    ValueError
        If ``alphas`` is empty.
    """
    alphas = list(alphas)
    if not alphas:
        raise ValueError("alphas must contain at least one value")

    # Start below any possible accuracy so the first candidate is always
    # recorded, even when its accuracy is exactly 0.
    best_alpha = None
    best_acc = float("-inf")
    for alpha in alphas:
        val_acc = build_bayes(alpha, True)
        if val_acc > best_acc:
            best_alpha = alpha
            best_acc = val_acc
            print(f"current best alpha: {best_alpha}, accuracy is {best_acc}")
    print(f"\n best alpha is: {best_alpha}, accuracy is {best_acc}")
    return best_alpha


In [103]:
# Try each candidate alpha and keep the one with the best validation accuracy.
# NOTE(review): an earlier note said this step can be skipped, but the next cell
# reads best_alpha, which is only assigned here.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000] # alpha can't be zero or very close to zero

best_alpha = param_testing(alphas)

current best alpha: 0.0001, accuracy is 0.8351760851760852
current best alpha: 0.01, accuracy is 0.8355855855855856
current best alpha: 0.1, accuracy is 0.8462325962325963
current best alpha: 1, accuracy is 0.8527846027846028

 best alpha is: 1, accuracy is 0.8527846027846028


In [104]:
# Build the final model with the best alpha found above.
# build_bayes draws a random bootstrap sample on every call, so this model is
# fitted afresh and is not necessarily the one scored during the alpha search.
bayes = build_bayes(best_alpha, False)

In [105]:
# Measure the accuracy of the selected model on the held-out test set.
test_predict = bayes.predict(X_test)
test_acc = accuracy_score(y_test, test_predict)

print(test_acc)

0.8524053224155578


In [106]:
# Score the unseen customers and decide who should receive the promotion.
# Column 0 of predict_proba belongs to the first entry of bayes.classes_
# (presumably the "not >50K" class — verify against bayes.classes_).
new_probability = bayes.predict_proba(X_new)[:, 0]

# Expected yield per customer if the promotion is sent.
# NOTE(review): 980*0.1 and 310*0.05 look like amount x acceptance rate for the
# two classes and 10 like the cost per mailing — confirm with the assignment.
gain_term = (1 - new_probability) * 980 * 0.1
loss_term = new_probability * 310 * 0.05
yield_if_sent = gain_term - loss_term - 10

# Send only where the expected yield is positive.
send_promo = pd.Series(yield_if_sent > 0)

# Total expected yield over the selected customers, and how many there are.
profit_estimate = yield_if_sent[send_promo.to_numpy()].sum()
amount_to_send = send_promo.sum()
print(f"amount of promos to send: {amount_to_send}, with estimated profit: {profit_estimate}")


amount of promos to send: 4655, with estimated profit: 233779.0201494613


In [107]:
# Write the selected customer IDs and the estimated profit to a text file.

# IDs of the customers selected for the promotion
sending_ID = new_data_ID[send_promo.values]
# Total expected yield of sending the promotion to the selected IDs
profit_estimate = yield_if_sent[send_promo.values].sum()

with open('selected_customers_bayes.txt', 'w') as f:
    # The header ends with a newline so the first ID starts on its own line
    # instead of being glued to the end of the header text.
    f.write(f"Potential profit estimate: \n {str(profit_estimate)} \n \n ID's of potential customers to send promo to:\n")
    # str() each ID so the join also works if RowID was parsed as a number.
    f.write('\n'.join(map(str, sending_ID)))