In [1]:
# packages

import pandas as pd
import numpy as np
import random as random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [2]:
# reading data, removing redundant columns

data = pd.read_csv('existing-customers.csv', sep=';', na_values=[''])
y = data[['class']]
y = pd.get_dummies(y, drop_first=True)
y = y['class_>50K']
data = data.drop(columns = ['RowID', 'education-num'])
data = data.drop(columns = ['class'])

new_data = pd.read_csv('potential-customers.csv', sep=';', na_values=[''])
new_data_ID = new_data['RowID']
new_data = new_data.drop(columns = ['RowID', 'education-num'])


## Bayesian classifier

In [3]:
# transforming data to fit for Bayesian classifier
X_bayes = data
X_bayes_new = new_data

# make dictionary to transform categorical data to numeric, to make sure new customer data has same numbering as training data
cat_columns = X_bayes.select_dtypes(['object']).columns
dictionaries = {}
for column in cat_columns:
    dictionary = {}
    count = 0
    for unique_value in X_bayes[column].unique():
        dictionary[unique_value] = count
        count += 1    
    dictionaries[column] = dictionary
    
# transform categorical data to numeric    
for column in cat_columns:
    X_bayes[column] = X_bayes[column].map(dictionaries[column])
    X_bayes_new[column] = X_bayes_new[column].map(dictionaries[column])
X_bayes = pd.get_dummies(X_bayes, drop_first=True)


In [11]:
# Split the dataset into train, val, and test sets
X_bayes_train, X_bayes_val_test, y_bayes_train, y_bayes_val_test = train_test_split(X_bayes, y, test_size=0.3)
X_bayes_val, X_bayes_test, y_bayes_val, y_bayes_test = train_test_split(X_bayes_val_test, y_bayes_val_test, test_size=0.5)

# Split the train data into multiple subsets using bootstrap sampling
n_subsets = 10
customer_subsets_bayes = []
for i in range(n_subsets):
    subset_indices = np.random.choice(len(X_bayes_train), size=len(X_bayes_train), replace=True)
    X_bayes_train_subset = X_bayes_train.iloc[subset_indices]
    y_bayes_train_subset = y_bayes_train.iloc[subset_indices]
    customer_subsets_bayes.append((X_bayes_train_subset, y_bayes_train_subset))


In [9]:
# defining functions to build bayes, test hyperparameters (to be used in next step)                               

def build_bayes(alpha, validation):
    # select a subset of the data for this bayes
    subset = np.random.choice(len(customer_subsets_bayes), size=1)[0]
    X_bayes_train_subset, y_bayes_train_subset = customer_subsets_bayes[subset]
    # train bayes
    bayes = CategoricalNB(force_alpha=True, alpha=alpha)
    bayes.fit(X_bayes_train_subset, y_bayes_train_subset)
    if validation:
        # predict val with this bayes
        val_predict = bayes.predict(X_bayes_val)
        val_acc = accuracy_score(y_bayes_val, val_predict)
        # return accuracy
        return val_acc
    else:
        #return bayes
        return bayes


def param_testing(alphas):
    best_acc = 0
    for alpha in alphas:
        val_acc = build_bayes(alpha, True) 
        if val_acc > best_acc:
            best_alpha = alpha
            best_acc = val_acc
            print(f"current best alpha: {best_alpha}, accuracy is {best_acc}")
    print(f"\n best alpha is: {best_alpha}, accuracy is {best_acc}")
    return best_alpha


In [12]:
# testing which hyperparameters give best results (can be skipped: use best parameters given in next step)
alphas = [0.0001, 0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000, 10000] #alpha can't be zero or very close to zero

best_alpha = param_testing(alphas)

current best alpha: 0.0001, accuracy is 0.8404995904995906
current best alpha: 0.1, accuracy is 0.8511466011466011
current best alpha: 0.5, accuracy is 0.8587223587223587
current best alpha: 1, accuracy is 0.8609746109746109

 best alpha is: 1, accuracy is 0.8609746109746109


In [13]:
# building bayes with the best found alpha
bayes = build_bayes(best_alpha, False)

In [16]:
# testing the acuracy of the selected bayes on test set
test_predict = bayes.predict(X_bayes_test)
test_acc = accuracy_score(y_bayes_test, test_predict)

print(f" accuracy of Bayesian classifier is {test_acc}")

 accuracy of Bayesian classifier is 0.8554759467758444


## Decision tree and random forest

In [18]:
# transforming data to fit for decision tree classifier
X_tree = data
X_tree_new = new_data

# trasforming categorical data into dummies (seperate variabe for NA)
X_tree = pd.get_dummies(X_tree, dummy_na=True)
X_tree_new = pd.get_dummies(X_tree_new, dummy_na=True)

# checking all columns unique values, to make sure new customer data has same columns as training data
for category in X_tree.columns.values.tolist():
    if category not in X_tree_new.columns:
        X_tree_new[category] = 0




In [20]:
# Split the dataset into train, val, and test sets
X_tree_train, X_tree_val_test, y_tree_train, y_tree_val_test = train_test_split(X_tree, y, test_size=0.3)
X_tree_val, X_tree_test, y_tree_val, y_tree_test = train_test_split(X_tree_val_test, y_tree_val_test, test_size=0.5)

# Split the train data into multiple subsets using bootstrap sampling
n_subsets = 20
customer_subsets_tree = []
for i in range(n_subsets):
    subset_indices = np.random.choice(len(X_tree_train), size=len(X_tree_train), replace=True)
    X_tree_train_subset = X_tree_train.iloc[subset_indices]
    y_tree_train_subset = y_tree_train.iloc[subset_indices]
    customer_subsets_tree.append((X_tree_train_subset, y_tree_train_subset))

In [21]:
# defining functions to build tree, build forest, test hy_treeperparameters (to be used in next step)

# Group the dummies from each categorical variable together (to be able to select features keeping dummies together)
vars = ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country_tree']
groups = []
for var in vars:
    var_dummies = [col for col in X_tree.columns if col.startswith(var)]
    groups.append(var_dummies)

# build a decision tree with given parameters, and predict the outcome for the validation set
def build_tree(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, groups, validation):
    # randomly_tree select a subset of features for tree
    selected_groups = []
    groups = random.sample(groups, k=n_feature)
    selected_groups.append(groups)
    selected_features = []
    for groups in selected_groups:
        for group in groups:
            selected_features += group
    # select a subset of the data for tree
    subset = np.random.choice(len(customer_subsets_tree), size=1)[0]
    X_tree_train_subset, y_tree_train_subset = customer_subsets_tree[subset]
    X_tree_train_subset = X_tree_train_subset[selected_features]
    # train tree
    tree = DecisionTreeClassifier(criterion=criterion, min_samples_leaf=min_samples_leaf, max_depth=max_depth,  min_samples_split= min_samples_split)
    tree.fit(X_tree_train_subset, y_tree_train_subset)
    if validation:
        # transforming validation set to fit with selected features of tree
        X_tree_val_subset = X_tree_val[selected_features]
        # predict val with this tree
        predict = tree.predict(X_tree_val_subset)
        return predict
    else:
        #return tree with its selected features
        return [tree, selected_features]

# build a random forest with given parameters, and calculate its accuracy_tree
def build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, validation):
    if validation:
        val_predict = np.zeros((len(X_tree_val), n_tree))
        for i in range(n_tree):
            #fill column of validation prediction for this tree
            val_predict[:, i] = build_tree(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, groups, True)
        # Aggregate the predict of all the decision trees on the val set
        val_predict = pd.DataFrame(val_predict)
        val_predict = val_predict.mode(axis=1)[0]
        val_acc = accuracy_tree_score(y_tree_val, val_predict)
        return(val_acc)
    else:
        forest=[]
        features=[]
        for i in range(n_tree):
            #fill column of validation prediction for this tree
            tree, selected_features = build_tree(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, groups, False)
            forest.append(tree) 
            features.append(selected_features)
        return([forest, features])
  
    
# test which hy_treeperparameters are best on validation set                                
def param_testing(n_features, criterions, min_samples_leafs, max_depths, min_samples_splits, n_trees, groups):
    best_acc = 0
    for n_feature in n_features:   
        for criterion in criterions: #["gini", "entropy_tree", "log_loss"]
            for min_samples_leaf in min_samples_leafs:
                for max_depth in max_depths:
                    for min_samples_split in min_samples_splits:
                        for n_tree in n_trees:
                            val_acc = build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, True) 
                            if val_acc > best_acc:
                                best_params = [n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree]
                                best_acc = val_acc
                                print(f"current best parameters are n_features: {best_params[0]}, criterion: {best_params[1]}, min_samples_leaf: {best_params[2]}, max_depth: {best_params[3]}, min_samples_split: {best_params[4]}, n_trees: {best_params[5]}, accuracy_tree is {best_acc}")
    print(f"\n best parameters are n_features: {best_params[0]}, criterion: {best_params[1]}, min_samples_leaf: {best_params[2]}, max_depth: {best_params[3]}, min_samples_split: {best_params[4]}, n_trees: {best_params[5]}, accuracy_tree is {best_acc}")
    return(best_params)


In [None]:
# testing which hyperparameters give best results (can be skipped: use best parameters given in next step)
n_features = [6, 9, 12]
criterions = ["gini", "entropy", "log_loss"]
min_samples_leafs = [10, 20, 30]
max_depths = [10, 20]
min_samples_splits = [2]
n_trees = [5, 10, 50]

best_params = param_testing(n_features, criterions, min_samples_leafs, max_depths, min_samples_splits, n_trees, groups)


In [23]:
# last found best parameters (to not have to run the testing of hyperparameters again)
# best_params = [9, "entropy", 10, 20, 2, 50]
# accuracy is 0.8634316134316135

In [24]:
# building a forest with the best found parameters
n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree = best_params
forest, features = build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, False)

In [25]:
# testing the acuracy of the selected random forest on test set

test_predict = np.zeros((len(X_tree_test), n_tree))
for i in range(n_tree):
    test_tree = forest[i]
    test_features = features[i]
    # transforming test set to fit with selected features of tree
    X_tree_test_subset = X_tree_test[test_features]
    # predict test with this tree
    test_predict[:, i] = test_tree.predict(X_tree_test_subset)
# Aggregate the predict of all the decision trees on the val set
test_predict = pd.DataFrame(test_predict)
test_predict = test_predict.mode(axis=1)[0]
test_acc = accuracy_score(y_tree_test, test_predict)

print(f" accuracy of random forest classifier is {test_acc}")


 accuracy of random forest classifier is 0.8624360286591607


## Aggregating two models on unseen data

In [66]:
# performing Bayesian predictions on unseen data
bayes_new_probability = bayes.predict_proba(X_bayes_new)[:,0]
bayes_new_probability = pd.DataFrame(bayes_new_probability)

# performing tree predictions on unseen data
tree_new_probability = np.zeros((len(X_tree_new), n_tree))
for i in range(n_tree):
    tree = forest[i]
    feature = features[i]
    # transforming new data set to fit with selected features of tree
    X_tree_new_subset = X_tree_new[feature]
    # predict test with this tree
    tree_new_probability[:, i] = tree.predict_proba(X_tree_new_subset)[:,0]
# Aggregate the probabilities of all the decision trees on the new data set
tree_new_probability = pd.DataFrame(tree_new_probability)
tree_new_probability = tree_new_probability.mean(axis=1)

# aggregating the two prediction
new_probability = bayes_new_probability
new_probability['tree']=tree_new_probability
new_probability = new_probability.mean(axis=1)

In [68]:
# Predicting the yield of sending promo, based on probabilities
yield_if_sent = (1-new_probability)*980*0.1-new_probability*310*0.05-10

# Desciding whether or not to send promo, based on yield
send_promo = (yield_if_sent > 0).astype(bool)
send_promo = pd.Series(send_promo)

# Calculating total yield of sending promo to selected people
profit_estimate = yield_if_sent[send_promo.values].sum()
amount_to_send = send_promo.sum()
print(f"amount of promos to send: {amount_to_send}, with estimated profit: {profit_estimate}")

amount of promos to send: 5364, with estimated profit: 212137.85547256417


In [70]:
# Writing potential customer ID's and potential profit to text file

# Selecting corresponding ID's
sending_ID = new_data_ID[send_promo.values]
# Calculating total yield of sending promo to selected ID's
profit_estimate = yield_if_sent[send_promo.values].sum()

#writing to text file
with open('selected_customers.txt', 'w') as f:
    f.write(f"Potential profit estimate: \n {str(profit_estimate)} \n \n ID's of potential customers to send promo to:")
    f.write('\n'.join(sending_ID))