In [22]:
# packages

import itertools
import random as random

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [23]:
# reading data, removing redundant columns
# Both files are ';'-separated and empty fields are read as NA.

data = pd.read_csv('existing-customers.csv', sep=';', na_values=['']).drop(
    columns=['RowID', 'education-num']
)

new_data = pd.read_csv('potential-customers.csv', sep=';', na_values=[''])
# keep the IDs of the potential customers so predictions can be mapped back to them
new_data_ID = new_data.pop('RowID')
new_data = new_data.drop(columns=['education-num'])

# splitting features from label, transforming categorical data into dummies (separate variable for NA)
X = data.drop(columns = ['class'])
X = pd.get_dummies(X, dummy_na=True)
# label is kept as a one-column DataFrame; drop_first leaves a single indicator column
y = data[['class']]
y = pd.get_dummies(y, drop_first=True)
X_new = pd.get_dummies(new_data, dummy_na=True)

# making sure new customer data has exactly the same columns, in the same order, as the training data:
# dummies that are missing from the new data are added as all-zero columns, and dummies for
# categories that do not occur in the training data are dropped
X_new = X_new.reindex(columns=X.columns, fill_value=0)

# Fix the random seeds so the splits, the bootstrap samples and the later per-tree sampling
# (which uses both the `random` module and `np.random`) are reproducible when the notebook
# is run top to bottom on a fresh kernel
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Split the dataset into train (70%), val (15%), and test (15%) sets
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=SEED)

# Split the train data into multiple subsets using bootstrap sampling
# (each subset has as many rows as the train set, drawn with replacement)
n_subsets = 20
customer_subsets = []
for i in range(n_subsets):
    subset_indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_train_subset = X_train.iloc[subset_indices]
    y_train_subset = y_train.iloc[subset_indices]
    customer_subsets.append((X_train_subset, y_train_subset))


In [24]:
# defining functions to build tree, build forest, test hyperparameters (to be used in next step)

# Group the dummies from each categorical variable together (to be able to select features keeping dummies together)
# NOTE: the name `vars` shadows the Python builtin of the same name; it is kept unchanged here
# so that any other cell referring to it keeps working
vars = ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']
groups = []
for var in vars:
    # A column belongs to a variable when it is the variable itself (column that was not
    # dummy-encoded) or one of its dummies, which pd.get_dummies names "<variable>_<category>"
    # with the default separator. Matching the exact name / "<variable>_" prefix instead of a
    # bare prefix avoids pulling in unrelated columns that merely start with the same text.
    var_dummies = [col for col in X.columns if col == var or col.startswith(var + '_')]
    groups.append(var_dummies)

# build a decision tree with given parameters, and predict the outcome for the validation set
def build_tree(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, groups, validation):
    """Train one decision tree on a random bootstrap subset using a random set of variables.

    Parameters
    ----------
    n_feature : number of entries drawn from `groups` with `random.sample`.
    criterion, min_samples_leaf, max_depth, min_samples_split :
        passed unchanged to `DecisionTreeClassifier`.
    groups : list of lists of column names; all columns of a drawn group are used together.
    validation : when truthy, the predictions of the tree for the module-level `X_val`
        are returned; otherwise `[tree, selected_features]` is returned.

    Reads the module-level `customer_subsets` (and `X_val` in validation mode).
    """
    # draw the variable groups, then flatten them into one list of column names
    drawn_groups = random.sample(groups, k=n_feature)
    selected_features = [column for group in drawn_groups for column in group]

    # pick one of the pre-built bootstrap samples at random
    subset_idx = np.random.choice(len(customer_subsets), size=1)[0]
    X_boot, y_boot = customer_subsets[subset_idx]

    # fit the tree on the selected columns only
    clf = DecisionTreeClassifier(
        criterion=criterion,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
    )
    clf.fit(X_boot[selected_features], y_boot)

    if not validation:
        # hand back the tree together with the columns it expects as input
        return [clf, selected_features]

    # validation mode: predict the validation set restricted to the tree's columns
    return clf.predict(X_val[selected_features])

# build a random forest with given parameters, and calculate its accuracy
def build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, validation):
    """Build `n_tree` trees with `build_tree`.

    When `validation` is truthy, the per-tree predictions for the module-level `X_val`
    are combined row by row with `DataFrame.mode` and the accuracy against the
    module-level `y_val` is returned.
    Otherwise `[forest, features]` is returned: the fitted trees and, for every tree,
    the list of column names it was trained on.
    """
    tree_args = (n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, groups)

    if not validation:
        # keep every fitted tree together with the columns it was trained on
        built = [build_tree(*tree_args, False) for _ in range(n_tree)]
        forest = [pair[0] for pair in built]
        features = [pair[1] for pair in built]
        return [forest, features]

    # one column of validation predictions per tree
    votes = np.zeros((len(X_val), n_tree))
    for column in range(n_tree):
        votes[:, column] = build_tree(*tree_args, True)

    # aggregate the predictions of all trees: most frequent value per row
    majority = pd.DataFrame(votes).mode(axis=1)[0]
    return accuracy_score(y_val, majority)
  
    
# test which hyperparameters are best on validation set
def param_testing(n_features, criterions, min_samples_leafs, max_depths, min_samples_splits, n_trees, groups):
    """Exhaustively try every combination of the given hyperparameter values.

    For each combination a forest is built and scored with
    `build_random_forest(..., groups, True)`; the combination with the highest
    returned accuracy wins (the first one encountered wins ties).

    Parameters
    ----------
    n_features, criterions, min_samples_leafs, max_depths, min_samples_splits, n_trees :
        iterables with the candidate values for each hyperparameter.
    groups : passed unchanged to `build_random_forest`.

    Returns
    -------
    list
        `[n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree]`
        of the best combination.

    Raises
    ------
    ValueError
        If any of the candidate iterables is empty, so there is nothing to evaluate.
    """
    best_params = None
    best_acc = 0
    # itertools.product iterates in the same order as the equivalent nested for-loops
    grid = itertools.product(n_features, criterions, min_samples_leafs, max_depths, min_samples_splits, n_trees)
    for n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree in grid:
        val_acc = build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, True)
        # the first combination is always recorded, so a result exists even if every accuracy is 0
        if best_params is None or val_acc > best_acc:
            best_params = [n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree]
            best_acc = val_acc
            print(f"current best parameters are n_features: {best_params[0]}, criterion: {best_params[1]}, min_samples_leaf: {best_params[2]}, max_depth: {best_params[3]}, min_samples_split: {best_params[4]}, n_trees: {best_params[5]}, accuracy is {best_acc}")
    if best_params is None:
        raise ValueError("param_testing needs at least one candidate value for every hyperparameter")
    print(f"\n best parameters are n_features: {best_params[0]}, criterion: {best_params[1]}, min_samples_leaf: {best_params[2]}, max_depth: {best_params[3]}, min_samples_split: {best_params[4]}, n_trees: {best_params[5]}, accuracy is {best_acc}")
    return best_params

In [16]:
# testing which hyperparameters give best results (can be skipped: use best parameters given in next step)
# candidate values; every combination of them is evaluated by param_testing
n_features = [6, 9, 12]
criterions = ["gini", "entropy", "log_loss"]
min_samples_leafs = [10, 20, 30]
max_depths = [10, 20]
min_samples_splits = [2]
n_trees = [5, 10, 50]

best_params = param_testing(
    n_features=n_features,
    criterions=criterions,
    min_samples_leafs=min_samples_leafs,
    max_depths=max_depths,
    min_samples_splits=min_samples_splits,
    n_trees=n_trees,
    groups=groups,
)


current best parameters are n_features: 9, criterion: gini, min_samples_leaf: 10, max_depth: 20, min_samples_split: 2, n_trees: 50, accuracy is 0.8597461097461098
current best parameters are n_features: 9, criterion: gini, min_samples_leaf: 20, max_depth: 20, min_samples_split: 2, n_trees: 50, accuracy is 0.8607698607698607
current best parameters are n_features: 9, criterion: entropy, min_samples_leaf: 10, max_depth: 20, min_samples_split: 2, n_trees: 50, accuracy is 0.8634316134316135

 best parameters are n_features: 9, criterion: entropy, min_samples_leaf: 10, max_depth: 20, min_samples_split: 2, n_trees: 50, accuracy is 0.8634316134316135


In [25]:
# Best parameters found by the last hyperparameter search. Uncomment the assignment below
# to reuse them without running the hyperparameter search cell again.
# best_params = [9, "entropy", 10, 20, 2, 50]
# validation accuracy was 0.8634316134316135

In [26]:
# building a forest with the best found parameters

# The hyperparameter search cell is documented as skippable. When it has not been run,
# `best_params` does not exist yet, so fall back to the result recorded from the last search.
if 'best_params' not in globals():
    best_params = [9, "entropy", 10, 20, 2, 50]
    print(f"hyperparameter search was not run, using recorded best parameters: {best_params}")

n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree = best_params
# validation=False: return the fitted trees and the columns each tree was trained on
forest, features = build_random_forest(n_feature, criterion, min_samples_leaf, max_depth, min_samples_split, n_tree, groups, False)

In [27]:
# testing the accuracy of the selected random forest on the test set

# one column of class predictions per tree
test_predict = np.zeros((len(X_test), n_tree))
for i in range(n_tree):
    # every tree only gets the columns it was trained on
    test_features = features[i]
    test_predict[:, i] = forest[i].predict(X_test[test_features])

# aggregate the predictions of all the decision trees on the test set: most frequent value per row
test_predict = pd.DataFrame(test_predict).mode(axis=1)[0]
test_acc = accuracy_score(y_test, test_predict)

print(test_acc)

0.8552712384851586


In [41]:
# performing predictions on unseen data, and deciding which people to send promo to

# one column per tree, holding the first column of predict_proba (the tree's first class)
new_probability = np.zeros((len(X_new), n_tree))
for idx in range(n_tree):
    # restrict the new data to the columns this tree was trained on
    member_features = features[idx]
    new_probability[:, idx] = forest[idx].predict_proba(X_new[member_features])[:, 0]

# average the per-tree probabilities for every potential customer
new_probability = pd.DataFrame(new_probability).mean(axis=1)

# Expected yield of sending the promo, based on the averaged probability
# NOTE(review): 980 * 0.1, 310 * 0.05 and 10 presumably encode the payoff, response rate and
# mailing cost from the business case; confirm against the assignment
yield_if_sent = (1 - new_probability) * 980 * 0.1 - new_probability * 310 * 0.05 - 10
# send the promo only where the expected yield is positive
send_promo = yield_if_sent > 0

# total expected yield over the selected people, and how many were selected
profit_estimate = yield_if_sent[send_promo.values].sum()
amount_to_send = send_promo.sum()
print(f"amount of promos to send: {amount_to_send}, with estimated profit: {profit_estimate}")

amount of promos to send: 6633, with estimated profit: 209897.97660882477


In [40]:
# Writing potential customer ID's and potential profit to text file

# Selecting corresponding ID's (boolean mask taken positionally via .values)
sending_ID = new_data_ID[send_promo.values]

# writing to text file
with open('selected_customers_forest.txt', 'w') as f:
    # the header ends with a newline so that the first ID starts on its own line
    f.write(f"Potential profit estimate: \n {str(profit_estimate)} \n \n ID's of potential customers to send promo to:\n")
    # IDs are converted to str so the join also works when RowID is not a string column
    f.write('\n'.join(sending_ID.astype(str)))