In [10]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
# Load the raw income data
income = pd.read_csv('income.csv')

# Feature columns used to train the models
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]

# Replace the target and every feature column with its integer category
# codes so that all model inputs are numeric
for name in ["high_income"] + columns:
    income[name] = pd.Categorical(income[name]).codes

# Build the classifier; random_state=1 keeps the results consistent
clf = DecisionTreeClassifier(random_state=1)

# Fit on every row of `income` (no train/test split at this point)
clf.fit(income[columns], income['high_income'])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

In [11]:
import numpy
import math
# Fix the seed so the shuffle below is the same on every run
numpy.random.seed(1)

# Put the rows in random order: permute the index labels, then reindex the
# frame by that permuted index.
# NOTE: this rebinds `income`, so re-running the cell shuffles the
# already-shuffled frame again.
shuffled_index = numpy.random.permutation(income.index)
income = income.reindex(shuffled_index)

# The first 80% of the shuffled rows form the training set, the rest the test set
train_max_row = math.floor(len(income) * 0.8)
train, test = income.iloc[:train_max_row], income.iloc[train_max_row:]

In [12]:
from sklearn.metrics import roc_auc_score
# Train a fresh tree on the training split only
clf = DecisionTreeClassifier(random_state=1).fit(train[columns], train["high_income"])

# Score the held-out split. Despite its name, `error` holds an AUC score.
# NOTE(review): roc_auc_score receives hard class labels from predict() here
# rather than probabilities — consider predict_proba(...)[:, 1] instead.
predictions = clf.predict(test[columns])
error = roc_auc_score(y_true=test['high_income'], y_score=predictions)

In [14]:
# Display the test-set AUC stored in `error` (an AUC score, despite the name)
error

0.6935681755427078

In [16]:
# Score the current `clf` on the rows it was trained on, for comparison with
# the test-set score
pred_train = clf.predict(train[columns])
error_train = roc_auc_score(y_true=train['high_income'], y_score=pred_train)
error_train

0.9471244501437455

In [18]:
# Constrain the tree: a node needs at least 13 samples before it may be split
clf = DecisionTreeClassifier(min_samples_split=13, random_state=1)
clf.fit(train[columns], train["high_income"])

# Predict on both splits, then print train AUC followed by test AUC
pred_train, pred_test = clf.predict(train[columns]), clf.predict(test[columns])
train_auc = roc_auc_score(y_true=train['high_income'], y_score=pred_train)
test_auc = roc_auc_score(y_true=test['high_income'], y_score=pred_test)
print(train_auc, test_auc)

0.8421431849275413 0.6995617145150872


In [19]:
# Constrain the tree further: cap its depth at 7 in addition to requiring
# at least 13 samples to split a node
clf = DecisionTreeClassifier(max_depth=7, min_samples_split=13, random_state=1)
clf.fit(train[columns], train["high_income"])

# Predict on both splits, then print train AUC followed by test AUC
pred_train, pred_test = clf.predict(train[columns]), clf.predict(test[columns])
train_auc = roc_auc_score(y_true=train['high_income'], y_score=pred_train)
test_auc = roc_auc_score(y_true=test['high_income'], y_score=pred_test)
print(train_auc, test_auc)

0.748037708309209 0.7436344996725136


In [20]:
numpy.random.seed(1)

# Add a column of random integers from 0 to 3 (randint's upper bound of 4
# is exclusive)
income["noise"] = numpy.random.randint(0, 4, size=len(income))

# Feature list with the noise column included
columns = [
    "noise",
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]

# Re-slice train and test so that both contain the new column
train_max_row = math.floor(len(income) * 0.8)
train, test = income.iloc[:train_max_row], income.iloc[train_max_row:]

# Fit an unconstrained tree on the training split
clf = DecisionTreeClassifier(random_state=1).fit(train[columns], train["high_income"])

# Compute test AUC and train AUC, then print them in that order
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(y_true=test["high_income"], y_score=predictions)
train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(y_true=train["high_income"], y_score=train_predictions)
print(test_auc)
print(train_auc)

0.6914060013941348
0.9750761614350801


In [21]:

# Feature list without the "noise" column
columns = [
    "age", "workclass", "education_num", "marital_status", "occupation",
    "relationship", "race", "sex", "hours_per_week", "native_country",
]

# Two differently constrained trees fitted on the same training data
clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
clf.fit(train[columns], train["high_income"])
clf2 = DecisionTreeClassifier(random_state=1, max_depth=5)
clf2.fit(train[columns], train["high_income"])

# Print each tree's test AUC: first clf, then clf2
for model in (clf, clf2):
    predictions = model.predict(test[columns])
    print(roc_auc_score(test["high_income"], predictions))

0.6878964226062301
0.6759853906508785


In [22]:
# Take column 1 of each tree's predict_proba output for the test rows
predictions = clf.predict_proba(test[columns])[:, 1]
predictions2 = clf2.predict_proba(test[columns])[:, 1]

# Average the two trees' probabilities, round to a hard label, and score
combined = numpy.add(predictions, predictions2) / 2
rounded = numpy.round(combined)
print(roc_auc_score(y_true=test["high_income"], y_score=rounded))

0.7150846804038882


In [24]:
# Bagging: train several trees on bootstrap samples of the training data and
# average their predicted probabilities.

# Number of trees to build
tree_count = 10

# Each "bag" holds 60% as many rows as the training set
bag_proportion = .6

predictions = []

for i in range(tree_count):
    # Sample with replacement. random_state=i gives every tree a different
    # (but reproducible) bag; a fixed value would make all trees the same.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    # Fit a decision tree model to the bag
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=2)
    clf.fit(bag[columns], bag["high_income"])

    # Keep column 1 of predict_proba for every test row
    predictions.append(clf.predict_proba(test[columns])[:,1])

# Average across trees. Dividing by tree_count (rather than a literal 10)
# keeps the mean correct if the number of trees is changed.
combined = numpy.sum(predictions, axis=0) / tree_count
rounded = numpy.round(combined)
print(roc_auc_score(test["high_income"], rounded))


0.7327934360409382


In [26]:
# Bagging with random feature subsets: as plain bagging, but each tree also
# considers only a subset of the features at every split.

# Number of trees to build
tree_count = 10

# Each "bag" holds 60% as many rows as the training set
bag_proportion = .6

predictions = []

for i in range(tree_count):
    # Sample with replacement. random_state=i gives every tree a different
    # (but reproducible) bag instead of the same sample every time.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)

    # Fit a decision tree model to the bag.
    # With N feature columns, max_features="sqrt" makes each split consider a
    # subset of about sqrt(N) features and split on the best one in that subset.
    # "sqrt" replaces the equivalent "auto" alias, which newer scikit-learn
    # releases deprecated and then removed for decision trees.
    clf = DecisionTreeClassifier(random_state=1,
                                 min_samples_leaf=2,
                                 splitter='random',
                                 max_features='sqrt')
    clf.fit(bag[columns], bag["high_income"])

    # Keep column 1 of predict_proba for every test row
    predictions.append(clf.predict_proba(test[columns])[:,1])

# Average across trees. Dividing by tree_count (rather than a literal 10)
# keeps the mean correct if the number of trees is changed.
combined = numpy.sum(predictions, axis=0) / tree_count

rounded = numpy.round(combined)

print(roc_auc_score(test["high_income"], rounded))

0.7305357972400943


In [28]:
from sklearn.ensemble import RandomForestClassifier
# Fit a 150-tree random forest on the training split and print its test AUC
clf = RandomForestClassifier(
    n_estimators=150,
    min_samples_leaf=2,
    random_state=1,
)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
print(roc_auc_score(y_true=test["high_income"], y_score=predictions))

0.7379403213124711


In [29]:
# Compare overfitting: for a single tree and then a random forest (both with
# min_samples_leaf=5), print the train AUC followed by the test AUC.
# A smaller train/test gap means less overfitting.
for clf in (
    DecisionTreeClassifier(random_state=1, min_samples_leaf=5),
    RandomForestClassifier(n_estimators=150, random_state=1, min_samples_leaf=5),
):
    clf.fit(train[columns], train["high_income"])
    for split in (train, test):
        predictions = clf.predict(split[columns])
        print(roc_auc_score(split["high_income"], predictions))

0.8192570489534683
0.7139325899284541
0.7917047295143252
0.7498874343962398
