### Predicting Income with Random Forests

In this project, we will be using a dataset containing census information from UCI's Machine Learning Repository.

By using this census data with a random forest, we will try to predict whether or not a person makes more than $50,000.

In [20]:
# Importing our modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Read the census CSV (its first row supplies the column names) and preview the top rows
INCOME_CSV_PATH = 'income.csv'
income_data = pd.read_csv(INCOME_CSV_PATH, header=0)
income_data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [17]:
# Features used by this model: "age", "capital-gain", "hours-per-week",
# "sex" and "native-country" (the last two are encoded as 0/1 flags below).
# NOTE(review): "capital-loss" was named in the original plan for this cell but
# is not in the feature list; confirm whether it should be added.

# Encode "sex" as Female = 1, anything else = 0 (vectorised comparison).
# NOTE(review): in the saved run the forest importance for "sex-int" is exactly 0,
# which is what a constant column would produce; verify that the raw "sex"
# values match 'Female' exactly (e.g. no surrounding whitespace).
income_data['sex-int'] = income_data['sex'].eq('Female').astype(int)

# Encode "native-country" as United-States = 1, anything else = 0
income_data['native-country-int'] = income_data['native-country'].eq('United-States').astype(int)

feature_columns = ['age', 'capital-gain', 'hours-per-week', 'sex-int', 'native-country-int']
features = income_data[feature_columns]

# Select the target as a 1-D Series rather than a one-column DataFrame:
# scikit-learn expects y of shape (n_samples,), and RandomForestClassifier.fit
# emits a DataConversionWarning when it is handed a column vector.
labels = income_data['income']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)

In [18]:
# Fit a random forest (seeded so repeated runs build the same trees) on the training split
forest = RandomForestClassifier(random_state=1)
forest.fit(X_train, y_train)

# Report the forest's accuracy on the held-out test rows
forest_accuracy = forest.score(X_test, y_test)
print(forest_accuracy)


  forest.fit(X_train, y_train)


0.8017810532780593


In [23]:
# Baseline comparison: a single decision tree on the same train/test split.
# NOTE: assigning to `tree` rebinds the name that `from sklearn import tree`
# introduced in the import cell, so the module is no longer reachable as `tree`.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)
tree_accuracy = tree.score(X_test, y_test)
print(tree_accuracy)

# Per-feature importances of the previously fitted random forest (not of this tree)
print(forest.feature_importances_)

0.8017810532780593
[0.35466488 0.37674332 0.26087984 0.         0.00771196]


In [28]:
# Refit a decision tree using only the three features with the highest
# random-forest importance ("sex-int" and "native-country-int" are dropped).
# NOTE: this cell rebinds features/labels and the train/test split, so re-running
# the earlier model cells afterwards would fit them on this reduced feature set.
features = income_data[['age', 'capital-gain', 'hours-per-week']]

# 1-D target of shape (n_samples,) instead of a one-column DataFrame, which is
# the shape scikit-learn estimators expect for a single-output classifier.
labels = income_data['income']

# Same test fraction and seed as the earlier split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=1)

new_tree = DecisionTreeClassifier(random_state=1)
new_tree.fit(X_train, y_train)
print(new_tree.score(X_test, y_test))


0.8043912175648703
