# Random Forest Classifier for Income

#### Value: Informs clients about the factors most likely to increase their income.
#### Purpose: Economic growth
#### Goal: Identify the factors that predict whether an individual earns over $50,000 a year (about $109,000 in 2025 dollars)
#### Subject Area: Business
#### Features: 14
#### Data: Census data from UCI’s Machine Learning Repository.

* `adult.data`: the census file loaded below; includes data on capital gains, education, age, work hours per week, capital loss, and more.

https://archive.ics.uci.edu/dataset/20/census+income 

### Extract, Transform, and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, RandomForestRegressor
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Column labels supplied by hand; they are applied to the file's columns in
# this order.
col_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]
# header=None makes pandas treat the first line of the file as data rather
# than as column labels.
df = pd.read_csv('adult.data', names=col_names, header=None)

In [None]:
# Exploratory data analysis: preview the first rows, list column dtypes and
# non-null counts, then show summary statistics.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
df.info()
print(df.describe())

In [None]:
# Class balance of the target: fraction of rows in each income category.
income_share = df['income'].value_counts(normalize=True)
print(income_share)

### Data Cleaning

In [None]:
# Trim leading/trailing whitespace from every column of dtype "object" so
# that later string comparisons (e.g. against '<=50K') match exactly.
text_columns = df.select_dtypes(include=['object']).columns
for column_name in text_columns:
    df[column_name] = df[column_name].str.strip()

# Predictors used for the first round of models.
feature_cols = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'sex',
    'race',
]

In [None]:
# Build the feature matrix X: take the selected columns and one-hot encode
# the non-numeric ones. drop_first=True drops the first level of each encoded
# column, leaving k-1 indicator columns for k levels.
selected_features = df[feature_cols]
X = pd.get_dummies(selected_features, drop_first=True)


In [None]:
# Binary target y: 0 where income is exactly '<=50K', 1 for every other value
# (i.e. the over-50K class).
is_low_income = df['income'] == '<=50K'
y = np.where(is_low_income, 0, 1)

In [None]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=1,
)

### Build and Tune Random Forest Classifiers by Depth 
First, a random forest with default parameters is fit as a baseline against which the tuned models are compared.

In [None]:
# Baseline: random forest with default hyperparameters, fit on the training
# split and scored (mean accuracy) on the test split.
# random_state is pinned so the baseline score is reproducible; without it
# every run of this cell grows a different forest and reports a different
# number, while the rest of the notebook fixes its randomness.
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

In [None]:
# Sweep max_depth over 1..25 and record train and test accuracy per depth.
# The classifiers are built without an explicit random_state, so seeding
# NumPy's global RNG here is what makes the sweep reproducible.
# NOTE(review): the depth is later chosen by test-set accuracy, so the
# reported best test accuracy is optimistic; a validation split or
# cross-validation would give an unbiased estimate.
np.random.seed(0)
accuracy_train = []
accuracy_test = []
depths = range(1, 26)
for depth in depths:
    rf = RandomForestClassifier(max_depth=depth)
    rf.fit(x_train, y_train)
    # Predict the test set once and reuse the result; the original computed
    # y_pred and then predicted the test set a second time for scoring.
    y_pred = rf.predict(x_test)
    accuracy_test.append(accuracy_score(y_test, y_pred))
    accuracy_train.append(accuracy_score(y_train, rf.predict(x_train)))


In [None]:
# Locate the depth with the highest test accuracy (first one on ties, since
# argmax returns the first maximal position) and report both values.
test_scores = np.asarray(accuracy_test)
best_position = test_scores.argmax()
best_acc = test_scores[best_position]
best_depth = depths[best_position]
best_acc_pct = round(best_acc * 100, 3)
print(f'The highest accuracy on the test is achieved when depth: {best_depth}')
print(f'The highest accuracy on the test set is: {best_acc_pct}%')

In [None]:
# Accuracy vs. max_depth: test scores as blue dashed circles, train scores as
# red dotted stars. The legend entries follow the order the lines are drawn.
plt.plot(depths, accuracy_test, 'bo--')
plt.plot(depths, accuracy_train, 'r*:')
plt.legend(['test accuracy', 'train accuracy'])
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Refit a forest at the best depth found above and tabulate its feature
# importances.
# random_state is pinned so the refit (and therefore the importance ranking)
# is reproducible; unseeded, re-running this cell would grow a different
# forest each time.
best_rf = RandomForestClassifier(max_depth=best_depth, random_state=0)
best_rf.fit(x_train, y_train)
feature_imp_df = pd.DataFrame({
    'feature': x_train.columns,
    'importance': best_rf.feature_importances_,
})
print('Top 5 random forest features:')
print(feature_imp_df.sort_values('importance', ascending=False).head(5))


### Create Additional Features and Re-Tune

In [None]:
# Derive a coarse education feature from 'education-num'. pd.cut uses
# right-closed intervals by default, so the three bands are
# (0, 9], (9, 13] and (13, 16].
education_edges = [0, 9, 13, 16]
education_labels = ['HS or less', 'College to Bachelors', 'Masters or more']
df['education_bin'] = pd.cut(
    df['education-num'],
    education_edges,
    labels=education_labels,
)

# Same predictors as before, plus the new education band.
feature_cols = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'sex',
    'race',
    'education_bin',
]

In [None]:
# Rebuild the one-hot encoded feature matrix from the current feature_cols
# and redo the 80/20 split with the same random_state as before.
selected_features = df[feature_cols]
X = pd.get_dummies(selected_features, drop_first=True)

x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=1,
)

In [None]:
# Repeat the max_depth sweep (1..25) on the current train/test split, which
# now includes the added education_bin feature.
# The classifiers are built without an explicit random_state, so seeding
# NumPy's global RNG here is what makes the sweep reproducible.
# NOTE(review): the depth is later chosen by test-set accuracy, so the
# reported best test accuracy is optimistic; a validation split or
# cross-validation would give an unbiased estimate.
np.random.seed(0)
accuracy_train = []
accuracy_test = []
depths = range(1, 26)
for depth in depths:
    rf = RandomForestClassifier(max_depth=depth)
    rf.fit(x_train, y_train)
    # Predict the test set once and reuse the result; the original computed
    # y_pred and then predicted the test set a second time for scoring.
    y_pred = rf.predict(x_test)
    accuracy_test.append(accuracy_score(y_test, y_pred))
    accuracy_train.append(accuracy_score(y_train, rf.predict(x_train)))
    



In [None]:

# Report the highest test accuracy of the latest sweep and the depth at which
# it occurs (first one on ties, since argmax returns the first maximal
# position), for comparison with the earlier sweep.
test_scores = np.asarray(accuracy_test)
best_position = test_scores.argmax()
best_acc = test_scores[best_position]
best_depth = depths[best_position]
best_acc_pct = round(best_acc * 100, 3)
print(f'The highest accuracy on the test is achieved when depth is: {best_depth}')
print(f'The highest accuracy on the test set is: {best_acc_pct}%')


In [None]:

# Accuracy vs. max_depth for the latest sweep, drawn on figure number 2.
# Test scores are blue dashed circles, train scores red dotted stars; the
# legend entries follow the order the lines are drawn.
plt.figure(2)
plt.plot(depths, accuracy_test, 'bo--')
plt.plot(depths, accuracy_train, 'r*:')
plt.legend(['test accuracy', 'train accuracy'])
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.show()

In [None]:
# Refit a forest at the best depth from the latest sweep, tabulate its
# feature importances and print the five largest, for comparison with the
# earlier model.
# random_state is pinned so the refit (and therefore the importance ranking)
# is reproducible; unseeded, re-running this cell would grow a different
# forest each time.
best_rf = RandomForestClassifier(max_depth=best_depth, random_state=0)
best_rf.fit(x_train, y_train)
feature_imp_df = pd.DataFrame({
    'feature': x_train.columns,
    'importance': best_rf.feature_importances_,
})
print('Top 5 random forest features:')
print(feature_imp_df.sort_values('importance', ascending=False).head(5))
