In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects

In [7]:
# Load the dataset from the working directory. skipinitialspace tells the
# parser to skip spaces that follow a delimiter, so text fields come back
# without leading blanks.
df = pd.read_csv(
    'adult.csv',
    skipinitialspace=True,
)

In [8]:
# Preview the first five rows to check that the columns were parsed as expected
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [10]:
# Every column except the 'income' label is used as a model input
x_cols = df.columns[df.columns != 'income'].tolist()
# Feature matrix (X) and target vector (y)
X, y = df[x_cols], df['income']

In [14]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=800,
)

In [20]:
# Convert each categorical feature to integer codes with its own LabelEncoder.
# The fitted encoders are collected in `encoders`, keyed by column name, so the
# same string -> code mapping can be persisted and reused.
cat_columns = ['workclass','education','marital-status','occupation','relationship','race','gender','native-country']

# X_train was produced by train_test_split from a selection of `df`; take an
# explicit copy so the column assignments below write to an independent
# DataFrame instead of triggering pandas' SettingWithCopyWarning.
X_train = X_train.copy()

encoders = {}
for column in cat_columns:
    categorical_convert = LabelEncoder()
    # fit_transform learns the set of labels in this column and returns their codes
    X_train[column] = categorical_convert.fit_transform(X_train[column])
    encoders[column] = categorical_convert

# NOTE: this cell overwrites the string columns of X_train with integer codes.
# Re-running it without re-running the split would refit the encoders on those
# codes rather than on the original labels.

In [32]:
# List the distinct target labels present in the training split
pd.unique(y_train)

array(['<=50K', '>50K'], dtype=object)

In [33]:
# Train the RandomForestClassifier.
# random_state is fixed (same seed as the train/test split) so the forest's
# bootstrap sampling and per-split feature sampling are repeatable; without it
# every run of this cell would fit, and later save, a different model.
rf = RandomForestClassifier(n_estimators=100, random_state=800)
rf = rf.fit(X_train, y_train)

In [34]:
# Train the ExtraTreesClassifier.
# random_state is fixed (same seed as the train/test split) so the randomised
# split selection is repeatable; without it every run of this cell would fit,
# and later save, a different model.
et = ExtraTreesClassifier(random_state=800)
et = et.fit(X_train, y_train)

In [35]:
# Persist both fitted classifiers (random forest and extra trees) to
# compressed joblib files in the current directory.
joblib.dump(value=rf, filename="./random_forest.joblib", compress=True)
joblib.dump(value=et, filename="./extra_trees.joblib", compress=True)

['./extra_trees.joblib']

In [36]:
# Persist the per-column LabelEncoders (dict keyed by column name) so the
# same categorical encoding can be loaded again alongside the models.
joblib.dump(value=encoders, filename='./encoders.joblib', compress=True)

['./encoders.joblib']