In [1]:
import pandas as pd
import numpy as np
import pickle
from os import chdir, getcwd
from pprint import pprint
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read in data
df = pd.read_csv('../data/adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
# Drop invalid or redundant variables
df = df.drop(['fnlwgt', 'educational-num'], axis=1)

In [8]:
# Replace the question marks with nan's from numpy
for col in df.columns:
    df = df.replace('?', np.nan)

In [9]:
# Fill nan's with the mode of each column
df = df.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [10]:
# Clean up the marital status random variable
df = df.replace(['Divorced', 'Married-AF-spouse',
            'Married-civ-spouse', 'Married-spouse-absent',
            'Never-married', 'Separated', 'Widowed'],
           ['divorced', 'married', 'married', 'married',
            'not married', 'not married', 'not married'])

In [11]:
# List the categorical columns
categorical_cols = ['workclass', 'race', 'education', 'marital-status', 'occupation',
                'relationship', 'gender', 'native-country', 'income']

In [12]:
# Initialize LabelEncoder() object
le = preprocessing.LabelEncoder()

In [13]:
# labelEncoder.classes_: This attribute contains the unique classes found in the column during the fitting process. 
# It represents the original categorical values.

# labelEncoder.transform(labelEncoder.classes_): This part transforms the unique classes into their corresponding numerical labels. 
# It essentially applies the label encoding transformation to the unique classes.

mapping_dict = {}
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])
    name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    mapping_dict[col] = name_mapping
pprint(mapping_dict)

{'education': {' 10th': 0,
               ' 11th': 1,
               ' 12th': 2,
               ' 1st-4th': 3,
               ' 5th-6th': 4,
               ' 7th-8th': 5,
               ' 9th': 6,
               ' Assoc-acdm': 7,
               ' Assoc-voc': 8,
               ' Bachelors': 9,
               ' Doctorate': 10,
               ' HS-grad': 11,
               ' Masters': 12,
               ' Preschool': 13,
               ' Prof-school': 14,
               ' Some-college': 15},
 'gender': {' Female': 0, ' Male': 1},
 'income': {' <=50K': 0, ' >50K': 1},
 'marital-status': {' Divorced': 0,
                    ' Married-AF-spouse': 1,
                    ' Married-civ-spouse': 2,
                    ' Married-spouse-absent': 3,
                    ' Never-married': 4,
                    ' Separated': 5,
                    ' Widowed': 6},
 'native-country': {' ?': 0,
                    ' Cambodia': 1,
                    ' Canada': 2,
                    ' China': 3,
       

In [14]:
# Create matrices
X = df.values[:, 0:12]
y = df.values[:, 12]

In [15]:
# Create train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# Create classifier
dtc = DecisionTreeClassifier(criterion='gini', random_state=42, max_depth=5, min_samples_leaf=5)

# Fit the classifier
dtc.fit(X_train, y_train)

In [17]:
# Predict test set
y_pred = dtc.predict(X_test)
print('Accuracy: ', accuracy_score(y_test, y_pred))

Accuracy:  0.831260555811454


In [44]:
# Save dataset as pickle
df.to_pickle(cwd + '\\adult.pkl')

In [45]:
# Save model as pickle
with open('model.pkl', 'wb') as file:
    pickle.dump(dtc, file)

In [None]:
# --- Go to Pycharm --- #