# Case Study: Predicting a person's income level

Source: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Adult)

Data extracted from a 1994 census.

\$50,000 in 1994 is [approximately equivalent to](https://www.aier.org/cost-of-living-calculator/) \$100,000 in 2023.

## Importing the required packages

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add a Dartmouth-y color theme
import matplotlib as mpl
# Palette for the Dartmouth-style theme; matplotlib cycles through these
# colors, in this order, for successive plot elements.
dartmouth_colors = [
    "#00693E",
    "#12312B",
    "#C3DD88",
    "#6EAA8D",
    "#797979",
    "#EBF3EF",
]

# Global matplotlib defaults applied to every figure created after this cell.
dartmouth_theme = {
    "figure.facecolor": "#EBF3EF",
    "figure.figsize": [7.50, 3.50],
    "axes.prop_cycle": mpl.cycler(color=dartmouth_colors),
    "axes.facecolor": "#FFFFFF",
    "axes.labelcolor": "#12312B",
    "text.color": "#12312B",
}
mpl.rcParams.update(dartmouth_theme)

%config InlineBackend.figure_formats = ['svg']


## Loading the dataset

In [None]:
# Column names in file order; the raw files are read without a header row.
names = ['age', 'workclass', 'fnlwgt',
         'education', 'education_num', 'marital_status',
         'occupation', 'relationship', 'race', 'sex',
         'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
         'income_level']

# Columns that are removed right after loading and never reach the model.
unused_columns = ['education_num', 'fnlwgt', 'capital_gain', 'capital_loss']


def _read_adult(path, **read_csv_options):
    """Read one Adult data file and return a cleaned DataFrame.

    Parameters
    ----------
    path : str
        Location of the comma-separated file.
    **read_csv_options
        Extra keyword arguments forwarded to ``pd.read_csv``
        (e.g. ``skiprows``).

    Returns
    -------
    pandas.DataFrame
        Frame with the columns in ``names`` minus ``unused_columns``.
    """
    frame = pd.read_csv(
        path,
        header=None,
        names=names,
        # Drop the blank that follows each comma so categorical values and
        # labels do not carry a leading space.
        skipinitialspace=True,
        **read_csv_options,
    )
    frame = frame.drop(columns=unused_columns)
    # Remove any '.' from the labels so both files spell the classes the same
    # way (presumably the test file's labels end with a period -- verify).
    frame['income_level'] = frame['income_level'].str.replace('.', '', regex=False)
    return frame


train_df = _read_adult('../data/adult.data')
test_df = _read_adult('../data/adult.test', skiprows=1)

for df in (train_df, test_df):
    # Fixed seed: the preview rows stay the same on every Restart & Run All.
    display(df.sample(5, random_state=0))

## Encoding the categorical variables

In [None]:
# Nominal columns that are one-hot encoded. `education` is not in this list;
# it is only whitespace-stripped here and encoded as an ordered category later.
categorical_columns = ['workclass', 'marital_status', 'occupation',
                       'relationship', 'race', 'sex', 'native_country']

train_df = pd.get_dummies(train_df, columns=categorical_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns)

# Each split is encoded on its own, so a category value that occurs in only one
# file produces a dummy column in only one frame. Align the test frame to the
# training columns: dummies missing from test are added as all-zero columns,
# and dummies unknown to the training data are dropped.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)

train_df.education = train_df.education.str.strip()
test_df.education = test_df.education.str.strip()

In [None]:
from pandas.api.types import CategoricalDtype

# Ordered scale for `education`, lowest to highest; a label's position in this
# list becomes its integer code.
# NOTE(review): 'Some-college' is ranked above both associate degrees here --
# confirm that this ordering is intended.
education_levels = CategoricalDtype(
    categories=['Preschool',  '1st-4th', '5th-6th', '7th-8th', '9th',
                '10th', '11th', '12th', 'HS-grad',
                'Assoc-voc', 'Assoc-acdm',
                'Some-college', 'Bachelors', 'Masters',
                'Prof-school', 'Doctorate'],
    ordered=True
    )

# The loop variable is kept as `df` because later cells read it after the loop.
for df in (train_df, test_df):
    education_codes = df.education.astype(education_levels).cat.codes
    # `.cat.codes` silently returns -1 for any value that is not one of the
    # categories (also when this cell is re-run on already-encoded data).
    # Fail fast instead of feeding -1 to the model.
    unmapped = df.education[education_codes == -1].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unmapped education values: {list(unmapped)}")
    df.education = education_codes

In [None]:
target = 'income_level'

# Build the feature list from the two frames explicitly instead of relying on
# a loop variable left over from an earlier cell. Only columns present in both
# frames are kept (in training-column order), so selecting `features` works on
# either frame.
features = [
    column
    for column in train_df.columns
    if column != target and column in test_df.columns
]

# Despite the X_ prefix, both frames still contain the target column;
# later cells select from them with `[features]` and `[target]`.
X_train = train_df
X_test = test_df


## Train the model

In [None]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Tree depths compared by the grid search.
candidates = {
    'max_depth': [3, 5, 6, 7, 8],
}

# A fixed random_state pins the estimator's internal feature shuffling, so a
# fresh Restart & Run All fits the same tree every time.
base_tree = DecisionTreeClassifier(class_weight='balanced', random_state=0)

# Cross-validated search over `candidates`; `.fit` returns the fitted search
# object, which later cells use via `dt.best_estimator_` and `dt.predict`.
dt = GridSearchCV(
    base_tree,
    param_grid=candidates,
    n_jobs=5,
    verbose=3,
).fit(X_train[features], X_train[target])

print(dt.best_params_)

## Interpret the model

In [None]:
from sklearn.tree import plot_tree

# plot_tree expects `class_names` in ascending class order, which is the order
# of the fitted estimator's `classes_`. Order of appearance in a data frame
# only matches that by coincidence.
fitted_tree = dt.best_estimator_
class_labels = [str(label) for label in fitted_tree.classes_]

plt.figure(figsize=(65, 15))  # large canvas for the tree diagram
plot_tree(fitted_tree,
          feature_names=features, class_names=class_labels,
          filled=True, fontsize=10);

## Evaluate the model's performance

In [None]:
# Compare the true test labels with the grid search's predictions.
y_test_true = X_test[target]
y_test_pred = dt.predict(X_test[features])
print(classification_report(y_test_true, y_test_pred))

<table >
<tbody>
  <tr>
    <td style="padding:0px;border-width:0px;vertical-align:middle">
    Created by Simon Stone for Dartmouth College Library under <a href="https://creativecommons.org/licenses/by/4.0/">Creative Commons CC BY-NC 4.0 License</a>.<br>For questions, comments, or improvements, email <a href="mailto:researchdatahelp@groups.dartmouth.edu">Research Data Services</a>.
    </td>
    <td style="padding:0 0 0 1em;border-width:0px;vertical-align:middle"><img alt="Creative Commons License" src="https://i.creativecommons.org/l/by/4.0/88x31.png"/></td>
  </tr>
</tbody>
</table>
