In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

columns = ['age', 'workclass', 'fnlwgt', 'education', 
           'educational-num','marital-status', 'occupation', 'relationship', 
           'race', 'gender','capital-gain', 'capital-loss', 'hours-per-week', 
           'native-country','income']

train = pd.read_csv('../data/adult_data.txt', sep=',\s', header=None, names=columns)
test = pd.read_csv('../data/adult_test.txt', sep=',\s', header=None, names=columns)

  del sys.path[0]
  


In [2]:
test.drop(index=0, inplace=True)
test['income'].replace(regex=True, inplace=True, to_replace=r'\.', value=r'')
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [4]:
adult_df = pd.concat([test,train])
adult_df.reset_index(inplace=True, drop=True)

In [5]:
def convert_dtype(df):
    for col in set(df.columns) - set(df.describe().columns):
        df[col] = df[col].astype('category')
    print(df.info())

def remove_na(df):
    for i,j in zip(df.columns,(df.values.astype(str) == '?').sum(axis=0)):
        if j > 0:
            print(f'{str(i)}: {str(j)} missing reconds.')
            df[i].replace('?', np.nan, inplace=True)
    print(f'Before dropping NA values, the DataFrame has {len(df.index)} records.')
    df.dropna(inplace=True) 
    print(f'After dropping NA values, the DataFrame has {len(df.index)} records.')

In [6]:
convert_dtype(adult_df)
remove_na(adult_df)

# convert_dtype(train)
# remove_na(train)

# convert_dtype(test)
# remove_na(test)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   age              48842 non-null  category
 1   workclass        48842 non-null  category
 2   fnlwgt           48842 non-null  float64 
 3   education        48842 non-null  category
 4   educational-num  48842 non-null  float64 
 5   marital-status   48842 non-null  category
 6   occupation       48842 non-null  category
 7   relationship     48842 non-null  category
 8   race             48842 non-null  category
 9   gender           48842 non-null  category
 10  capital-gain     48842 non-null  float64 
 11  capital-loss     48842 non-null  float64 
 12  hours-per-week   48842 non-null  float64 
 13  native-country   48842 non-null  category
 14  income           48842 non-null  category
dtypes: category(10), float64(5)
memory usage: 2.4 MB
None
workclass: 2799 missing reconds.


In [7]:
def to_one_hot(df):
    # convert categorical variables to one-hot encodings
    df_cat_oh = pd.get_dummies(df.select_dtypes('category'))
    df_non_cat = df.select_dtypes(exclude = 'category')
    
    df_oh = pd.concat([df_non_cat, df_cat_oh], axis=1, join='inner')
    return df_oh
    
    

In [8]:
exclude_columns = ['income', 'race', 'gender']

features = adult_df.drop(columns=exclude_columns)
data = to_one_hot(features)
labels = adult_df['income']

train_data, test_data, train_label, test_label = train_test_split(data, labels, test_size=0.2)

# train_features = train.drop(columns=exclude_columns)
# train_data = to_one_hot(train_features)
# train_label = train.income

# test_features = test.drop(columns=exclude_columns)
# test_data = to_one_hot(test_features)
# test_label = test.income

In [9]:
def evaluate_model(actual, predictions):
    confusion = pd.crosstab(actual, predictions, rownames=['Actual'], colnames=['Predicted'])
    TP = confusion.loc['>50K','>50K']
    TN = confusion.loc['<=50K','<=50K']
    FP = confusion.loc['<=50K','>50K']
    FN = confusion.loc['>50K','<=50K']

    accuracy = ((TP+TN))/(TP+FN+FP+TN)
    precision = (TP)/(TP+FP)
    recall = (TP)/(TP+FN)
    f_measure = (2*recall*precision)/(recall+precision)
    sensitivity = TP / (TP + FN)
    specificity = TN / (TN + FP)
    error_rate = 1 - accuracy
    
    out = {}
    out['accuracy'] =  accuracy
    out['precision'] = precision
    out['recall'] = recall
    out['f_measure'] = f_measure
    out['sensitivity'] = sensitivity
    out['specificity'] = specificity
    out['error_rate'] = error_rate
    
    return out

In [10]:
# train model
forest_model = RandomForestClassifier(n_estimators=100, criterion='gini', max_features=None,
                                     min_impurity_split=0.05, min_samples_leaf=0.001)
forest_model.fit(train_data, train_label)
forest_model_predictions = forest_model.predict(test_data)
forest_model_result = evaluate_model(test_label, forest_model_predictions)

forest_result_df = pd.DataFrame([forest_model_result])
forest_result_df.head()











Unnamed: 0,accuracy,precision,recall,f_measure,sensitivity,specificity,error_rate
0,0.84942,0.765376,0.585876,0.663704,0.585876,0.938972,0.15058
