In [1]:
import pandas as pd
import numpy as np
import os

## Define variables and functions

In [2]:
data_path = 'datasets'

In [3]:
def load_data(dat_path = data_path):
    
    csv_path = os.path.join(dat_path, 'adult.csv')
    features = ['age', 'wkclss', 'fnlwgt', 'educa', 'educa_n', 'marit_st', 'occupa', 'relatnshp', 'race', 'sex', 'cap_gn', 'cap_lss', 'hrwkwk', 'nativ_cntry', 'income']
    return pd.read_csv(csv_path, header = None, sep = ',', names=features)

## Visualize & Clean data

In [4]:
adult_df = load_data()
adult_df.head(7)

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K


In [5]:
adult_df_cat = adult_df[['income']]
print(type(adult_df_cat))
adult_df_cat.head(10)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,income
0,<=50K
1,<=50K
2,>50K
3,>50K
4,<=50K
5,<=50K
6,<=50K
7,>50K
8,<=50K
9,<=50K


In [6]:
from sklearn.preprocessing import OrdinalEncoder


In [7]:
ord_encoder = OrdinalEncoder()
adult_df_income_Ordi = ord_encoder.fit_transform(adult_df_cat)
adult_df_income_Ordi[:10]

array([[0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [8]:
num_income_df = pd.DataFrame(adult_df_income_Ordi, columns=['num_income'])
num_income_df.shape

(48842, 1)

In [9]:
adult_df = pd.concat([adult_df, num_income_df], sort=True, axis=1)
adult_df.head()

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income,num_income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0.0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0.0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1.0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1.0
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0.0


In [10]:
adult_df['new_feature'] = (adult_df['hrwkwk'])*(adult_df['educa_n'])*adult_df['age'] + 0.223013*adult_df['cap_gn'] - 0.147554*adult_df['cap_lss']
corr_matrix = adult_df.corr(numeric_only=True)
corr_matrix['num_income'].sort_values(ascending=False)

num_income     1.000000
new_feature    0.450264
educa_n        0.332613
age            0.230369
hrwkwk         0.227687
cap_gn         0.223013
cap_lss        0.147554
fnlwgt        -0.006339
Name: num_income, dtype: float64

In [11]:
adult_df.head()

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income,num_income,new_feature
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0.0,7000.0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0.0,17100.0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1.0,13440.0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1.0,19314.523944
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0.0,5400.0


## Train model

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


In [14]:
# Do simple train test split

# an alternative proper way to split data is to base on stratified sample train test split

# for now just make things simple (we don't intentionally focus on model. just to show how the ML pipeline works)


train_set, test_set = train_test_split(adult_df, test_size=0.2, random_state=77)

In [15]:
cal_attributes = ['new_feature', 'educa_n', 'age', 'hrwkwk']
#cal_attributes = ['new_feature']

In [16]:
train_labels = train_set['num_income']
train_features = train_set[cal_attributes]

test_labels = test_set['num_income']
test_features = test_set[cal_attributes]

In [17]:
train_features.head()

Unnamed: 0,new_feature,educa_n,age,hrwkwk
26662,15200.0,10,38,40
17455,16920.0,9,47,40
30387,1530.0,6,17,15
8764,11520.0,10,24,48
46402,13320.0,9,37,40


In [18]:
train_labels.head(7)

26662    0.0
17455    0.0
30387    0.0
8764     0.0
46402    0.0
35831    1.0
33838    0.0
Name: num_income, dtype: float64

In [19]:
# it's a binary classification problem
# we want to classify 2 classes  1/0 from num_income
# 0:income <= 50k$/year
# 1: income > 50k$/year

In [22]:
clf = LogisticRegression()
clf.fit(train_features, train_labels)

In [23]:
clf.predict(train_features[:15])

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0.])

In [24]:
clf.predict_proba(train_features[:15])

array([[0.82674886, 0.17325114],
       [0.81646413, 0.18353587],
       [0.97826071, 0.02173929],
       [0.8821026 , 0.1178974 ],
       [0.87187081, 0.12812919],
       [0.7819858 , 0.2180142 ],
       [0.37958179, 0.62041821],
       [0.82426084, 0.17573916],
       [0.97418886, 0.02581114],
       [0.31635172, 0.68364828],
       [0.10670019, 0.89329981],
       [0.29338471, 0.70661529],
       [0.71047364, 0.28952636],
       [0.80339975, 0.19660025],
       [0.93906863, 0.06093137]])

In [25]:
train_labels[:15]

26662    0.0
17455    0.0
30387    0.0
8764     0.0
46402    0.0
35831    1.0
33838    0.0
43287    0.0
18885    0.0
20281    0.0
47872    1.0
3510     1.0
12406    0.0
47688    0.0
37338    0.0
Name: num_income, dtype: float64

In [26]:
# pretty good!

In [27]:
# try to evaluate model with test data

pred_labels = clf.predict(test_features)

In [28]:
num_correctly_clf = np.where(pred_labels == test_labels)
num_correctly_clf

(array([   0,    1,    2, ..., 9766, 9767, 9768]),)

In [29]:
test_labels_arr = np.array(test_labels)

In [30]:
eff = 100*len(test_labels_arr[num_correctly_clf])/len(test_labels)
eff

#pretty good

79.93653393387245

In [31]:
cap_gn = 0
cap_lss = 0

a = 84
b = 24
c = 15
d = a*b*c + 0.223013*cap_gn - 0.147554*cap_lss

X_input = [[d, c, b, a]]
print(X_input)

[[30240.0, 15, 24, 84]]


In [32]:
test_pred_labels = clf.predict(X_input)



In [33]:
test_pred_labels

array([1.])

In [34]:
test_features.head(10)

Unnamed: 0,new_feature,educa_n,age,hrwkwk
40869,11700.0,9,52,25
10887,9660.0,12,23,35
44570,13320.0,9,37,40
26772,7200.0,10,30,24
30951,7326.0,9,22,37
47503,12000.0,10,30,40
48558,20250.0,9,90,25
47783,37938.135018,11,42,80
37853,17160.0,13,33,40
34301,22050.0,14,45,35


In [35]:
test_labels.head(10)

40869    0.0
10887    0.0
44570    0.0
26772    0.0
30951    0.0
47503    0.0
48558    0.0
47783    1.0
37853    0.0
34301    0.0
Name: num_income, dtype: float64