In [41]:
import pandas as pd
import numpy as np

In [2]:
# Relative directory that holds the dataset files; used as the default location by load_data.
data_path = 'datasets'

In [3]:
import os
def load_data(dat_path = data_path):
    """Read ``adult.csv`` from ``dat_path`` and return it as a DataFrame.

    The file is parsed as comma-separated text without a header row, so the
    column names listed below are assigned explicitly, in file order.

    Parameters
    ----------
    dat_path : str
        Directory containing ``adult.csv``. Defaults to the module-level
        ``data_path`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the 15 named columns.
    """
    column_names = [
        'age', 'wkclss', 'fnlwgt', 'educa', 'educa_n',
        'marit_st', 'occupa', 'relatnshp', 'race', 'sex',
        'cap_gn', 'cap_lss', 'hrwkwk', 'nativ_cntry', 'income',
    ]
    source_file = os.path.join(dat_path, 'adult.csv')
    return pd.read_csv(source_file, names=column_names, header=None, sep=',')

In [4]:
# Load the dataset from the default data directory.
adult_df = load_data()

In [5]:
# Preview the first 7 rows to sanity-check the assigned column names and parsing.
adult_df.head(7)

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K


In [6]:
# Double brackets select 'income' as a one-column DataFrame (2-D) rather than
# a Series; the printed type confirms this.
adult_df_cat = adult_df[['income']]
print(type(adult_df_cat))
adult_df_cat.head(10)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,income
0,<=50K
1,<=50K
2,>50K
3,>50K
4,<=50K
5,<=50K
6,<=50K
7,>50K
8,<=50K
9,<=50K


In [7]:
from sklearn.preprocessing import OrdinalEncoder
# Encode the income category as a number. In the displayed output
# '<=50K' becomes 0.0 and '>50K' becomes 1.0.
ord_encoder = OrdinalEncoder()
# fit_transform returns a 2-D float array with one column (see the output below).
adult_df_income_Ordi = ord_encoder.fit_transform(adult_df_cat)
adult_df_income_Ordi[:10]

array([[0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.]])

In [8]:
# Wrap the encoded array in a single-column DataFrame named 'num_income'.
num_income_df = pd.DataFrame(adult_df_income_Ordi, columns=['num_income'])
# Display (rows, columns) to confirm one encoded value per record.
num_income_df.shape

(48842, 1)

In [9]:
# Attach the encoded target as the 'num_income' column (index-aligned).
# assign() is used instead of pd.concat(..., axis=1) so that this cell is
# idempotent: re-running it overwrites 'num_income' rather than appending a
# second, duplicate column (which would make adult_df['num_income'] return a
# DataFrame and break the later label selection).
adult_df = adult_df.assign(num_income=num_income_df['num_income'])
adult_df.head()

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income,num_income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0.0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0.0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1.0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1.0
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0.0


In [10]:
# Weights for the capital terms. They equal the Pearson correlations of
# cap_gn and cap_lss with num_income shown in the correlation table this cell
# prints.
# NOTE(review): cap_lss is positively correlated with num_income yet its term
# is subtracted -- confirm the sign is intentional.
# NOTE(review): these weights come from the full dataset, i.e. before the
# train/test split, so they carry a little information from the test rows.
CAP_GN_WEIGHT = 0.223013
CAP_LSS_WEIGHT = 0.147554

# Engineered feature: hours/week * education level * age, adjusted by the
# weighted capital gain and capital loss.
adult_df['new_feature'] = (
    adult_df['hrwkwk'] * adult_df['educa_n'] * adult_df['age']
    + CAP_GN_WEIGHT * adult_df['cap_gn']
    - CAP_LSS_WEIGHT * adult_df['cap_lss']
)

# Correlate numeric columns only. adult_df also holds string columns, and
# DataFrame.corr() on those raises in pandas >= 2.0 (older versions silently
# dropped them), so select the numeric columns explicitly.
corr_matrix = adult_df.select_dtypes(include='number').corr()
corr_matrix['num_income'].sort_values(ascending=False)

num_income     1.000000
new_feature    0.450264
educa_n        0.332613
age            0.230369
hrwkwk         0.227687
cap_gn         0.223013
cap_lss        0.147554
fnlwgt        -0.006339
Name: num_income, dtype: float64

In [11]:
# Preview again to confirm the num_income and new_feature columns are present.
adult_df.head()

Unnamed: 0,age,wkclss,fnlwgt,educa,educa_n,marit_st,occupa,relatnshp,race,sex,cap_gn,cap_lss,hrwkwk,nativ_cntry,income,num_income,new_feature
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K,0.0,7000.0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K,0.0,17100.0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K,1.0,13440.0
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K,1.0,19314.523944
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K,0.0,5400.0


In [12]:
# Simple random train/test split: 20% of the rows are held out for testing.

# A stratified split on the target would be the more rigorous alternative.

# Kept simple on purpose: the goal here is to show how the ML pipeline works, not to tune the model.

from sklearn.model_selection import train_test_split

# random_state is fixed so the same split is produced on every run.
train_set, test_set = train_test_split(adult_df, test_size=0.2, random_state=77)

In [13]:
# Columns used as model inputs (selected from train_set / test_set below).
cal_attributes = ['new_feature', 'educa_n', 'age', 'hrwkwk']

In [14]:
# Separate each partition into model inputs (the cal_attributes columns)
# and the target column 'num_income'.
train_features, train_labels = train_set[cal_attributes], train_set['num_income']
test_features, test_labels = test_set[cal_attributes], test_set['num_income']

In [18]:
# Peek at the training inputs; the index keeps the original (shuffled) row labels.
train_features.head()

Unnamed: 0,new_feature,educa_n,age,hrwkwk
26662,15200.0,10,38,40
17455,16920.0,9,47,40
30387,1530.0,6,17,15
8764,11520.0,10,24,48
46402,13320.0,9,37,40


In [21]:
# Peek at the matching training targets (same row labels as train_features).
train_labels.head(7)

26662    0.0
17455    0.0
30387    0.0
8764     0.0
46402    0.0
35831    1.0
33838    0.0
Name: num_income, dtype: float64

In [15]:
# This is a binary classification problem: predict num_income, which takes the values 0 and 1.
# 0: income <= $50K/year
# 1: income >  $50K/year

In [16]:
from sklearn.linear_model import LogisticRegression
# Pin the solver explicitly. The recorded repr below shows solver='warn',
# i.e. the old implicit default (liblinear, with a FutureWarning). Newer
# scikit-learn releases default to lbfgs, which gives different results on
# these unscaled features, so naming the solver keeps the fit reproducible
# across versions and silences the warning.
clf = LogisticRegression(solver='liblinear')
# fit() returns the estimator, so the cell displays the fitted model's parameters.
clf.fit(train_features, train_labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# Predicted class (0.0 or 1.0) for the first 15 training rows.
clf.predict(train_features[:15])

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 0., 0.])

In [35]:
# Per-class probabilities for the same 15 rows; each row sums to 1 and the
# columns follow the class order (class 0 first, then class 1).
clf.predict_proba(train_features[:15])

array([[0.81709177, 0.18290823],
       [0.80758273, 0.19241727],
       [0.85218936, 0.14781064],
       [0.89306281, 0.10693719],
       [0.85405793, 0.14594207],
       [0.78634607, 0.21365393],
       [0.36524133, 0.63475867],
       [0.82643582, 0.17356418],
       [0.89777221, 0.10222779],
       [0.29163347, 0.70836653],
       [0.07332022, 0.92667978],
       [0.25699471, 0.74300529],
       [0.75223118, 0.24776882],
       [0.84861754, 0.15138246],
       [0.90397997, 0.09602003]])

In [36]:
# True labels for the same 15 rows, for side-by-side comparison with the predictions.
train_labels[:15]

26662    0.0
17455    0.0
30387    0.0
8764     0.0
46402    0.0
35831    1.0
33838    0.0
43287    0.0
18885    0.0
20281    0.0
47872    1.0
3510     1.0
12406    0.0
47688    0.0
37338    0.0
Name: num_income, dtype: float64

In [None]:
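# Pretty good: the predictions match the true labels on 12 of these 15 training rows
# (a quick sanity check only, not an evaluation).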

In [37]:
# Evaluate on the held-out test set: predict a class for every test row.

pred_labels = clf.predict(test_features)

In [79]:
# np.where on the boolean match mask returns a 1-tuple holding the positional
# indices of the correctly classified test rows. Despite the name, this is a
# set of indices, not a count.
num_correctly_clf = np.where(pred_labels == test_labels)
num_correctly_clf

(array([   0,    1,    2, ..., 9766, 9767, 9768]),)

In [80]:
# NumPy array of the test labels, indexable by position (the Series keeps the
# shuffled row labels as its index).
test_labels_arr = np.array(test_labels)

In [83]:
# Test-set accuracy in percent: the share of test rows whose predicted label
# equals the true label. The count is taken directly from the boolean match
# mask, so this cell only needs pred_labels and test_labels_arr and does not
# depend on an index tuple built in another cell.
n_correct = int(np.count_nonzero(pred_labels == test_labels_arr))
eff = 100 * n_correct / len(test_labels_arr)
eff

# NOTE(review): the label previews earlier in the notebook are mostly 0, so
# the classes look imbalanced. Compare this accuracy with the majority-class
# baseline (e.g. 100 * (1 - test_labels_arr.mean())) before calling it good.

80.13102671716655