Objective : To extract occupation & gender information and predict income range

Highlights:
* Categorical data is encoded into numeric form as input to the ML algorithm
    * Nominal : pd.get_dummies / sklearn.preprocessing.OneHotEncoder
    * Ordinal : sklearn.preprocessing.OrdinalEncoder for features (sklearn.preprocessing.LabelEncoder is intended for encoding the target y)
    * ref: https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

# Import Module 

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Import Data & Pre-Process

In [2]:
!wget https://raw.githubusercontent.com/AlvinChiew/MachineLearning/main/raw_data/adults.csv

--2021-01-06 16:53:35--  https://raw.githubusercontent.com/AlvinChiew/MachineLearning/main/raw_data/adults.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974478 (3.8M) [text/plain]
Saving to: ‘adults.csv.1’


2021-01-06 16:53:35 (48.2 MB/s) - ‘adults.csv.1’ saved [3974478/3974478]



In [3]:
# Load the downloaded CSV into a DataFrame.
#   index_col=False      -> do not use the first column as the index
#   skipinitialspace=True -> drop spaces that follow a delimiter
df = pd.read_csv(
    "adults.csv",
    index_col=False,
    skipinitialspace=True,
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,maritial_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# Keep only the two predictors (occupation, gender) and the target (income).
selected_columns = ['occupation', 'gender', 'income']
data = df[selected_columns]
data.head()

Unnamed: 0,occupation,gender,income
0,Adm-clerical,Male,<=50K
1,Exec-managerial,Male,<=50K
2,Handlers-cleaners,Male,<=50K
3,Handlers-cleaners,Male,<=50K
4,Prof-specialty,Female,<=50K


In [5]:
# Expand every categorical column (occupation, gender and the income target)
# into one indicator column per distinct value, named "<column>_<value>".
# Note that the target is encoded too, so it becomes two complementary columns.
data = pd.get_dummies(data)     # One-Hot-Encoding
data.head()

Unnamed: 0,occupation_?,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,gender_Female,gender_Male,income_<=50K,income_>50K
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0


In [6]:
# Select features and target by column name instead of by position, so the
# split stays correct even if the encoded column order or count changes.
feature_columns = [col for col in data.columns if not col.startswith('income_')]
X = data[feature_columns].values    # occupation_* and gender_* indicator columns
y = data['income_>50K']             # indicator column for income >50K
print(f"X shape : {X.shape}")
print(f"y shape : {y.shape}")

# 17 features, 32561 records

X shape : (32561, 17)
y shape : (32561,)


In [7]:
# Hold out a test set. No test_size is given, so scikit-learn's default split
# applies (25% test); random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Train Model & result

In [8]:
# Build a logistic-regression classifier with default settings, then fit it
# on the training split.
model = LogisticRegression()
model.fit(X_train, y_train)

In [9]:
# Mean accuracy on the data the model was fitted on vs. the held-out split.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score : {train_score}')
print(f'Test score : {test_score}')
# Train and test accuracy are close, so the model does not look overfitted.
# More features (dimensions) are needed to improve the accuracy.
# NOTE(review): compare against the majority-class baseline before judging
# how much these two features actually contribute - TODO confirm.

Train score : 0.7857493857493858
Test score : 0.7768087458543177


# Extra: Incorrect Encoding

In [10]:
# Toy frame: 'checkbox' stores category codes as integers, 'color' stores strings.
data_dummy = pd.DataFrame(
    {
        'checkbox': [0, 1, 0, 2],
        'color': ['white', 'white', 'blue', 'black'],
    }
)

# With default arguments get_dummies() only encodes object/string/category
# columns, so the integer 'checkbox' column passes through unchanged: it is
# treated as a plain number rather than as a numeric category.
data_dummy_wrong = pd.get_dummies(data_dummy)
data_dummy_wrong

Unnamed: 0,checkbox,color_black,color_blue,color_white
0,0,0,0,1
1,1,0,0,1
2,0,0,1,0
3,2,1,0,0


In [11]:
# Fix: cast 'checkbox' to str so get_dummies() sees its values as category
# labels. astype() returns a new frame, so data_dummy itself is left untouched.
data_dummy_processed = data_dummy.astype({'checkbox': str})
data_dummy_correct = pd.get_dummies(data_dummy_processed)
data_dummy_correct

Unnamed: 0,checkbox_0,checkbox_1,checkbox_2,color_black,color_blue,color_white
0,1,0,0,0,0,1
1,0,1,0,0,0,1
2,1,0,0,0,1,0
3,0,0,1,1,0,0
