## Prework

* import basic dependencies
* load data
* check data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/salary-prediction/train.csv
/kaggle/input/salary-prediction/test.csv


In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [3]:
train = pd.read_csv('/kaggle/input/salary-prediction/train.csv')
test = pd.read_csv('/kaggle/input/salary-prediction/test.csv')
train.shape, test.shape

((32561, 15), (16281, 14))

In [4]:
# check data
def show_info(data, is_matrix_transpose=False):
    # basic shape
    print('data shape is: {}   sample number {}   attribute number {}\n'.format(data.shape, data.shape[0], data.shape[1]))
    # attribute(key)
    print('data columns number {}  \nall columns: {}\n'.format(len(data.columns) ,data.columns))
    # value's null
    print('data all attribute count null:\n', data.isna().sum())
    # data value analysis and data demo
    if is_matrix_transpose:
        print('data value analysis: ', data.describe().T)
        print('data demo without matrix transpose: ', data.head().T)
    else:
        print('data value analysis: ', data.describe())
        print('data demo without matrix transpose: ', data.head())
show_info(train)
show_info(test)

data shape is: (32561, 15)   sample number 32561   attribute number 15

data columns number 15  
all columns: Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

data all attribute count null:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64
data value analysis:                  age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e

## data preprocessing


In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [6]:
target = pd.DataFrame(train['income'].map({' <=50K': 0, ' >50K': 1}))
target

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
...,...
32556,0
32557,1
32558,0
32559,0


In [7]:
# concat train and test
train = train.drop(['income'], axis=1)
data = pd.concat([train, test], axis=0)
data.shape

(48842, 14)

In [8]:
# attribute 'sex' labelencode
data['sex'] = data['sex'].map({' Male': 1, ' Female': 0})
data

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,0,0,0,36,United-States
16277,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,1,0,0,40,United-States
16278,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,1,0,0,50,United-States
16279,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,1,5455,0,40,United-States


In [9]:
categorical_cols = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']
numerical_cols = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']

# then encode categorical data
for col in categorical_cols:
    new = pd.get_dummies(data[col], prefix=col)
    data = pd.concat([data, new], axis=1)
    data = data.drop([col], axis=1)
data.shape

(48842, 107)

### Normalization -- Standard Normalization

* formula：`x' = (x - mean(x))/std(x)` 
* geometry: 样本集中分布于原点之间
</br>
* 实现方法：
    * 通过利用手写封装函数
    * 利用`sklearn.preprocessing.StandardScaler`直接进行StandardScaler

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# normalize numerical data
def _normalize_column_normal(X, train=True, specified_column=None, X_mean=None, X_std=None):
    if train:
        if specified_column == None:
            specified_column = np.arange(X.shape[1])
        length = len(specified_column)
        X_mean = np.reshape(np.mean(X[:, specified_column], 0), (1, length))
        X_std = np.reshape(np.std(X[:, specified_column], 0), (1, length))
    X[:, specified_column] = np.divide(np.subtract(X[:, specified_column], X_mean), X_std)
    
    return X, X_mean, X_std

In [12]:
for col in numerical_cols:
    data[col] = (data[col] - data[col].mean()) / data[col].std()
data

Unnamed: 0,age,fnlwgt,education_num,sex,capital_gain,capital_loss,hours_per_week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0.025996,-1.061968,1.136500,1,0.146931,-0.217125,-0.034087,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0.828300,-1.007094,1.136500,1,-0.144802,-0.217125,-2.213009,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,-0.046941,0.246031,-0.419331,1,-0.144802,-0.217125,-0.034087,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1.047110,0.426659,-1.197247,1,-0.144802,-0.217125,-0.034087,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,-0.776309,1.408515,1.136500,0,-0.144802,-0.217125,-0.034087,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,0.025996,0.243881,1.136500,0,-0.144802,-0.217125,-0.356890,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16277,1.849414,1.247480,-0.419331,1,-0.144802,-0.217125,-0.034087,1,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,-0.046941,1.754847,1.136500,1,-0.144802,-0.217125,0.772922,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,0.390679,-1.001601,1.136500,1,0.587214,-0.217125,-0.034087,0,0,0,...,0,0,0,0,0,0,0,1,0,0


use `sklearn.preprocessing.StandardScaler`

In [13]:
# get data copy
scaled_features = data.copy()
# extract specified column
features = scaled_features[numerical_cols]
scaler = StandardScaler().fit(features.values)
features = scaler.transform(features.values)
# fill normalized values
scaled_features[numerical_cols] = features
# numpy -> DataFrame
data = pd.DataFrame(scaled_features)

In [14]:
# remove special
data = data.drop(['native_country_ Holand-Netherlands'], axis=1)
data.shape

(48842, 106)

In [15]:
# split train and test data
train = data.iloc[:len(train), :]
test = data.iloc[len(train):, :]
train.shape, test.shape

((32561, 106), (16281, 106))

In [16]:
target.shape

(32561, 1)

## Model

* prepare some methods
* prepare all dataset [train_set, validation_set, test_set]
* initial some data
    * weight matrix => zero
    * bias matrix => zero
    * some hyperparams
        * is_regularize/regulariization
        * lamda
    * list for training process

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# cross_entropy
def _cross_entropy_loss(y_pred, Y_label):
    '''
    y_pred [float]: output prediction(based on probabilistic) 
    Y_label [bool]: label
    '''
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

# sigmoid
def _sigmoid(z):
    '''
    calculate probability
    '''
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))

# get probabilistiic
def get_prob(X, w, b):
    '''
    X: input data, shape = [batch_size, data_dimension]
    w: weight vector, shape = [data_dimension, ]
    b: bias, scalar
    '''
    
    return _sigmoid(np.matmul(X, w) + b)

# get prediction
def infer(X, w, b):
    '''
    get prediction and transform data type
    '''
    return np.round(get_prob(X, w, b)).astype(np.int)

# loss: call cross_entropy_loss
def _loss(y_pred, Y_label, lambd, w):
    return _cross_entropy_loss(y_pred, Y_label) + lamda*np.sum(np.square(w))

In [19]:
# gradient descent
def _gradient_regularization(X, Y_label, w, b):
    '''
    use cross_entropy to update weight and bias matrix
    add a lambda penalty term to avoid overfitting
    '''
    y_pred = get_prob(X, w, b)
    pred_error = Y_label - y_pred
    
    w_grad = -np.sum(pred_error * X.T, 1) + lamda*w
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad

def _gradient(X, Y_label, w, b):
    '''
    use cross_entropy to update weight and bias matrix
    without a lambda penalty term
    '''
    y_pred = get_prob(X, w, b)
    pred_error = Y_label - y_pred
    pred_error = np.pad(pred_error, ((0, 98), (0, 0)), 'constant', constant_values=(0, 0))
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad

### prepare dataset

* 实现方法：
    * 通过利用手写封装函数
    * 利用`sklearn.model_selection.train_test_split`直接进行StandardScaler

In [20]:
# train_test_split

# artificial
def train_dev_split(X, Y, dev_ratio = 0.25):
    '''
    artificial split dataset into train_set and val_set
    '''
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

X_train, y_train, X_val, y_val = train_dev_split(train, target)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((24420, 106), (24420, 1), (8141, 106), (8141, 1))

In [21]:
# # use sklearn.model_selection.train_test_split to get dataset
# X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.25)
# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [22]:
def _shuffle(X, Y):
    '''
    mess up order
    '''
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _accuracy(Y_pred, Y_label):
    '''
    get accuracy of prediction
    '''
    # This function calculates prediction accuracy
    acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    return acc

In [23]:
# some prework

# init weight and bias with zero
w = np.zeros((X_train.shape[1], ))
b = np.zeros((1, ))

# hyperparam 
max_iter = 100
batch_size = 8
learning_rate = 0.2
# regulariization
regularize = True
if regularize:
    lamda = 0.001
else:
    lamda = 0

loss_train = []
loss_validation = []
train_acc = []
dev_acc = []

step = 1
train_size = X_train.shape[0]
val_size = X_val.shape[0]
test_size = test.shape[0]
data_dim = X_train.shape[1]

In [24]:
train.size, val_size, test_size, data_dim

(3451466, 8141, 16281, 106)

In [25]:
X_train, Y_train = _shuffle(X_train.values, y_train.values)
        
for idx in range(int(np.floor(len(Y_train) / batch_size))):
    X = X_train[idx*batch_size:(idx+1)*batch_size]
    Y = Y_train[idx*batch_size:(idx+1)*batch_size]

    w_grad, b_grad = _gradient(X, Y, w, b)
            
    w = w - learning_rate/np.sqrt(step) * w_grad
    b = b - learning_rate/np.sqrt(step) * b_grad
    
    step = step + 1

In [26]:
col = [0, 1, 3, 4, 5, 7, 10, 12, 25, 26, 27, 28]

X_train, X_mean, X_std = _normalize_column_normal(X_train, specified_column=col)

### predict


In [27]:
test = test.iloc[:, 1]
w = np.pad(w,(0, 16281-106),'constant', constant_values=(0,0)) 

In [28]:
preds = infer(test.values, w, b)