In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression #for logistic regression
from sklearn.preprocessing import MinMaxScaler #for连续数据归一化

min_max_scaler = MinMaxScaler()
lr = LogisticRegression()

In [2]:
train_data_init = pd.read_csv('./data/train.csv')
test_data_init = pd.read_csv('./data/test.csv')

data = pd.concat([train_data_init,test_data_init],axis=0)
mp = data['native-country'].value_counts() / data.shape[0]
data['native-country'] = data['native-country'].map(mp)
# print(data['native-country'])
data[['age','fnlwgt' ,'education-num' ,'capital-gain' ,'capital-loss' ,'hours-per-week']] = \
min_max_scaler.fit_transform(data[['age','fnlwgt' ,'education-num' ,'capital-gain' ,'capital-loss' ,'hours-per-week']])

data.loc[data['workclass']=='?','workclass']='Private'
data.loc[data['occupation']=='?','occupation']='other'

# print(data)

cols=['workclass', 'education','marital-status', 'occupation', 'relationship', 'race', 'sex']

# dummies函数操作：对列进行one-hot encoding
def p_data(data,col):
    tmp=pd.get_dummies(data[col],prefix=col)
    data=pd.concat([data,tmp],axis=1)
    data=data.drop(col,axis=1)
    return data

for col in cols:
    data = p_data(data,col)

In [3]:
lable_d={
     '<=50K':  0,
     '<=50K.': 0,
     '>50K' : 1,
     '>50K.': 1  
}
# print(data.columns)
data.lable = data.lable.map(lable_d)
# print(data.lable)

In [4]:
data['lable'] = data['lable'].astype(int)

# 获得train and test data
train_data=data[0:train_data_init.shape[0]]
test_data=data[train_data_init.shape[0]:]

In [5]:
train_lable = train_data['lable']
train_data = train_data.drop('lable', axis = 1)

test_lable = test_data['lable']
test_data = test_data.drop('lable', axis = 1)

In [6]:
lr.fit(train_data,train_lable)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [7]:
lr.score(train_data,train_lable)

0.8511409354749547

In [8]:
lr.score(test_data,test_lable)

0.8508076899453351

In [9]:
# 打印参数
print(lr.coef_, '\n', lr.intercept_)
print(lr.coef_.shape[1])

[[ 1.84763952e+00  8.98904852e-01  1.78742018e+00  1.80247873e+01
   2.54885575e+00  2.77663606e+00  3.03822092e-01  2.77322800e-01
  -3.79524688e-01 -1.60235358e-01 -2.05478738e-01 -2.84554845e-02
  -6.83140490e-01 -5.26025552e-01 -1.12263896e+00 -6.35787912e-01
  -6.48490620e-01 -3.69619990e-01 -6.62796090e-01 -5.57822174e-01
  -8.54180818e-01 -7.65757846e-01 -2.20471742e-02  1.11951722e-01
   4.45357503e-01  1.07681394e+00 -1.93300591e-01  6.64657378e-01
  -1.45619838e+00  1.01070007e+00  2.83445039e-02 -9.17255300e-01
   1.20994248e+00  1.01902788e+00 -9.15424612e-01 -1.38288991e+00
  -1.04047931e+00 -8.01097693e-01 -7.46456007e-03 -4.05999080e-01
   5.90938632e-02  7.60041581e-01 -9.56420638e-01 -6.88337655e-01
  -2.99650252e-01 -8.43223425e-01 -1.56499328e+00  5.08273810e-01
   5.36974813e-01  2.59464086e-01  6.21594584e-01 -1.25174861e-01
  -6.82355465e-01 -5.64598355e-01 -2.08510982e-01 -1.03678932e+00
  -1.37087220e+00 -3.74603006e-01  7.27197397e-01 -8.14729031e-01
  -3.13375

In [10]:
from sklearn.svm import SVC
clf=SVC(kernel='linear')
clf.fit(train_data,train_lable)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [11]:
clf.score(train_data,train_lable)

0.8510795122999908

In [12]:
clf.score(test_data,test_lable)

0.8528960137583687

In [13]:
y_lr_predict = lr.predict(test_data)

In [14]:
diff_cnt = 0
for i in range(len(y_lr_predict)):
    if y_lr_predict[i] == test_lable[i]:
        continue
    else:
        diff_cnt += 1
print(1 - diff_cnt / len(y_lr_predict))

0.8508076899453351
