In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from sklearn.linear_model import LogisticRegression

In [2]:
cols = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex',
         'capital-gain','capital-loss','hours-per-week','native-country','label']
train_data = pd.read_csv('./data/adult.data',names=cols,na_values='?')
train_data = train_data.dropna(axis=0,how='any')
test_data = pd.read_csv('./data/adult.test',names=cols, na_values='?')
test_data = test_data.dropna(axis=0,how='any')
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [4]:
train_data['is_train'] = 1
test_data['is_train'] = 0
train_data['label'] = train_data['label'].apply(lambda x:1 if x == ' >50K' else 0)
test_data['label'] = test_data['label'].apply(lambda x:1 if x == ' >50K.' else 0)
data = pd.concat([train_data,test_data],ignore_index=True)
del data['fnlwgt']
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,is_train
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,0,1
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,0,1
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,0,1
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,0,1
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,0,1


## 离散特征处理

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_cols = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
for col in cat_cols:
    data[col] = le.fit_transform(data[col])
    data[col] = data[col].astype(str)
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,is_train
0,39,7,9,13.0,4,1,1,4,1,2174.0,0.0,40.0,39,0,1
1,50,6,9,13.0,2,4,0,4,1,0.0,0.0,13.0,39,0,1
2,38,4,11,9.0,0,6,1,4,1,0.0,0.0,40.0,39,0,1
3,53,4,1,7.0,2,6,0,2,1,0.0,0.0,40.0,39,0,1
4,28,4,9,13.0,2,10,5,2,0,0.0,0.0,40.0,5,0,1


## 连续特征离散化

In [6]:
con_cols = ['age','education-num','capital-gain','capital-loss','hours-per-week']
for col in con_cols:
    data[col] = data[col].astype(float)
    min_num = data[col].min()
    max_num = data[col].max()
    div = max_num - min_num
    data[col] = data[col].apply(lambda x : (x-min_num)/div)

In [7]:
def list_trans(input_dic):
    output_dic = [0]*5
    key_list = ["min","25%","50%","75%","max"]
    for index in range(len(key_list)):
        key = key_list[index]
        if key not in input_dic:
            print("false")
        else:
            output_dic[index] = input_dic[key]
    return output_dic

In [8]:
for col in ['age','education-num']:
    origin_dic = data[col].describe().to_dict()
    bin_list = list_trans(origin_dic)
    data[col] = pd.cut(data[col],bin_list,right=False,labels=[1,2,3,4])
for col in ['capital-gain','capital-loss','hours-per-week']:
    data[col] = pd.cut(data[col],4,labels=[1,2,3,4])

In [9]:
data.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,is_train
0,3,7,9,4,4,1,1,4,1,1,1,2,39,0,1
1,4,6,9,4,2,4,0,4,1,1,1,1,39,0,1
2,3,4,11,2,0,6,1,4,1,1,1,2,39,0,1
3,4,4,1,1,2,6,0,2,1,1,1,2,39,0,1
4,2,4,9,4,2,10,5,2,0,1,1,2,5,0,1


In [10]:
onehot_cols = cat_cols + con_cols
df = data[onehot_cols]
df = pd.get_dummies(df)
df.head()

Unnamed: 0,workclass_0,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,education_0,...,capital-gain_3,capital-gain_4,capital-loss_1,capital-loss_2,capital-loss_3,capital-loss_4,hours-per-week_1,hours-per-week_2,hours-per-week_3,hours-per-week_4
0,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0


In [11]:
train_index = data[data.is_train == 1].index
test_index = data[data.is_train == 0].index
train = df.loc[train_index]
test = df.loc[test_index]
train_labels = data[data.is_train == 1]['label']
test_labels = data[data.is_train == 0]['label']

In [12]:
len(train),len(train_labels)

(32561, 32561)

In [13]:
train_labels.value_counts()

0    24720
1     7841
Name: label, dtype: int64

In [14]:
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(Cs=[1],penalty="l2",tol=0.0001,max_iter=500,cv=5,scoring='roc_auc')
model.fit(train,train_labels.values)
scores = model.scores_.values()
scores = list(scores)[0]
print(scores)
print(','.join(str(ele) for ele in scores.mean(axis = 0)))

[[0.89249144]
 [0.88996486]
 [0.89497377]
 [0.89887035]
 [0.89804864]]
0.8948698128814974


In [15]:
from sklearn.metrics import roc_auc_score
pred = model.predict_proba(test)
score = roc_auc_score(test_labels.values,pred[:,1])
print(score)

0.8947513549918757
