# Whether the income of a person exceeds \$50K/yr (ML)

## Download dataset

In [4]:
import os
import urllib.request

print('Begin downloading adult dataset...')

# We use UCI Machine Learning dataset - Adult here
train_data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
test_data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
description = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names'
if not os.path.isfile("adult.data"):
    urllib.request.urlretrieve(train_data_url, 'adult.data')
    urllib.request.urlretrieve(test_data_url, 'adult.test')
    urllib.request.urlretrieve(description, 'adult.names')
    
print('Successfully downloaded...')

Begin downloading adult dataset...
Successfully downloaded...


## Data Processing

In [5]:
import numpy as np
import pandas as pd

In [6]:
attr = ["age", "workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country","income"]
data_train = pd.read_csv("adult.data",names=attr)
data_test = pd.read_csv("adult.test",names=attr)
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [7]:
data_test.drop(index=0,inplace=True)
data_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [8]:
data_train.replace("?", np.nan, inplace=True)
print(data_train.isna().sum().sum())

data_test.replace("?", np.nan, inplace=True)
print(data_test.isna().sum().sum())

0
0


In [9]:
y_train = data_train["income"]
y_train = pd.Categorical(y_train).codes
data_train.drop(["income"],axis=1,inplace=True)

y_test = data_test["income"]
y_test = pd.Categorical(y_test).codes
data_test.drop(["income"],axis=1,inplace=True)
print("Sucessful!")

Sucessful!


In [10]:
train_size = len(data_train)
data = pd.concat([data_train,data_test],axis=0)
print(train_size)

32561


In [11]:
cat_attr = ["workclass","education","marital-status","occupation","relationship","race","sex","native-country"]
for a in cat_attr:
    data[a] = pd.Categorical(data[a]) # change column type to categorical
    dummies = pd.get_dummies(data[a],prefix="{}_category".format(a))
    data = pd.concat([data,dummies],axis=1)

data.drop(cat_attr,axis=1,inplace=True)
data.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_category_ ?,workclass_category_ Federal-gov,workclass_category_ Local-gov,workclass_category_ Never-worked,...,native-country_category_ Portugal,native-country_category_ Puerto-Rico,native-country_category_ Scotland,native-country_category_ South,native-country_category_ Taiwan,native-country_category_ Thailand,native-country_category_ Trinadad&Tobago,native-country_category_ United-States,native-country_category_ Vietnam,native-country_category_ Yugoslavia
0,39,77516.0,13.0,2174.0,0.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311.0,13.0,0.0,0.0,13.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646.0,9.0,0.0,0.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721.0,7.0,0.0,0.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409.0,13.0,0.0,0.0,40.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
X_train=data[:train_size]
X_test=data[train_size:]
data.isna().sum().sum()

0

## Scaling

In [13]:
'''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
'''

from sklearn.preprocessing import MinMaxScaler
mmscaler = MinMaxScaler().fit(X_train)
X_train = mmscaler.transform(X_train)
X_test = mmscaler.transform(X_test)

'''
from sklearn.preprocessing import Normalizer
normalizer = Normalizer().fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)
'''

'\nfrom sklearn.preprocessing import Normalizer\nnormalizer = Normalizer().fit(X_train)\nX_train = normalizer.transform(X_train)\nX_test = normalizer.transform(X_test)\n'

## Training & Evaluation


In [14]:
from sklearn.model_selection import cross_val_score

In [20]:
# 交叉验证提高正确率
from sklearn.tree import DecisionTreeClassifier

best_max_depth, best_score = 0, 0
for k in range(1,10):
    clf = DecisionTreeClassifier(max_depth=k).fit(X_train, y_train)
    scores = cross_val_score(clf, X_train, y_train)
    score = np.mean(scores)
    if score > best_score:
        best_max_depth, best_score = k, score
        
print(best_max_depth)
DecisionTreeClassifier(max_depth=best_max_depth).fit(X_train,y_train).score(X_test,y_test)

9


0.8578097168478594

In [16]:
from sklearn.naive_bayes import BernoulliNB

best_alpha, best_score = 0, 0
for k in np.logspace(-2,5,200):
    bnl = BernoulliNB(alpha=k).fit(X_train, y_train)
    scores = cross_val_score(bnl, X_train, y_train)
    score = np.mean(scores)
    if score > best_score:
        best_alpha, best_score = k, score
        
print(best_alpha)
BernoulliNB(alpha=best_alpha).fit(X_train,y_train).score(X_test,y_test)

318.0625692794119


0.8048645660585959

In [17]:
from sklearn.naive_bayes import GaussianNB
GaussianNB().fit(X_train,y_train).score(X_test,y_test)

0.5417357656163626

In [18]:
from sklearn.naive_bayes import MultinomialNB

best_alpha, best_score = 0, 0
for k in np.logspace(0,5,10):
    mnm = MultinomialNB(alpha=k).fit(X_train, y_train)
    scores = cross_val_score(mnm, X_train, y_train)
    score = np.mean(scores)
    if score > best_score:
        best_alpha, best_score = k, score
        
print(best_alpha)
MultinomialNB(alpha=best_alpha).fit(X_train,y_train).score(X_test,y_test)

166.81005372000593


0.8173330876481789

In [19]:
from sklearn.linear_model import LinearRegression
LinearRegression().fit(X_train,y_train).score(X_test,y_test)

0.36163152989776315