In [1]:
import pandas as pd


In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.metrics import accuracy_score

In [4]:
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [7]:
from sklearn.neighbors import KNeighborsClassifier

In [8]:
from sklearn.naive_bayes import GaussianNB

In [9]:
adult = pd.read_csv('adult.csv', sep = ',', names = ['age', 'workclass', 'fnIwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income'])

In [10]:
adult.shape

(32562, 15)

In [11]:
adult.head()

Unnamed: 0,age,workclass,fnIwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
2,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
3,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
4,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K


In [12]:
adult.tail()

Unnamed: 0,age,workclass,fnIwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
32557,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32558,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32559,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32560,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32561,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [13]:
adult.describe()

Unnamed: 0,age,workclass,fnIwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562,32562
unique,74,10,21649,17,17,8,16,7,6,3,120,93,95,43,3
top,36,Private,123011,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
freq,898,22696,13,10501,10501,14976,4140,13193,27816,21790,29849,31042,15217,29170,24720


In [14]:
n_records = adult.shape[0]
n_greater_50k = adult[adult['income'] == '>50K']. shape[0]
n_at_most_50k = adult[adult['income'] == '<=50K']. shape[0]
greater_percent = (n_greater_50k / n_records) * 100
print("Total number of records: {}".format(n_records))
print("Individuals making more that $50,000: {}".format(n_greater_50k))
print("Individuals making at most $50,000: {}".format(n_at_most_50k))
print("Percentage of individuals making more than $50,00: {}".format(greater_percent))

Total number of records: 32562
Individuals making more that $50,000: 7841
Individuals making at most $50,000: 24720
Percentage of individuals making more than $50,00: 24.080216202935937


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()

In [17]:
for col in adult.columns:
    if adult[col].dtypes == 'object':
        adult[col] = le.fit_transform(adult[col])

In [18]:
adult.head()

Unnamed: 0,age,workclass,fnIwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,73,9,21648,16,16,7,15,6,5,2,119,92,94,42,2
1,72,0,20407,11,15,6,0,1,4,0,0,86,34,39,0
2,65,4,2652,11,15,6,4,1,4,0,0,86,9,39,0
3,49,0,7318,15,1,6,0,4,2,0,0,86,34,39,0
4,37,4,3149,5,10,0,7,4,4,0,0,84,34,39,0


In [19]:
X = adult[['age', 'workclass', 'fnIwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']]

In [20]:
Y = adult.income

In [21]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30, random_state = 42)

In [22]:
model = []

In [23]:
model.append(("LR", LogisticRegression()))
model.append(("LDA", LinearDiscriminantAnalysis()))
model.append(("KNN", KNeighborsClassifier()))
model.append(("CART", DecisionTreeClassifier()))
model.append(("NB", GaussianNB()))

In [24]:
result = []

In [25]:
names = []

In [26]:
from sklearn import model_selection

In [27]:
for name, models in model:
    kfold = model_selection.KFold(n_splits = 10, random_state = 7)
    cv_result = model_selection.cross_val_score(models, x_train, y_train, cv = kfold, scoring = "accuracy")
    result.append(cv_result)
    names.append(name)
    msg = "%s,%f,(%f)" % (name, cv_result.mean(), cv_result.std())
    print(msg)



LR,0.999912,(0.000175)




LDA,0.789541,(0.006553)
KNN,0.752116,(0.011613)
CART,0.999956,(0.000132)
NB,0.985960,(0.003249)
