In [99]:
from time import time
from sklearn import preprocessing
from sklearn import metrics
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import maxabs_scale
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib
import pandas as pd
import bisect
import numpy as np

with open('CensusIncome.names.txt', 'r') as fname:
    sname = fname.read()
names=np.array(sname[sname.find("age"):].split(".\n"))
#print(names)

#Encode Data Workclass
workclass = preprocessing.LabelEncoder()
workclass.fit(np.array(names[1][names[1].find(":")+2:].split(", ")))
#Handle Unknown Data : '?'
le_classes = workclass.classes_.tolist()
bisect.insort_left(le_classes, '?')
workclass.classes_ = le_classes
#print(workclass.classes_)

#Encode Data Education
education = preprocessing.LabelEncoder()
education.fit(np.array(names[3][names[3].find(":")+2:].split(", ")))
le_classes = education.classes_.tolist()
bisect.insort_left(le_classes, '?')
education.classes_ = le_classes
#print(education.classes_)

marital_status = preprocessing.LabelEncoder()
marital_status.fit(np.array(names[5][names[5].find(":")+2:].split(", ")))
le_classes = marital_status.classes_.tolist()
bisect.insort_left(le_classes, '?')
marital_status.classes_ = le_classes
#print(marital_status.classes_)

occupation = preprocessing.LabelEncoder()
occupation.fit(np.array(names[6][names[6].find(":")+2:].split(", ")))
le_classes = occupation.classes_.tolist()
bisect.insort_left(le_classes, '?')
occupation.classes_ = le_classes
#print(occupation.classes_)

relationship = preprocessing.LabelEncoder()
relationship.fit(np.array(names[7][names[7].find(":")+2:].split(", ")))
le_classes = relationship.classes_.tolist()
bisect.insort_left(le_classes, '?')
relationship.classes_ = le_classes
#print(relationship.classes_)

race = preprocessing.LabelEncoder()
race.fit(np.array(names[8][names[8].find(":")+2:].split(", ")))
le_classes = race.classes_.tolist()
bisect.insort_left(le_classes, '?')
race.classes_ = le_classes
#print(race.classes_)

sex = preprocessing.LabelEncoder()
sex.fit(np.array(names[9][names[9].find(":")+2:].split(", ")))
le_classes = sex.classes_.tolist()
bisect.insort_left(le_classes, '?')
sex.classes_ = le_classes
#print(sex.classes_)

native_country = preprocessing.LabelEncoder()
native_country.fit(np.array(names[13][names[13].find(":")+2:].split(", ")))
le_classes = native_country.classes_.tolist()
bisect.insort_left(le_classes, '?')
native_country.classes_ = le_classes
#print(native_country.classes_)

#Open Data Cencus
with open('CencusIncome.data.txt', 'r') as fdata:
    s = fdata.read()

raw = s.split("\n")
A = []
#print(raw[32560])
length = 32560
for i in range(length):
    A.append(raw[i].split(", "))
    
#Mapping from raw data to  
data_length = 14
cencus_data = []
cencus_target = []

for i in range(length):
    temp = []
    for j in range(data_length):
        if j == 7:
            if A[i][j] == '?':
                temp.append(relationship.transform(["Husband"])[0])
            else:
                temp.append(relationship.transform([A[i][j]])[0])
        elif j == 1:
            if A[i][j] == '?':
                temp.append(workclass.transform(["Private"])[0])
            else:
                temp.append(workclass.transform([A[i][j]])[0])
        elif j == 3:
            if A[i][j] == '?':
                temp.append(education.transform(["HS-grad"])[0])
            else:
                temp.append(education.transform([A[i][j]])[0])
        elif j == 5:
            if A[i][j] == '?':
                temp.append(marital_status.transform(["Married-civ-spouse"])[0])
            else:
                temp.append(marital_status.transform([A[i][j]])[0])
        elif j == 6:
            if A[i][j] == '?':
                temp.append(occupation.transform(["Prof-specialty"])[0])
            else:
                temp.append(occupation.transform([A[i][j]])[0])
        elif j == 8:
            if A[i][j] == '?':
                temp.append(race.transform(["White"])[0])
            else:
                temp.append(race.transform([A[i][j]])[0])
        elif j == 9:
            if A[i][j] == '?':
                temp.append(sex.transform(["Male"])[0])
            else:
                temp.append(sex.transform([A[i][j]])[0])
        elif j == 13:
            if A[i][j] == '?':
                temp.append(native_country.transform(["United-States"])[0])
            else:
                temp.append(native_country.transform([A[i][j]])[0])
        else:
            temp.append(int(A[i][j]))
                            
    cencus_data.append(temp)
    cencus_target.append(A[i][data_length])

#Encode Target
target = preprocessing.LabelEncoder()
target.fit(cencus_target)
#Ready to Use Data
y = target.transform(cencus_target)    
X = np.array(cencus_data)

#Scaling
y = maxabs_scale(y, axis=0, copy=False)
X = maxabs_scale(X, axis=0, copy=False)

from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import EditedNearestNeighbours
from collections import Counter

ratio = 'auto'
rus = RandomUnderSampler(ratio=ratio, random_state=0)
X_resampled, y_resampled = rus.fit_sample(X, y)
print(sorted(Counter(y_resampled).items()))

#a = np.asarray(cencus_data)
#np.savetxt("icha.csv", a, delimiter=",")



[(0.0, 7840), (1.0, 7840)]


In [9]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100,), random_state=1)
kf = KFold(n_splits=10, shuffle=True)

table = []
training_times = []
prediction_times = []
scores = []
i = 0

for train, test in kf.split(X_resampled, y_resampled):
    x_train = [X_resampled[i] for i in train]
    y_train = [y_resampled[i] for i in train]
    x_test = [X_resampled[i] for i in test]
    y_test = [y_resampled[i] for i in test]
    
    #Xcoba = maxabs_scale(X, axis=0, copy=True)
    x_train = maxabs_scale(x_train, axis=0, copy=False)
    y_train = maxabs_scale(y_train, axis=0, copy=False)
    x_test = maxabs_scale(x_test, axis=0, copy=False)
    y_test = maxabs_scale(y_test, axis=0, copy=False)
    
    t0 = time()
    mlp.fit(x_train, y_train)
    training_time = '{:.6}'.format(time() - t0)
    
    t0 = time()
    pred = mlp.predict(x_test)
    prediction_time = '{:.6}'.format(time() - t0)
    
    score = accuracy_score(pred, y_test)
    cm = confusion_matrix(y_test, pred)
    i = i+1
    print ("Confusion Matrix Iterasi", i)
    display(pd.DataFrame(confusion_matrix(y_test, pred), columns=['>50K','<=50K'], index=['>50K','<=50K']))
    
    table += [[training_time, prediction_time, score]]
    training_times += [training_time]
    prediction_times += [prediction_time]
    scores += [score]

display(pd.DataFrame(table, columns=['training time', 'prediction time', 'score'], index=range(1,len(table)+1)))

Confusion Matrix Iterasi 1


Unnamed: 0,>50K,<=50K
>50K,582,186
<=50K,123,677


Confusion Matrix Iterasi 2


Unnamed: 0,>50K,<=50K
>50K,600,174
<=50K,107,687


Confusion Matrix Iterasi 3


Unnamed: 0,>50K,<=50K
>50K,608,191
<=50K,87,682


Confusion Matrix Iterasi 4


Unnamed: 0,>50K,<=50K
>50K,616,192
<=50K,97,663


Confusion Matrix Iterasi 5


Unnamed: 0,>50K,<=50K
>50K,611,179
<=50K,91,687


Confusion Matrix Iterasi 6


Unnamed: 0,>50K,<=50K
>50K,594,165
<=50K,117,692


Confusion Matrix Iterasi 7


Unnamed: 0,>50K,<=50K
>50K,642,168
<=50K,97,661


Confusion Matrix Iterasi 8


Unnamed: 0,>50K,<=50K
>50K,604,155
<=50K,103,706


Confusion Matrix Iterasi 9


Unnamed: 0,>50K,<=50K
>50K,621,157
<=50K,93,697


Confusion Matrix Iterasi 10


Unnamed: 0,>50K,<=50K
>50K,629,166
<=50K,120,653


Unnamed: 0,training time,prediction time,score
1,8.3097,0.00351119,0.802934
2,7.59606,0.0020051,0.820791
3,7.4568,0.00150394,0.822704
4,7.3786,0.00200558,0.815689
5,7.44597,0.00150442,0.827806
6,7.20565,0.00150442,0.820153
7,7.88785,0.00150418,0.830995
8,7.74404,0.00150394,0.835459
9,7.4498,0.00150418,0.840561
10,7.59919,0.00150418,0.817602


In [100]:
#Open Test
with open('CencusIncome.test.txt', 'r') as fdata:
    stest = fdata.read()
    
test_raw = np.array(stest[stest.find("\n")+1:].split(".\n"))
B = []
length = len(test_raw)-1
for i in range(length):
    B.append(test_raw[i].split(", "))

#Mapping from raw data to  
data_length = 14
cencus_data = []
cencus_target = []
for i in range(length):
    temp = []
    for j in range(data_length):
        if j == 7:
            if B[i][j] == '?':
                temp.append(relationship.transform(["Husband"])[0])
            else:
                temp.append(relationship.transform([B[i][j]])[0])
        elif j == 1:
            if B[i][j] == '?':
                temp.append(workclass.transform(["Private"])[0])
            else:
                temp.append(workclass.transform([B[i][j]])[0])
        elif j == 3:
            if B[i][j] == '?':
                temp.append(education.transform(["HS-grad"])[0])
            else:
                temp.append(education.transform([B[i][j]])[0])
        elif j == 5:
            if B[i][j] == '?':
                temp.append(marital_status.transform(["Married-civ-spouse"])[0])
            else:
                temp.append(marital_status.transform([B[i][j]])[0])
        elif j == 6:
            if B[i][j] == '?':
                temp.append(occupation.transform(["Prof-specialty"])[0])
            else:
                temp.append(occupation.transform([B[i][j]])[0])
        elif j == 8:
            if B[i][j] == '?':
                temp.append(race.transform(["White"])[0])
            else:
                temp.append(race.transform([B[i][j]])[0])
        elif j == 9:
            if B[i][j] == '?':
                temp.append(sex.transform(["Male"])[0])
            else:
                temp.append(sex.transform([B[i][j]])[0])
        elif j == 13:
            if B[i][j] == '?':
                temp.append(native_country.transform(["United-States"])[0])
            else:
                temp.append(native_country.transform([B[i][j]])[0])
        else:
            temp.append(int(B[i][j]))
                            
    cencus_data.append(temp)
    cencus_target.append(B[i][data_length])

#Encode Target
target = preprocessing.LabelEncoder()
target.fit(cencus_target)

#Ready to Use Data
y_test = target.transform(cencus_target)    
X_test = np.array(cencus_data)

#Scaling
y_test = maxabs_scale(y_test, axis=0, copy=False)
X_test = maxabs_scale(X_test, axis=0, copy=False)

In [101]:
model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(100,), random_state=1)
model.fit(X,y)
joblib.dump(model, 'Best.model')

bestmodel = joblib.load('Best.model')
prediction = bestmodel.predict(X_test)

display(pd.DataFrame(confusion_matrix(y_test, prediction), columns=['>50K','<=50K'], index=['>50K','<=50K']))

Unnamed: 0,>50K,<=50K
>50K,11572,863
<=50K,1586,2260


In [102]:
print("Score: ", accuracy_score(prediction, y_test))

Score:  0.849579264173
