In [169]:
import os
import numpy as np
import pandas as pd
import pymc
os.chdir('C:\Users\SYARLAG1\Desktop\Income-Prediction-Using-Bayes-Net')

In [170]:
features = ['age','workclass','fnlwgt','education','education-num','marital-status','occupation',\
           'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','class']
train = pd.read_csv('train.csv',delimiter=',',names=features,na_values=[' ?'])
test = pd.read_csv('test.csv',delimiter=',', names=features,na_values=[' ?'])

In [171]:
train.shape

(32561, 15)

In [172]:
test.shape

(16281, 15)

In [173]:
len(features)

15

In [174]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Preprocessing

In [175]:
# drop all continuous variables
continuous_features = ['education-num','fnlwgt','capital-gain','capital-loss','hours-per-week']
for feature in continuous_features:
    del train[feature]
    del test[feature]

In [176]:
# tackle missing values
# train data
np.sum(pd.isnull(train),0)

age                  0
workclass         1836
education            0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
native-country     583
class                0
dtype: int64

In [177]:
# test data
np.sum(pd.isnull(test),0)

age                 0
workclass         963
education           0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
native-country    274
class               0
dtype: int64

In [178]:
# since there are only a few NA's compared to the size of the dataset, we remove all such values
train = train.dropna()
test = test.dropna()

In [179]:
# function binarize age
def binarizeVar(train, test, feature, bins = 5):
    
    # get max and min from both datasets
    minVal = np.min([np.min(test[feature]),np.min(train[feature])])
    maxVal = np.max([np.max(test[feature]),np.max(train[feature])])
    
    data_range = maxVal - minVal
    binSize = int(data_range/bins)
    
    newBins = []
    global newBins
    minBinBound = minVal
    maxBinBound = binSize + 1
    
    for _ in range(bins):
        
        newBins.append((minBinBound, maxBinBound))
        
        minBinBound = maxBinBound
        maxBinBound += binSize
         
    newTrainFeat = []
    trainE = []; global trainE
    testE = []; global testE
    
    for featVal in np.array(train[feature]):
        
        foundBin = False
        
        for minBin, maxBin in newBins: 
            
            if featVal >= minBin and featVal < maxBin:
                
                binStr = str(minBin) + '-' + str(maxBin)
                newTrainFeat.append(binStr)
                foundBin = True
            
            if not foundBin: trainE.append(featVal)

                
    newTestFeat = []
    
    for featVal in np.array(test[feature]):
        
        foundBin = False
        
        for minBin, maxBin in newBins: 
            
            if featVal >= minBin and featVal <= maxBin:
                
                binStr = str(minBin) + '-' + str(maxBin)
                newTestFeat.append(binStr)
                foundBin = True
                            
            if not foundBin: testE.append(featVal)
                
                
   

    train[feature] = pd.Series(newTrainFeat)
    test[feature] = pd.Series(newTestFeat)
            
    return train, test
        

  global newBins
  trainE = []; global trainE
  testE = []; global testE


In [180]:
train, test = binarizeVar(train, test, 'age', bins = 5)

In [181]:
newBins

[(17, 15), (15, 29), (29, 43), (43, 57), (57, 71)]

In [183]:
len(trainE)

66685

In [184]:
train.shape

(30162, 10)

In [125]:
# set income class (class) as the target
train_Y = train['class']; del train['class']
test_Y = test['class']; del test['class']
train_X = train
test_X = test

In [138]:
test

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,15-29,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States
1,29-43,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,United-States
2,15-29,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,United-States
3,43-57,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,United-States
5,57-71,Private,10th,Never-married,Other-service,Not-in-family,White,Male,United-States
7,43-57,Self-emp-not-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States
8,57-71,Private,Some-college,Never-married,Other-service,Unmarried,White,Female,United-States
9,29-43,Private,7th-8th,Married-civ-spouse,Craft-repair,Husband,White,Male,United-States
10,15-29,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
11,43-57,Federal-gov,Bachelors,Married-civ-spouse,Adm-clerical,Husband,White,Male,United-States
