In [6]:
import numpy as np
import pandas as pd

In [7]:
def sigmoid(z):
    '''
    Apply the logistic (sigmoid) function 1 / (1 + e^-z) to z.

    z is interpreted as a logit (the log of the odds). The value returned is
    the matching probability, a number between 0 and 1 -- not the odds.
    Works element-wise on anything that supports unary minus and np.exp.
    '''
    exp_of_negated = np.exp(-z)
    denominator = 1 + exp_of_negated
    return 1 / denominator


def predict(features, weights):
    '''
    Return the model's probability estimate for every row of features.

    The linear score np.dot(features, weights) is computed first and then
    squashed through sigmoid().
    '''
    linear_scores = np.dot(features, weights)
    return sigmoid(linear_scores)

def cost_function_for_all_training_samples(features, labels, weights):
    '''
    Mean binary cross-entropy (log loss) of the model over all samples.

    Parameters
    ----------
    features : 2-D array-like exposing ``shape``; one row per sample.
    labels   : 0/1 targets, one per row of ``features``.
    weights  : weight vector handed to predict().

    Returns
    -------
    -(1/m) * sum(y*log(p) + (1-y)*log(1-p)), where m is the number of rows
    and p the predicted probabilities.
    '''
    m = features.shape[0]  # m = number of samples
    predictions = predict(features, weights)

    # In floating point the sigmoid can saturate to exactly 0.0 or 1.0.
    # log(0) is -inf and 0 * -inf is nan, which would poison the whole sum,
    # so keep the probabilities one machine epsilon away from both ends.
    eps = np.finfo(float).eps
    predictions = np.clip(predictions, eps, 1 - eps)

    return -(1/m) * np.sum( labels*np.log(predictions) + (1-labels)*np.log(1-predictions) )

def decision_boundary(probability, threshold=0.5):
    '''
    Turn a single probability into a hard class label.

    Returns 1 when probability is at or above threshold, otherwise 0.
    '''
    if probability >= threshold:
        return 1
    return 0

def calculate_gradient(features, labels, weights):
    '''
    Un-averaged gradient of the log loss with respect to the weights.

    Computes features.T . (predictions - labels): for every weight, the sum
    over all samples of the prediction error times that sample's feature
    value. No division by the sample count happens here.
    '''
    residuals = predict(features, weights) - labels
    return np.dot(features.T, residuals)
  

def update_weights(features, labels, weights, lr):
    '''
    Take one gradient-descent step and return the resulting weights.

    The summed gradient from calculate_gradient() is averaged over the
    len(features) samples, multiplied by the learning rate lr, and
    subtracted from weights. The result is a new value; the weights
    argument itself is not modified.
    '''
    sample_count = len(features)
    summed_gradient = calculate_gradient(features, labels, weights)

    mean_gradient = summed_gradient / sample_count
    step = mean_gradient * lr

    return weights - step


def fit(features, labels, weights, lr, iterations):
    '''
    Train by batch gradient descent for a fixed number of iterations.

    Each iteration replaces weights with update_weights(...). On iterations
    0, 100, 200, ... the current cost over the full training set is printed.
    Returns the weights after the last iteration (the input weights are
    returned as-is when iterations is 0).
    '''
    for step in range(iterations):
        weights = update_weights(features, labels, weights, lr)

        # Progress is reported on every 100th step only.
        if step % 100 != 0:
            continue
        current_cost = cost_function_for_all_training_samples(features, labels, weights)
        print("iteration:", str(step), "cost:", str(current_cost))

    return weights
  
def _classify(predictions):
    '''
    Convert an array of probabilities into a flat array of 0/1 labels.

    decision_boundary() (with its default threshold) is applied to every
    element and the result is flattened to one dimension.
    '''
    # otypes fixes the output dtype up front. Without it np.vectorize must
    # call decision_boundary on the first element to infer the dtype, which
    # raises ValueError when predictions is empty.
    decide = np.vectorize(decision_boundary, otypes=[int])
    return decide(predictions).flatten()

def scale(X, norm_params):
    '''
    Min-max scale every column of X into the range given by norm_params.

    Parameters
    ----------
    X : numpy array or pandas DataFrame (needs astype/min/max); for 2-D
        input each column is scaled independently.
    norm_params : sequence whose first two items are the lower and upper
        bound of the target range, e.g. [0, 1].

    Returns
    -------
    Float data of the same container type as X, computed as
    x_min + (X - col_min) * (x_max - x_min) / (col_max - col_min).
    A constant column (col_max == col_min) maps to x_min.

    Note: col_min / col_max are taken from X itself on every call, so two
    data sets scaled by separate calls are each stretched to their own
    range rather than sharing one set of statistics.
    '''
    x_min, x_max = norm_params[0], norm_params[1]

    # Do the arithmetic in floating point: boolean columns (such as one-hot
    # dummies) do not support `-`, and a float result is produced anyway.
    X = X.astype(float)

    col_min = X.min(axis=0)
    col_range = X.max(axis=0) - col_min
    # A constant column has range 0; divide by 1 instead so it lands on x_min
    # rather than nan. np.where builds a new array, so this also works when
    # X is 1-D and col_range is a plain scalar.
    safe_range = np.where(col_range == 0, 1.0, col_range)

    return x_min + (X - col_min) * (x_max - x_min) / safe_range

In [12]:
def train_with_file(data_file, iters, test_file='adult-test.csv', lr=0.1):
    '''
    Train the logistic-regression model on a census-income CSV.

    Parameters
    ----------
    data_file : path of the training CSV (no header row; 15 columns in the
        order listed in ``columns`` below).
    iters     : number of gradient-descent iterations passed to fit().
    test_file : path of the test CSV; its first line is skipped. It is read
        only so that the one-hot encoding sees the categories of both files
        and yields the same columns classify() builds.
    lr        : learning rate passed to fit().

    Returns
    -------
    (weights, normalization_params) -- the learned weight vector (one entry
    per encoded feature column) and the [min, max] target range that was
    given to scale().

    NOTE(review): scale() takes min/max from the frame it is given, so the
    training rows are normalised with training-set statistics only.
    '''
    columns = ['age', 'workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','gender','capital_gain','capital_loss','hours_per_week','native_country','income_bracket']
    train = pd.read_csv(data_file, names=columns)
    test = pd.read_csv(test_file, names=columns, skiprows=1)

    # Target: 1 for the ' >50K' bracket (leading space as stored in the
    # file), 0 for everything else.
    train_y = (train['income_bracket'] == ' >50K').astype(int)

    # The test labels are not needed for training; the label column is
    # dropped from both frames before encoding.
    train['training_set'] = True
    test['training_set'] = False
    all_data = pd.concat([train, test])
    all_data = all_data.drop(['income_bracket'], axis=1)
    all_data = pd.get_dummies(all_data)

    train_x = all_data[all_data['training_set']]
    train_x = train_x.drop('training_set', axis=1)
    # Dummy columns may be integer or boolean depending on the pandas
    # version; make everything float before doing arithmetic on it.
    train_x = train_x.astype(float)

    normalization_params = [0, 1]
    train_x = scale(train_x, normalization_params)

    # An ndarray (not a Python list) so the return type is the same even
    # when iters is 0.
    initial_weights = np.zeros(train_x.shape[1])
    weights = fit(train_x, train_y, initial_weights, lr, iters)
    return weights, normalization_params



def classify(data_file, weights, normalization_params, train_file='adult-training.csv'):
    '''
    Predict a 0/1 income label for every row of a census-income CSV.

    Parameters
    ----------
    data_file : path of the CSV to classify; its first line is skipped and
        the remaining rows are read with the 15 column names listed below.
    weights   : weight vector as returned by train_with_file().
    normalization_params : [min, max] target range forwarded to scale().
    train_file : path of the training CSV. It is read only so that the
        one-hot encoding sees the categories of both files and produces the
        same feature columns, in the same order, as during training.

    Returns
    -------
    1-D integer array with one label per data row of ``data_file``.

    NOTE(review): scale() takes min/max from the frame it is given, so the
    rows classified here are normalised with their own statistics, not with
    the ones the training rows were normalised with -- confirm this is
    intended.
    '''
    columns = ['age', 'workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','gender','capital_gain','capital_loss','hours_per_week','native_country','income_bracket']
    test = pd.read_csv(data_file, names=columns, skiprows=1)
    train = pd.read_csv(train_file, names=columns)

    # Labels play no part in prediction; the label column is dropped from
    # both frames before encoding.
    train['training_set'] = True
    test['training_set'] = False
    all_data = pd.concat([train, test])
    all_data = all_data.drop(['income_bracket'], axis=1)
    all_data = pd.get_dummies(all_data)

    test_x = all_data[~all_data['training_set']]
    test_x = test_x.drop('training_set', axis=1)
    # Dummy columns may be integer or boolean depending on the pandas
    # version; make everything float before doing arithmetic on it.
    test_x = test_x.astype(float)
    test_x = scale(test_x, normalization_params)

    y_test_probabilities = predict(test_x, weights).flatten()
    labels = _classify(y_test_probabilities)

    return labels

In [14]:
# Fit the model on the training CSV with 1000 gradient-descent iterations;
# fit() prints the cost on every 100th iteration.
weights, normalization_params = train_with_file("adult-training.csv",1000)

# Label every row of the test CSV with the weights learned above.
labels = classify('adult-test.csv',weights, normalization_params)



iteration: 0 cost: 0.6673527327501676
iteration: 100 cost: 0.4405495171880037
iteration: 200 cost: 0.41404065290826714
iteration: 300 cost: 0.4015694066704977
iteration: 400 cost: 0.3937806593650562
iteration: 500 cost: 0.3882647907144724
iteration: 600 cost: 0.38405401830484465
iteration: 700 cost: 0.38067443839377296
iteration: 800 cost: 0.3778658790412143
iteration: 900 cost: 0.37547278440910753
