In [3]:
import time
import random
import numpy as np
np.seterr(all = 'ignore')

# transfer functions
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), applied element-wise by NumPy."""
    exp_neg = np.exp(-x)
    return 1 / (exp_neg + 1)

# derivative of sigmoid
def dsigmoid(y):
    """Derivative of the sigmoid written in terms of its output y = sigmoid(x)."""
    complement = 1.0 - y
    return y * complement

# using softmax as output layer is recommended for classification where outputs are mutually exclusive
def softmax(w):
    """
    Normalized exponential of w.

    The maximum of w is subtracted before exponentiating so np.exp cannot
    overflow; the normalization runs over the whole array (no axis argument).
    """
    shifted = w - np.amax(w)
    exps = np.exp(shifted)
    return exps / np.sum(exps)

# using tanh over logistic sigmoid for the hidden layer is recommended   
def tanh(x):
    """Hyperbolic tangent, delegated to np.tanh (element-wise on arrays)."""
    activated = np.tanh(x)
    return activated
    
# derivative for tanh sigmoid
def dtanh(y):
    """Derivative of tanh written in terms of its output y = tanh(x)."""
    squared = y * y
    return 1 - squared

class MLP_Classifier(object):
    """
    Basic MultiLayer Perceptron (MLP) neural network with L2 regularization,
    momentum and learning rate decay.

    Consists of three layers: input, hidden and output. The sizes of input and
    output must match the data; the size of hidden is user defined when
    initializing the network.

    Training data is a sequence of [inputs, targets] pairs:
        [[[x1, x2, x3, ..., xn], [y1, y2, ..., ym]],
         [[x1, x2, x3, ..., xn], [y1, y2, ..., ym]],
         ...
         [[x1, x2, x3, ..., xn], [y1, y2, ..., ym]]]

    An example is provided below with the digit recognition dataset provided by sklearn.
    """
    def __init__(self, input, hidden, output, iterations=50, learning_rate=0.01,
                 l2_in=0, l2_out=0, momentum=0, rate_decay=0,
                 output_layer='logistic', verbose=True):
        """
        :param input: number of input neurons (one bias node is added internally)
        :param hidden: number of hidden neurons
        :param output: number of output neurons
        :param iterations: how many epochs
        :param learning_rate: initial learning rate
        :param l2_in: L2 regularization term for the input -> hidden weights
        :param l2_out: L2 regularization term for the hidden -> output weights
        :param momentum: factor applied to the previous weight change
        :param rate_decay: how much to decrease learning rate by on each iteration (epoch)
        :param output_layer: activation (transfer) function of the output layer,
                             'logistic' or 'softmax'
        :param verbose: whether to spit out error rates while training
        """
        # initialize parameters
        self.iterations = iterations
        self.learning_rate = learning_rate
        self.l2_in = l2_in
        self.l2_out = l2_out
        self.momentum = momentum
        self.rate_decay = rate_decay
        self.verbose = verbose
        self.output_activation = output_layer

        # layer sizes
        self.input = input + 1  # add 1 for bias node
        self.hidden = hidden
        self.output = output

        # set up array of 1s for activations; the last input slot stays 1 (bias)
        self.ai = np.ones(self.input)
        self.ah = np.ones(self.hidden)
        self.ao = np.ones(self.output)

        # create randomized weights
        # use scheme from Efficient Backprop by LeCun 1998 to initialize weights for hidden layer
        # (0.5 rather than 1/2 so the exponent is not truncated to 0 by integer division)
        input_range = 1.0 / self.input ** 0.5
        self.wi = np.random.normal(loc=0, scale=input_range, size=(self.input, self.hidden))
        self.wo = np.random.uniform(size=(self.hidden, self.output)) / np.sqrt(self.hidden)

        # previous weight changes, reused by the momentum term on the next update
        self.ci = np.zeros((self.input, self.hidden))
        self.co = np.zeros((self.hidden, self.output))

    def feedForward(self, inputs):
        """
        Propagate one sample through the network.

        The hidden layer applies tanh to the weighted sum of the inputs (plus
        bias); the output layer applies the configured activation to the
        weighted sum of the hidden activations.

        :param inputs: input data, one value per input neuron (bias excluded)
        :return: updated activation output vector
        :raises ValueError: on a wrong input length or an unknown output activation
        """
        if len(inputs) != self.input - 1:
            raise ValueError('Wrong number of inputs you silly goose!')

        # input activations (the trailing bias slot is left at 1)
        self.ai[0:self.input - 1] = inputs

        # hidden activations
        hidden_sum = np.dot(self.wi.T, self.ai)
        self.ah = tanh(hidden_sum)

        # output activations
        output_sum = np.dot(self.wo.T, self.ah)
        if self.output_activation == 'logistic':
            self.ao = sigmoid(output_sum)
        elif self.output_activation == 'softmax':
            self.ao = softmax(output_sum)
        else:
            raise ValueError('Choose a compatible output layer activation or check your spelling ;-p')

        return self.ao

    def backPropagate(self, targets):
        """
        Update both weight matrices from the activations stored by the last
        feedForward call.

        For the output layer
        1. Calculates the difference between output value and target value
        2. Scales it by the activation derivative (logistic) or uses it as is (softmax)
        3. Updates the weights based on the learning rate, L2 term and momentum

        For the hidden layer
        1. Back-propagates the output deltas through the hidden -> output weights
        2. Scales by the tanh derivative
        3. Updates the weights based on the learning rate, L2 term and momentum

        :param targets: y values, one per output neuron
        :return: error of this sample (cross entropy for softmax, half squared
                 error for logistic), computed from the pre-update outputs
        :raises ValueError: on a wrong target length or an unknown output activation
        """
        if len(targets) != self.output:
            raise ValueError('Wrong number of targets you silly goose!')
        targets = np.asarray(targets)

        # calculate error terms for output
        # the delta tells you which direction to change the weights
        if self.output_activation == 'logistic':
            output_deltas = dsigmoid(self.ao) * (self.ao - targets)
        elif self.output_activation == 'softmax':
            output_deltas = self.ao - targets
        else:
            raise ValueError('Choose a compatible output layer activation or check your spelling ;-p')

        # calculate error terms for hidden (uses the weights before they are updated)
        hidden_deltas = dtanh(self.ah) * np.dot(self.wo, output_deltas)

        # update the weights connecting hidden to output, change == partial derivative
        change = np.outer(self.ah, output_deltas)
        regularization = self.l2_out * self.wo
        self.wo -= self.learning_rate * (change + regularization) + self.co * self.momentum
        self.co = change

        # update the weights connecting input to hidden, change == partial derivative
        change = np.outer(self.ai, hidden_deltas)
        regularization = self.l2_in * self.wi
        self.wi -= self.learning_rate * (change + regularization) + self.ci * self.momentum
        self.ci = change

        # calculate error
        if self.output_activation == 'softmax':
            return -np.sum(targets * np.log(self.ao))
        return np.sum(0.5 * (targets - self.ao) ** 2)

    def test(self, patterns):
        """
        Currently this will print out the targets next to the predictions.
        Not useful for actual ML, just for visual inspection.
        """
        for p in patterns:
            print(p[1], '->', self.feedForward(p[0]))

    def fit(self, patterns):
        """
        Train the network with per-sample (stochastic) gradient descent.

        Each epoch visits the samples in a fresh random order, appends the
        summed epoch error to 'error.txt' in the working directory and then
        decays the learning rate. The caller's sequence is not reordered.

        :param patterns: sequence of [inputs, targets] pairs
        """
        if self.verbose:
            if self.output_activation == 'softmax':
                print('Using softmax activation in output layer')
            elif self.output_activation == 'logistic':
                print('Using logistic sigmoid activation in output layer')

        # Work on a shallow copy so shuffling does not reorder the caller's data.
        samples = list(patterns)
        # len() rather than np.shape(): inputs and targets usually differ in
        # length, and NumPy refuses to build an array from such ragged nesting.
        num_example = len(samples)

        for i in range(self.iterations):
            error = 0.0
            random.shuffle(samples)
            for p in samples:
                self.feedForward(p[0])
                error += self.backPropagate(p[1])

            # the with-block closes the file; no explicit close() needed
            with open('error.txt', 'a') as errorfile:
                errorfile.write(str(error) + '\n')

            if i % 10 == 0 and self.verbose:
                mean_error = error / num_example if num_example else 0.0
                print('Training error %-.5f' % mean_error)

            # learning rate decay; algebraically the original
            # lr * (lr / (lr + lr * decay)) without dividing by lr itself
            self.learning_rate = self.learning_rate / (1.0 + self.rate_decay)

    def predict(self, X):
        """
        return list of predictions (output activation vectors) after training algorithm

        :param X: iterable of input vectors
        """
        return [self.feedForward(p) for p in X]

def demo():
    """
    run NN demo on the digit recognition dataset from sklearn

    Reads 'sklearn_digits.csv' from the working directory, trains an
    MLP_Classifier on it and prints the elapsed wall-clock time in seconds.
    """
    from sklearn.preprocessing import scale

    def load_data():
        """Return the CSV rows as a list of [inputs, targets] pairs."""
        data = np.loadtxt('sklearn_digits.csv', delimiter=',')

        # first ten values are the one hot encoded y (target) values
        y = data[:, 0:10]

        # remaining columns are the x data, standardized by sklearn's scale()
        x = scale(data[:, 10:])

        return [[features.tolist(), targets.tolist()] for features, targets in zip(x, y)]

    start = time.time()

    X = load_data()

    NN = MLP_Classifier(64, 4000, 10, iterations=50, learning_rate=0.01,
                        momentum=0.5, rate_decay=0.0001,
                        output_layer='logistic')

    NN.fit(X)

    end = time.time()
    print(end - start)

    # Uncomment to print targets next to predictions for visual inspection:
    # NN.test(X)

# Run the MLP demo only when this cell/module is executed directly, not on import.
if __name__ == '__main__':
    demo()


Using logistic sigmoid activation in output layer
Training error 0.74844
Training error 0.19977
Training error 0.18120
Training error 0.15055
Training error 0.13782
701.2341079711914


In [41]:
import math
import numpy as np


class Logit(object):
    """
    logistic regression using (batch) gradient descent
    """
    def __init__(self, learning_rate=0.01, iterations=100, verbose=True,
                 tolerance=0, l2=0, intercept=True):
        """
        :param learning_rate: learning rate constant
        :param iterations: how many epochs
        :param tolerance: the error value in which to stop training
        :param intercept: whether to fit an intercept
        :param verbose: whether to spit out error rates while training
        :param l2: L2 regularization term
        """
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.tolerance = tolerance
        self.intercept = intercept
        self.verbose = verbose
        self.l2 = l2
        self.theta = None  # set by fit()
        self.mean = []
        self.std = []

    def sigmoid(self, x):
        """
        Typical sigmoid function created from input vector x

        :param x: input vector
        :return: sigmoided vector
        """
        return 1 / (1 + np.exp(-x))

    def _with_intercept(self, X):
        """Prepend a column of ones to X when an intercept is being fitted."""
        if not self.intercept:
            return X
        ones = np.ones((np.shape(X)[0], 1))
        return np.concatenate((ones, X), 1)

    def fit(self, X, y):
        """
        Gradient descent: repeatedly updates theta in the direction of the
        negative gradient, with L2 shrinkage applied to theta at each step.

        Stops early once the (half mean squared error) cost drops below
        `tolerance`.

        :input X: must be a 2-D numpy array (samples x features)
        :input y: must be numpy vector of 0 and 1
        :return: the fitted theta (also stored on self.theta)
        """
        X = self._with_intercept(X)
        num_examples, num_features = np.shape(X)

        # initialize theta to 1
        self.theta = np.ones(num_features)

        # Progress is reported roughly ten times per run; integer step of at
        # least 1 so short runs neither divide by zero nor rely on float modulo.
        report_every = max(1, self.iterations // 10)

        for i in range(self.iterations):
            # make predictions
            predicted = self.sigmoid(np.dot(X, self.theta))
            error = predicted - y
            # update theta with gradient descent
            shrink = 1 - (self.learning_rate * self.l2)
            self.theta = (self.theta * shrink) - self.learning_rate * np.dot(error, X)
            # sum of squares cost of the predictions made before this update
            cost = np.sum(error ** 2) / (2 * num_examples)

            if self.verbose and i % report_every == 0:
                print('iteration:', i)
                print('theta:', self.theta)
                print('cost:', cost)

            if cost < self.tolerance:
                break

        return self.theta

    def predict(self, X, labels):
        """
        Predict for new data using the fitted theta.

        :param X: new data to make predictions on (2-D, samples x features)
        :param labels: boolean; True returns 0/1 class labels (sigmoid > 0.5),
                       False returns the raw linear scores X . theta
        :return: list with one prediction per sample
        """
        X = self._with_intercept(X)
        # one matrix-vector product instead of a Python loop over every cell
        scores = np.dot(X, self.theta)

        if labels:
            return [int(p > 0.5) for p in self.sigmoid(scores)]
        return list(scores)
        

def demo():
    """
    Fit Logit on 'denver.csv' (first column = target, rest = features) over ten
    random 90/10 train/test splits and print one classification report for the
    pooled test predictions.
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split
    from sklearn.metrics import classification_report

    # initialize logistic regression parameters
    max_iterations = 50000
    learning_rate = 0.0001
    l2 = 1.0

    lgit = Logit(learning_rate=learning_rate, iterations=max_iterations,
                 verbose=True, tolerance=0.001, l2=l2)

    data = np.loadtxt('denver.csv', delimiter=',')
    X = data[:, 1:]
    y = data[:, 0]

    # scale data by the largest feature value
    X = X / np.amax(X)

    predicted = []
    actual = []
    for _ in range(10):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

        # fit the reg
        lgit.fit(X=X_train, y=y_train)

        # classification_report needs class labels, not raw linear scores, and
        # flat 1-D vectors, so request labels and pool the splits with extend()
        predicted.extend(lgit.predict(X_test, labels=True))
        actual.extend(y_test.tolist())

    print(classification_report(np.array(actual), np.array(predicted)))
    
# Run the logistic regression demo only when executed directly, not on import.
if __name__ == '__main__':
    demo()

iteration: 0
theta: [ 1.02440858  1.00011749  1.00085052  1.00157691  1.00082042  1.00282016
  0.99937091]
cost: 26.3918671985
iteration: 5000
theta: [ 93.74339728   1.42872705   4.22308357   6.98661633   4.10426735
  11.64666126  -1.41091707]
cost: 25.0141025641
iteration: 10000
theta: [ 149.9552568     1.68844766    6.17653315   10.61508635    5.98566948
   18.09916212   -2.87233299]
cost: 25.0141025641
iteration: 15000
theta: [ 184.04862063    1.84597223    7.36133059   12.81580964    7.12676902
   22.01270387   -3.75870439]
cost: 25.0141025641
iteration: 20000
theta: [ 204.7267741     1.94151333    8.0799286    14.15058242    7.81886357
   24.38632758   -4.29630238]
cost: 25.0141025641
iteration: 25000
theta: [ 217.26839459    1.99946048    8.51576942   14.96014279    8.23862964
   25.82596715   -4.62236389]
cost: 25.0141025641
iteration: 30000
theta: [ 224.87508177    2.03460633    8.78011364   15.4511537     8.49322427
   26.69913085   -4.82012525]
cost: 25.0141025641
iteration: 

TypeError: not all arguments converted during string formatting

In [4]:
import json

def get_reviews(file_path):
    '''
    returns a list of json objects and each json object is a review.

    The file is read as UTF-8 with one JSON document per line; blank lines are
    skipped. The keys of the first review are printed when there is one.
    '''
    with open(file_path, encoding='utf-8') as fin:
        reviews = [json.loads(line) for line in fin if line.strip()]

    # an empty file has no first record to inspect
    if reviews:
        print("available keys in reviews: ", reviews[0].keys())
    return reviews

def get_businesses(file_path):
    '''
    returns a list of json objects and each json object is a business record.
    This file is very big. May take a while to complete this step.

    The file is read as UTF-8 with one JSON document per line; blank lines are
    skipped. The keys of the first record are printed when there is one.
    '''
    with open(file_path, encoding='utf-8') as fin:
        businesses = [json.loads(line) for line in fin if line.strip()]

    # an empty file has no first record to inspect
    if businesses:
        print("available keys in businesses: ", businesses[0].keys())
    return businesses

def get_starbucks(businesses):
    '''
    returns the list of business_id values of every business record whose
    name is exactly 'Starbucks' or 'starbucks'.
    '''
    wanted_names = ('Starbucks', 'starbucks')
    return [record['business_id'] for record in businesses
            if record['name'] in wanted_names]

def get_starbucks_review_dates(reviews, starbucks):
    '''
    returns a list of dates that are associated with starbucks reviews.

    :param reviews: iterable of review records with 'business_id' and 'date' keys
    :param starbucks: iterable of starbucks business ids
    '''
    # Build the id set once so each review costs one hash lookup instead of a
    # scan over the whole id list.
    starbucks_ids = set(starbucks)
    return [review['date'] for review in reviews
            if review['business_id'] in starbucks_ids]
            
def main():
    """
    Load the Yelp business and review files, then print how many records each
    holds, how many Starbucks businesses there are and the date range of their
    reviews.
    """
    # UPDATE FILE PATHS TO YOUR LOCAL FILES. Forward slashes also work as the
    # path separator when opening files from Python on Windows.
    bus_file_path = "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json"
    rev_file_path = "yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json"

    businesses = get_businesses(bus_file_path)
    print("number of businesses available: ", len(businesses))

    reviews = get_reviews(rev_file_path)
    print("number of reviews available: ", len(reviews))

    sbux = get_starbucks(businesses)
    print("number of starbucks: ", len(sbux))

    # two positional arguments; passing a single (reviews, sbux) tuple raises TypeError
    dates = get_starbucks_review_dates(reviews, sbux)

    if dates:
        # min/max give the same endpoints as sorting, without sorting twice
        print("ranges of dates of starbucks reviews in the data set:", min(dates), max(dates))
    else:
        print("no starbucks reviews found in the data set")


# Run the Starbucks review summary only when executed directly, not on import.
if __name__ == '__main__':
    main()

FileNotFoundError: [Errno 2] No such file or directory: 'yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json'

In [1]:
"""Convert the Yelp Dataset Challenge dataset from json format to csv.
For more information on the Yelp Dataset Challenge please visit http://yelp.com/dataset_challenge
"""
import argparse
import collections
import csv
import simplejson as json


def read_and_write_file(json_file_path, csv_file_path, column_names):
    """Read in the json dataset file and write it out to a csv file, given the column names.

    The first csv row is the header; every following row holds one input line's
    values in the same column order.

    :param json_file_path: input file with one JSON document per line (read as UTF-8)
    :param csv_file_path: output csv path (overwritten, written as UTF-8)
    :param column_names: iterable of flattened column names
    """
    # Freeze the column order once so the header and every row agree even when
    # column_names is a set or a one-shot iterator.
    columns = list(column_names)
    # On Python 3 csv.writer needs a text-mode file opened with newline=''
    # (binary 'wb+' raises TypeError on the first writerow).
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as fout:
        csv_file = csv.writer(fout)
        csv_file.writerow(columns)
        with open(json_file_path, encoding='utf-8') as fin:
            for line in fin:
                line_contents = json.loads(line)
                csv_file.writerow(get_row(line_contents, columns))

def get_superset_of_column_names_from_file(json_file_path):
    """Read in the json dataset file and return the superset of column names.

    :param json_file_path: input file with one JSON document per line (read as UTF-8)
    :return: set of every flattened column name seen on any line
    """
    column_names = set()
    # JSON text is UTF-8; do not depend on the platform default encoding
    with open(json_file_path, encoding='utf-8') as fin:
        for line in fin:
            # iterating the returned dict yields its keys, i.e. the column names
            column_names.update(get_column_names(json.loads(line)))
    return column_names

def get_column_names(line_contents, parent_key=''):
    """Return a dict mapping flattened key names to their leaf values.

    Nested mappings are flattened by joining the keys with '.'.

    Example:
        line_contents = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        will return: {'a.b': 2, 'a.c': 3}
    The keys will be the column names for the eventual csv file.

    :param line_contents: (possibly nested) mapping to flatten
    :param parent_key: prefix for every produced key; used by the recursion
    """
    # The ABCs live in collections.abc; the collections.MutableMapping alias
    # was removed in Python 3.10.
    from collections.abc import MutableMapping

    flattened = {}
    # .items() instead of the Python 2 only .iteritems()
    for k, v in line_contents.items():
        column_name = "{0}.{1}".format(parent_key, k) if parent_key else k
        if isinstance(v, MutableMapping):
            flattened.update(get_column_names(v, column_name))
        else:
            flattened[column_name] = v
    return flattened

def get_nested_value(d, key):
    """Return a dictionary item given a dictionary `d` and a flattened key from `get_column_names`.

    Returns None when any part of the path is missing, or when an intermediate
    value cannot be indexed by key (for example a string or number where
    another record had a nested dict).

    Example:
        d = {
            'a': {
                'b': 2,
                'c': 3,
                },
        }
        key = 'a.b'
        will return: 2

    """
    current = d
    for part in key.split('.'):
        try:
            current = current[part]
        except (KeyError, TypeError, IndexError):
            # KeyError: missing key; TypeError/IndexError: the value at this
            # level is not a mapping, so the column does not exist here.
            return None
    return current

def get_row(line_contents, column_names):
    """Return a csv compatible row given column names and a dict.

    Each cell is the string form of the value found for that column; a missing
    or None value becomes an empty string.

    :param line_contents: (possibly nested) dict for one input line
    :param column_names: iterable of flattened column names, in output order
    :return: list of strings, one per column
    """
    row = []
    for column_name in column_names:
        line_value = get_nested_value(line_contents, column_name)
        # Python 3 strings are already text, so no `unicode` check or manual
        # utf-8 encoding is needed (the name `unicode` does not exist there).
        row.append('' if line_value is None else '{0}'.format(line_value))
    return row

if __name__ == '__main__':
    # Convert a yelp dataset file from json to csv.
    parser = argparse.ArgumentParser(
        description='Convert Yelp Dataset Challenge data from JSON format to CSV.',
    )
    parser.add_argument('json_file', type=str, help='The json file to convert.')
    args = parser.parse_args()

    json_file = args.json_file
    # output path: everything before the first '.json' in the input path, plus '.csv'
    csv_file = json_file.partition('.json')[0] + '.csv'

    column_names = get_superset_of_column_names_from_file(json_file)
    read_and_write_file(json_file, csv_file, column_names)

usage: __main__.py [-h] json_file
__main__.py: error: unrecognized arguments: -f --profile-dir C:\Users\wynsa2\.ipython\profile_default


SystemExit: 2

To exit: use 'exit', 'quit', or Ctrl-D.


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import statsmodels.api as sm
from scipy import stats
np.set_printoptions(precision=4, suppress=True)
import warnings
warnings.simplefilter('ignore', DeprecationWarning)
import re

In [2]:
data = pd.read_csv("Real_Property_Taxes.csv")

In [284]:
data.head(5)

Unnamed: 0,ward,sect,block,lot,propertyAddress,lotSize,cityTax,stateTax,resCode,amountDue,asOfDate
0,15,370,1,1,2045 W NORTH AVE,1528310,224951,$112.08,NOT A PRINCIPAL RESIDENCE,,3/25/2015
1,15,370,1,2,2043 W NORTH AVE,148310,111276,$55.44,NOT A PRINCIPAL RESIDENCE,,3/25/2015
2,15,370,1,3,2041 W NORTH AVE,148310,67440,$33.60,NOT A PRINCIPAL RESIDENCE,$801.99,3/25/2015
3,15,370,1,4,2039 W NORTH AVE,148310,67440,$33.60,NOT A PRINCIPAL RESIDENCE,,3/25/2015
4,15,370,1,5,2037 W NORTH AVE,148310,47208,$23.52,NOT A PRINCIPAL RESIDENCE,,3/25/2015


In [86]:
data.count()

ward               238308
sect               238308
block              238308
lot                238308
propertyAddress    238308
lotSize            238308
cityTax            217760
stateTax           217961
resCode            238308
amountDue           29292
asOfDate           238308
dtype: int64

In [3]:
# Displays a copy with missing values shown as 0. The result is not assigned,
# so `data` itself still holds NaN (see the data.values output further down).
data.fillna(0)

Unnamed: 0,ward,sect,block,lot,propertyAddress,lotSize,cityTax,stateTax,resCode,amountDue,asOfDate
0,15,370,1,1,2045 W NORTH AVE,15-2X83-10,"$2,249.51",$112.08,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
1,15,370,1,2,2043 W NORTH AVE,14X83-10,"$1,112.76",$55.44,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
2,15,370,1,3,2041 W NORTH AVE,14X83-10,$674.40,$33.60,NOT A PRINCIPAL RESIDENCE,$801.99,3/25/2015
3,15,370,1,4,2039 W NORTH AVE,14X83-10,$674.40,$33.60,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
4,15,370,1,5,2037 W NORTH AVE,14X83-10,$472.08,$23.52,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
5,15,370,1,6,2035 W NORTH AVE,14X83-10,$472.08,$23.52,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
6,15,370,1,7,2033 W NORTH AVE,14X83-10,$472.08,$23.52,NOT A PRINCIPAL RESIDENCE,$553.68,3/25/2015
7,15,370,1,8,2031 W NORTH AVE,14X83-10,$67.44,$3.36,NOT A PRINCIPAL RESIDENCE,$90.09,3/25/2015
8,15,370,1,9,2029 W NORTH AVE,14X83-10,$472.08,$23.52,NOT A PRINCIPAL RESIDENCE,0,3/25/2015
9,15,370,1,10,2027 W NORTH AVE,14X83-10,$472.08,$23.52,NOT A PRINCIPAL RESIDENCE,$564.69,3/25/2015


In [258]:
list(data.columns)

['ward',
 'sect',
 'block',
 'lot',
 'propertyAddress',
 'lotSize',
 'cityTax',
 'stateTax',
 'resCode',
 'amountDue',
 'asOfDate']

In [4]:
data.shape

(238308, 11)

In [5]:
data.values #convert into arrays

array([[15, 370, '1', ..., 'NOT A PRINCIPAL RESIDENCE', nan, '3/25/2015'],
       [15, 370, '1', ..., 'NOT A PRINCIPAL RESIDENCE', nan, '3/25/2015'],
       [15, 370, '1', ..., 'NOT A PRINCIPAL RESIDENCE', '$801.99 ',
        '3/25/2015'],
       ..., 
       [4, 110, '1384', ..., 'NOT A PRINCIPAL RESIDENCE', '$40,672.24 ',
        '8/8/2015'],
       [4, 110, '1384', ..., 'NOT A PRINCIPAL RESIDENCE', '$29,790.28 ',
        '8/8/2015'],
       [4, 110, '1384', ..., 'NOT A PRINCIPAL RESIDENCE', '$57,786.96 ',
        '8/8/2015']], dtype=object)

In [9]:
lotsize_column = data['lotSize']

In [7]:
# Remove every non-digit character from lotSize (e.g. '15-2X83-10' -> '1528310').
# replace(..., inplace=True) returns None, so binding its result left
# lotsize_column as None; assign the cleaned column back and then reference it.
data['lotSize'] = data['lotSize'].replace(regex=True, to_replace=r'\D', value=r'')
lotsize_column = data['lotSize']

In [10]:
type(lotsize_column)
lotsizes = lotsize_column.values
list(lotsizes)

['1528310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '148310',
 '0231',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '1485',
 '13985',
 '13985',
 '13985',
 '13985',
 '13985',
 '13985',
 '13985',
 '13985',
 '13985',
 '14385',
 '1480',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '131080',
 '14280',
 '0278',
 '14685',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '13585',
 '1485',
 '1585',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '2885',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1485',
 '1585',
 '14285',
 '13885',
 '13885',
 '13885',
 '13885',
 '13885',


In [11]:
type(data)

pandas.core.frame.DataFrame

In [13]:
data.groupby('lotSize').count() #The original data frame can be aggregated by counting rows for each possible value of the lotsize column:

Unnamed: 0_level_0,ward,sect,block,lot,propertyAddress,cityTax,stateTax,resCode,amountDue,asOfDate
lotSize,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,111,111,111,111,111,69,72,111,4,111
0001,10,10,10,10,10,1,1,10,0,10
00016,1,1,1,1,1,0,0,1,0,1
0002,14,14,14,14,14,4,4,14,1,14
0003,10,10,10,10,10,6,6,10,1,10
0004,7,7,7,7,7,3,3,7,0,7
0005,10,10,10,10,10,5,5,10,2,10
000513,1,1,1,1,1,1,1,1,0,1
0006,12,12,12,12,12,3,3,12,2,12
0007,21,21,21,21,21,16,16,21,7,21


In [14]:
data.groupby('cityTax').count()

Unnamed: 0_level_0,ward,sect,block,lot,propertyAddress,lotSize,stateTax,resCode,amountDue,asOfDate
cityTax,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
$0.00,16,16,16,16,16,16,16,16,16,16
"$1,000.36",51,51,51,51,51,51,51,51,4,51
"$1,001.87",1,1,1,1,1,1,1,1,0,1
"$1,002.61",11,11,11,11,11,11,11,11,3,11
"$1,003.35",1,1,1,1,1,1,1,1,1,1
"$1,004.86",16,16,16,16,16,16,16,16,4,16
"$1,005.60",5,5,5,5,5,5,5,5,1,5
"$1,006.20",1,1,1,1,1,1,1,1,1,1
"$1,006.36",4,4,4,4,4,4,4,4,1,4
"$1,007.10",17,17,17,17,17,17,17,17,4,17


In [15]:
# Remove every non-digit character from cityTax. The decimal point is removed
# too, so '$2,249.51' becomes '224951' (an amount in cents, still a string).
# replace(..., inplace=True) returns None, so assign the cleaned column back
# instead of binding the return value of the in-place call.
data['cityTax'] = data['cityTax'].replace(regex=True, to_replace=r'\D', value=r'')
city_propertytax_column = data['cityTax']
city_propertytaxes = city_propertytax_column.values

In [16]:
city_propertytaxes = city_propertytax_column.values

In [17]:
list(city_propertytaxes)

['224951',
 '111276',
 '67440',
 '67440',
 '47208',
 '47208',
 '47208',
 '6744',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 nan,
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 nan,
 nan,
 '47208',
 '47208',
 '47208',
 '6744',
 '47208',
 '6744',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '67440',
 '47208',
 '47208',
 '6744',
 '47208',
 '6744',
 '23229',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '136678',
 '47208',
 '47208',
 '74933',
 '74933',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '47208',
 '6744',
 '6744',
 nan,
 '6744',
 '6744',
 '6744',
 '47208',
 '6744',
 '74933',
 '6744',
 '74933',
 '74933',
 '74933',
 '47208',
 '74933',
 '6744',
 '6744',
 '47208',
 '6744',
 '47208',
 '74933',
 '6744',
 '6744',
 '6744',
 '47208',
 '47208',
 '47208',
 '20232',
 '47208',
 '6744',
 '74933',
 '20232',
 '74933',
 '47208',
 '47208',
 '47208',
 '47208',
 '

In [18]:
target = lotsizes

In [19]:
type(target)

numpy.ndarray

In [20]:
target.dtype

dtype('O')

In [21]:
target

array(['1528310', '148310', '148310', ..., '9745', '7138', '13846'], dtype=object)

In [22]:
for (i,y) in np.ndenumerate(target):
    print(i,y)

(0,) 1528310
(1,) 148310
(2,) 148310
(3,) 148310
(4,) 148310
(5,) 148310
(6,) 148310
(7,) 148310
(8,) 148310
(9,) 148310
(10,) 148310
(11,) 148310
(12,) 148310
(13,) 148310
(14,) 148310
(15,) 0231
(16,) 131080
(17,) 131080
(18,) 131080
(19,) 131080
(20,) 131080
(21,) 131080
(22,) 131080
(23,) 131080
(24,) 131080
(25,) 131080
(26,) 131080
(27,) 131080
(28,) 131080
(29,) 1485
(30,) 13985
(31,) 13985
(32,) 13985
(33,) 13985
(34,) 13985
(35,) 13985
(36,) 13985
(37,) 13985
(38,) 13985
(39,) 14385
(40,) 1480
(41,) 131080
(42,) 131080
(43,) 131080
(44,) 131080
(45,) 131080
(46,) 131080
(47,) 131080
(48,) 131080
(49,) 131080
(50,) 131080
(51,) 131080
(52,) 14280
(53,) 0278
(54,) 14685
(55,) 13585
(56,) 13585
(57,) 13585
(58,) 13585
(59,) 13585
(60,) 13585
(61,) 13585
(62,) 13585
(63,) 13585
(64,) 13585
(65,) 13585
(66,) 13585
(67,) 1485
(68,) 1585
(69,) 1485
(70,) 1485
(71,) 1485
(72,) 1485
(73,) 1485
(74,) 2885
(75,) 1485
(76,) 1485
(77,) 1485
(78,) 1485
(79,) 1485
(80,) 1485
(81,) 1485
(82,)

In [23]:
target[:5]

array(['1528310', '148310', '148310', '148310', '148310'], dtype=object)

In [24]:
numerical_features = data[['cityTax']]

In [25]:
numerical_features.count()

cityTax    217760
dtype: int64

In [26]:
median_features = numerical_features.dropna().median()
median_features

cityTax    235590
dtype: float64

In [27]:
imputed_features = numerical_features.fillna(median_features)
imputed_features.count()

cityTax    238308
dtype: int64

In [28]:
features_array = imputed_features.values

In [29]:
# astype() returns a new array that is only displayed here; features_array
# itself keeps dtype=object (string values), as the later display shows.
features_array.astype(int)

array([[ 224951],
       [ 111276],
       [  67440],
       ..., 
       [3874203],
       [2837650],
       [5504453]])

In [30]:
type(features_array)

numpy.ndarray

In [31]:
# features_array = np.asarray(features_array, dtype=np.float64, order=None) #convert to float64

In [32]:
features_array

array([['224951'],
       ['111276'],
       ['67440'],
       ..., 
       ['3874203'],
       ['2837650'],
       ['5504453']], dtype=object)

In [33]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. Fall back only for very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# hold out 20% of the rows for testing; random_state fixes the shuffle
features_train, features_test, target_train, target_test = train_test_split(
    features_array, target, test_size=0.20, random_state=0)

In [34]:
features_train.shape

(190646, 1)

In [35]:
features_test.shape

(47662, 1)

In [36]:
target_train.shape

(190646,)

In [37]:
target_test.shape

(47662,)

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)

In [None]:
target_predicted = logreg.predict(features_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(target_test, target_predicted)

In [None]:
logreg.score(features_test, target_test)