# Housing Prices: Advanced Regression


In [1]:
import random
import os
import numpy as np
import pandas as pd
import tensorflow as tf

In [12]:
def encode_features(df_train, df_test):
    '''
    Takes columns whose values are strings (objects)
    and categorizes them into discrete numbers.
    This makes it feasible to use regression
    '''
    features = list(df_train.select_dtypes(include=['object']).columns)
    df_combined = pd.concat([df_train[features], df_test[features]])

    for feature in features:
        unique_categories = list(df_combined[feature].unique())
        map_dict = {}
        for idx, category in enumerate(unique_categories):
            map_dict[category] = idx + 1
        df_train[feature] = df_train[feature].map(map_dict)
        df_test[feature] = df_test[feature].map(map_dict)
    
    return df_train, df_test


def cleanup(df):
    '''
    Cleans data
        1. Drops unwanted features
        2. Fills missing values with the mode
    '''
    to_drop = ['MiscFeature', 'MiscVal', 'GarageArea', 'GarageYrBlt', 'Street', 'Alley',
              'LotShape', 'LandContour', 'LandSlope', 'RoofMatl', 'Exterior2nd', 'MasVnrType',
              'MasVnrArea', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
              'BsmtFinSF1', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating']
    #to_drop = list(df.select_dtypes(include=['object']).columns)
    df = df.drop(to_drop, axis=1)
    for column in df.columns:
        x = df[column].dropna().value_counts().index[0]
        df = df.fillna(x)
    return df

In [13]:
data_dir = 'data'

train_dataset = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test_dataset = pd.read_csv(os.path.join(data_dir, 'test.csv'))

train_dataset = cleanup(train_dataset)
test_dataset = cleanup(test_dataset)
train_dataset, test_dataset = encode_features(train_dataset, test_dataset)

In [14]:
# Shuffle data
train_dataset = train_dataset.sample(frac=1)
# Split into training, validation, and testing datasets
train, valid, test = np.split(train_dataset,
                        [int(.6 * len(train_dataset)), int(.8 * len(train_dataset))])

# Convert into numpy arrays
x_train = train.drop(['SalePrice', 'Id'], axis=1).as_matrix().astype(np.float32)
y_train = train['SalePrice'].as_matrix().astype(np.float32).reshape((np.shape(x_train)[0], 1))
x_test = test.drop(['SalePrice', 'Id'], axis=1).as_matrix().astype(np.float32)
y_test = test['SalePrice'].as_matrix().astype(np.float32).reshape((np.shape(x_test)[0], 1))
x_valid = valid.drop(['SalePrice', 'Id'], axis=1).as_matrix().astype(np.float32)
y_valid = valid['SalePrice'].as_matrix().astype(np.float32).reshape((np.shape(x_valid)[0], 1))

## Training with only Linear Regression
Using stochasitc gradient descent with batch size of 16

In [33]:
batch_size = 16
train_size = np.shape(x_train)[0]
valid_size = np.shape(x_valid)[0]
test_size = np.shape(x_test)[0]
num_features = np.shape(x_train)[1]

graph = tf.Graph()
with graph.as_default():
    
    # Input
    tf_train_dataset = tf.constant(x_train)
    tf_train_labels = tf.constant(y_train)
    tf_valid_dataset = tf.constant(x_valid)
    tf_test_dataset = tf.constant(x_test)
    
    # Variables
    weights = tf.Variable(tf.truncated_normal([num_features, 1]))
    biases = tf.Variable(tf.zeros([1]))
    
    # Loss Computation
    train_prediction = tf.matmul(tf_train_dataset, weights) + biases
    #loss = tf.losses.mean_squared_error(tf_train_labels, train_prediction)
    loss = tf.reduce_mean((train_prediction - tf_train_labels) ** 2)
    
    # Optimizer
    # Gradient descent optimizer with learning rate = alpha
    alpha = tf.constant(0.000000003, dtype=tf.float64)
    optimizer = tf.train.GradientDescentOptimizer(alpha).minimize(loss)
    
    # Predictions
    valid_prediction = tf.matmul(tf_valid_dataset, weights) + biases
    test_prediction = tf.matmul(tf_test_dataset, weights) + biases

In [34]:
num_steps = 100001

def accuracy(prediction, labels):
    return ((prediction - labels) ** 2).mean(axis=None)

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        # Run the computations. We tell .run() that we want to run the optimizer,
        # and get the loss value and the training predictions returned as numpy
        # arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 1000 == 0):
            print('Loss at step %d: %f' % (step, l))
            # Calling .eval() on valid_prediction is basically like calling run(), but
            # just to get that one numpy array. Note that it recomputes all its graph
            # dependencies.
            print('Validation accuracy: %.1f%%' % accuracy(valid_prediction.eval(), y_valid))
    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), y_test))

Initialized
Loss at step 0: 43246637056.000000
Validation accuracy: 32909051904.0%
Loss at step 1000: 2836700928.000000
Validation accuracy: 3298028032.0%
Loss at step 2000: 2712974080.000000
Validation accuracy: 3025679360.0%
Loss at step 3000: 2675836672.000000
Validation accuracy: 2947610368.0%
Loss at step 4000: 2649684992.000000
Validation accuracy: 2915950592.0%


KeyboardInterrupt: 

## Training with Neural Networks and Regression