In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.formula.api import ols

from matplotlib import pyplot as plt

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

import seaborn as sns

import pickle

In [2]:
# Load the feature file; the first CSV column is used as the row index.
house_final = pd.read_csv(
    'kc_house_data_test_features.csv',
    index_col=0,
)

# Loading the helper data and functions that add the distance to water and the last renovation (`last_change`) for each house

In [3]:
# Unpickle the `water_lat` object that is later passed to water_distances().
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('water_lat', mode='rb') as water_lat_file:
    water_lat = pickle.load(water_lat_file)

In [4]:
# Unpickle the `water_long` object that is later passed to water_distances().
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('water_long', mode='rb') as water_long_file:
    water_long = pickle.load(water_long_file)

In [5]:
from functions import water_distances, last_change

In [6]:
# Add the `last_change` column computed by the project helper of the same name.
house_final = house_final.assign(last_change=last_change(house_final))

In [7]:
# Add the `water_distance` column computed by the project helper from the
# frame and the two unpickled water coordinate objects.
house_final = house_final.assign(
    water_distance=water_distances(house_final, water_lat, water_long)
)

# Replacing the bedroom count of the one house listed with 33 bedrooms with 3, the average for a house with 1.75 baths

In [8]:
# Overwrite the outlying bedroom count of 33 with 3; every other value is kept.
house_final['bedrooms'] = house_final['bedrooms'].replace(33, 3)

# Replacing basement sqft with a 0/1 indicator of whether the house has a basement

In [9]:
# Binary flag: 1 when sqft_basement is greater than zero, otherwise 0.
house_final['basement'] = (house_final['sqft_basement'] > 0).astype(int)

# Creating Dummies for zipcode, view and grade and removing the columns deemed unnecessary by the feature selection

In [10]:
# One-hot encode zipcode and grade; drop_first omits the first level of each.
# (The view dummies are currently disabled.)
# NOTE(review): the columns produced depend on the levels present in this file;
# if they differ from the levels seen when the model was fit, the resulting
# columns will not line up — confirm against the training notebook.
zip_dummy = pd.get_dummies(data=house_final['zipcode'], prefix='zip', drop_first=True)
#view_dummy = pd.get_dummies(house_final['view'], prefix = 'view', drop_first = True)
grade_dummy = pd.get_dummies(data=house_final['grade'], prefix='grade', drop_first=True)

In [11]:
# Discard the grade dummies that are not used as model features.
grade_dummy = grade_dummy.drop(columns=['grade_4', 'grade_5', 'grade_8'])

In [12]:
#view_dummy.drop(columns = ['view_1'], axis=1, inplace = True)

In [13]:
# Keep only the zip_98004 dummy; all other zipcode dummies are discarded.
zip_dummy = zip_dummy.loc[:, ['zip_98004']]

# Concatenating the dataframe with the remaining dummy columns

In [14]:
# Join the original features and the retained dummy columns side by side.
house_final_w_dummies = pd.concat(
    [house_final, zip_dummy, grade_dummy],
    axis='columns',
)

# Dropping all the remaining columns that are not needed

In [15]:
# Columns that are not model inputs. Note that 'bedrooms' and 'last_change'
# are removed here even though earlier cells cleaned/computed them.
unused_columns = [
    'view', 'condition', 'grade', 'zipcode', 'sqft_lot',
    'id', 'date', 'sqft_above', 'sqft_basement',
    'sqft_living15', 'sqft_lot15', 'yr_built', 'yr_renovated',
    'lat', 'long', 'waterfront', 'bathrooms',
    'bedrooms', 'floors', 'last_change',
]
house_final_w_dummies = house_final_w_dummies.drop(columns=unused_columns)

# Capping extreme values (more than 5 standard deviations from the column mean) in the dataframe

In [16]:
def ext_values(df, extreme_cols, n_std=5):
    """Return a copy of ``df`` with extreme values capped column by column.

    For every column in ``extreme_cols`` the column mean and standard
    deviation are computed. Any value lying more than ``n_std`` standard
    deviations from the mean is pulled back to the nearest bound: values above
    ``mean + n_std * std`` become that upper bound, and values below
    ``mean - n_std * std`` become that lower bound. Values inside the band are
    left untouched, as are NaNs (they fail both comparisons).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    extreme_cols : iterable
        Labels of the columns to cap.
    n_std : float, default 5
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the selected columns capped.
    """
    new_df = df.copy()
    for col in extreme_cols:
        std = new_df[col].std()
        mean = new_df[col].mean()
        lower = mean - n_std * std
        upper = mean + n_std * std
        # Cap each tail at its own bound. Previously both tails were replaced
        # with the upper bound, which turned very small values into very large ones.
        new_df[col] = new_df[col].apply(
            lambda x, lo=lower, hi=upper: hi if x > hi else (lo if x < lo else x)
        )
    return new_df

In [17]:
# Cap extremes in every column of the prepared frame.
# NOTE(review): the 0/1 dummy columns go through the same capping. The float
# 0.0 values displayed further down for zip_98004 and grade_11..13 suggest rare
# 1s may be getting replaced — confirm this matches how the training data was prepared.
house_final_w_dummies = ext_values(
    df=house_final_w_dummies,
    extreme_cols=house_final_w_dummies.columns,
)

# Importing scaler and regression from pickle

In [68]:
# Unpickle the scaler object saved under 'final_scaler'.
# NOTE(review): no active cell below references final_scaler (the scaling cell
# is commented out) — confirm whether it is still needed.
with open('final_scaler', mode='rb') as scaler_file:
    final_scaler = pickle.load(scaler_file)

In [18]:
# Unpickle the regression model used for the predictions below.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('final_lasso', mode='rb') as lasso_file:
    final_lasso = pickle.load(lasso_file)

# Scaling the new data using the previously saved scaler (currently disabled — the cell below is commented out)

In [None]:
# NOTE(review): scaling is disabled — every line in this cell is commented out,
# so the model below receives the unscaled frame. Confirm that this matches how
# final_lasso was trained.
# NOTE(review): `scaler.fit(final_scaler)` would fit a brand-new scaler on the
# unpickled object; if final_scaler is itself an already-fitted scaler, calling
# final_scaler.transform(...) is presumably what was intended — TODO confirm.
# scaler = StandardScaler()
# scaler.fit(final_scaler)
# house_final_w_dummies_scaled = pd.DataFrame(data=scaler.transform(house_final_w_dummies))

In [72]:
# Display the prepared feature frame that is passed to the model below.
house_final_w_dummies

Unnamed: 0,sqft_living,water_distance,basement,zip_98004,grade_6,grade_7,grade_9,grade_10,grade_11,grade_12,grade_13
0,2270.0,0.008357,1,0.0,0,0,0,0,0.0,0.0,0.0
1,2270.0,0.008357,1,0.0,0,0,0,0,0.0,0.0,0.0
2,1470.0,0.064514,1,0.0,0,0,0,0,0.0,0.0,0.0
3,1280.0,0.098561,1,0.0,0,0,0,0,0.0,0.0,0.0
4,2830.0,0.075481,0,0.0,0,0,0,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4318,1530.0,0.070393,0,0.0,0,0,0,0,0.0,0.0,0.0
4319,2310.0,0.019473,0,0.0,0,0,0,0,0.0,0.0,0.0
4320,1020.0,0.012149,0,0.0,0,1,0,0,0.0,0.0,0.0
4321,1600.0,0.035074,0,0.0,0,0,0,0,0.0,0.0,0.0


# Making Predictions

In [19]:
from sklearn.linear_model import Lasso

In [20]:
# Predict with the unpickled model, then exponentiate the raw output.
# NOTE(review): the np.exp suggests the model was fit on log(price) — confirm
# against the training notebook.
log_price_predictions = final_lasso.predict(house_final_w_dummies)
price_predictions = np.exp(log_price_predictions)

In [21]:
# Mean of the predicted prices.
price_predictions.mean()

533599.6450545924

In [22]:
# Wrap the prediction array in a one-column frame (the column is labelled 0 by default).
price_dataframe = pd.DataFrame(data=price_predictions)

In [23]:
# Give the single prediction column a readable name.
price_dataframe = price_dataframe.rename(columns={0: 'Price'})

In [24]:
# Write the predictions without a header row; the row index is still written
# as the first column (to_csv default). `header=False` is the documented way to
# suppress the header — `header=None` only worked because None is falsy.
price_dataframe.to_csv('housing_preds_AHSR.csv', header=False)

In [25]:
# Work on an independent copy so that columns added to house_stuff do not also
# appear on house_final_w_dummies (the model's input frame). A plain assignment
# only aliases the same object, which would break a re-run of the prediction cell.
house_stuff = house_final_w_dummies.copy()

In [26]:
# Attach the predicted prices as a new 'price' column for inspection.
house_stuff['price'] = price_predictions

In [27]:
# Preview the first rows of the features together with their predicted price.
house_stuff.head()

Unnamed: 0,sqft_living,water_distance,basement,zip_98004,grade_6,grade_7,grade_9,grade_10,grade_11,grade_12,grade_13,price
0,2270.0,0.008357,1,0.0,0,0,0,0,0.0,0.0,0.0,537284.128099
1,2270.0,0.008357,1,0.0,0,0,0,0,0.0,0.0,0.0,537284.128099
2,1470.0,0.064514,1,0.0,0,0,0,0,0.0,0.0,0.0,398495.243841
3,1280.0,0.098561,1,0.0,0,0,0,0,0.0,0.0,0.0,371193.348982
4,2830.0,0.075481,0,0.0,0,0,0,0,0.0,0.0,0.0,634793.874564
